# -*- coding: utf-8 -*-

"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""

import codecs
from collections import OrderedDict
import csv
from datetime import datetime
import os
import platform
from tempfile import TemporaryFile

import numpy as np
import pytest

from pandas._libs.tslib import Timestamp
from pandas.compat import BytesIO, StringIO, lrange, range, u
from pandas.errors import DtypeWarning, EmptyDataError, ParserError

from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
import pandas.util.testing as tm

from pandas.io.common import URLError
from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser


def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # usecols needs to be sorted in _set_noconvert_columns;
    # based on the test_usecols_with_parse_dates test from test_usecols.py.
    class MyTextFileReader(TextFileReader):
        def __init__(self):
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered,
                # but in practice a CPython set of small integers iterates
                # in sorted order. In other implementations this assumption
                # does not hold. The following code simulates a different
                # order, which before gh-17351 would cause the wrong
                # columns to be converted via the parse_dates parameter.
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {"usecols": [0, 2, 3],
                      "parse_dates": parse_dates,
                      "delimiter": ","}
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)
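

# A quick illustration of the ordering pitfall the test above simulates
# (hypothetical REPL session; this is a CPython implementation detail,
# not a language guarantee):
#
#   >>> {3, 0, 2}
#   {0, 2, 3}      # small ints happen to iterate in sorted order
#
# Because set iteration order is not guaranteed, _set_noconvert_columns
# must sort usecols itself rather than rely on it.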


def test_bytes_io_input(all_parsers):
    if compat.PY2:
        pytest.skip("Bytes-related test does not need to work on Python 2.x")

    encoding = "cp1255"
    parser = all_parsers

    data = BytesIO("שלום:1234\n562:123".encode(encoding))
    result = parser.read_csv(data, sep=":", encoding=encoding)

    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
    tm.assert_frame_equal(result, expected)


def test_empty_decimal_marker(all_parsers):
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    # Parsers support only length-1 decimals.
    msg = "Only length-1 decimal markers supported"
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), decimal="")


def test_bad_stream_exception(all_parsers, csv_dir_path):
    # see gh-13652
    #
    # This test validates that both the Python engine and C engine will
    # raise UnicodeDecodeError instead of the C engine raising ParserError
    # and swallowing the exception that caused read to fail.
    path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
    codec = codecs.lookup("utf-8")
    utf8 = codecs.lookup("utf-8")
    parser = all_parsers
    msg = ("'utf-8' codec can't decode byte" if compat.PY3
           else "'utf8' codec can't decode byte")

    # Stream must be binary UTF8.
    with open(path, "rb") as handle, codecs.StreamRecoder(
            handle, utf8.encode, utf8.decode, codec.streamreader,
            codec.streamwriter) as stream:

        with pytest.raises(UnicodeDecodeError, match=msg):
            parser.read_csv(stream)
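

# Note on the fixture above: codecs.StreamRecoder (stdlib) re-encodes
# reads on the fly, so data pulled from `stream` is forced through the
# UTF-8 codec even though the underlying file is SHIFT_JIS encoded; that
# mismatch is what should surface as UnicodeDecodeError in both engines.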


@pytest.mark.skipif(compat.PY2, reason="PY3-only test")
def test_read_csv_local(all_parsers, csv1):
    prefix = u("file:///") if compat.is_platform_windows() else u("file://")
    parser = all_parsers

    fname = prefix + compat.text_type(os.path.abspath(csv1))
    result = parser.read_csv(fname, index_col=0, parse_dates=True)

    expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738],
                          [1.047916, -0.041232, -0.16181208307, 0.212549],
                          [0.498581, 0.731168, -0.537677223318, 1.346270],
                          [1.120202, 1.567621, 0.00364077397681, 0.675253],
                          [-0.487094, 0.571455, -1.6116394093, 0.103469],
                          [0.836649, 0.246462, 0.588542635376, 1.062782],
                          [-0.157161, 1.340307, 1.1957779562, -1.097007]],
                         columns=["A", "B", "C", "D"],
                         index=Index([datetime(2000, 1, 3),
                                      datetime(2000, 1, 4),
                                      datetime(2000, 1, 5),
                                      datetime(2000, 1, 6),
                                      datetime(2000, 1, 7),
                                      datetime(2000, 1, 10),
                                      datetime(2000, 1, 11)], name="index"))
    tm.assert_frame_equal(result, expected)


def test_1000_sep(all_parsers):
    parser = all_parsers
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    expected = DataFrame({
        "A": [1, 10],
        "B": [2334, 13],
        "C": [5, 10.]
    })

    result = parser.read_csv(StringIO(data), sep="|", thousands=",")
    tm.assert_frame_equal(result, expected)


def test_squeeze(all_parsers):
    data = """\
a,1
b,2
c,3
"""
    parser = all_parsers
    index = Index(["a", "b", "c"], name=0)
    expected = Series([1, 2, 3], name=1, index=index)

    result = parser.read_csv(StringIO(data), index_col=0,
                             header=None, squeeze=True)
    tm.assert_series_equal(result, expected)

    # see gh-8217
    #
    # Series should not be a view.
    assert not result._is_view
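

# For context, squeeze=True collapses a single-column parse result from
# DataFrame to Series. A rough sketch of the behavior exercised above
# (illustrative REPL session, assuming `import pandas as pd`):
#
#   >>> pd.read_csv(StringIO("a,1\nb,2"), header=None,
#   ...             index_col=0, squeeze=True)
#   0
#   a    1
#   b    2
#   Name: 1, dtype: int64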


def test_malformed(all_parsers):
    # see gh-6607
    parser = all_parsers
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    msg = "Expected 3 fields in line 4, saw 5"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#")


@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    parser = all_parsers
    msg = "Expected 3 fields in line 6, saw 5"
    reader = parser.read_csv(StringIO(data), header=1, comment="#",
                             iterator=True, chunksize=1, skiprows=[2])

    with pytest.raises(ParserError, match=msg):
        reader.read(nrows)


def test_unnamed_columns(all_parsers):
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    parser = all_parsers
    expected = DataFrame([[1, 2, 3, 4, 5],
                          [6, 7, 8, 9, 10],
                          [11, 12, 13, 14, 15]],
                         dtype=np.int64,
                         columns=["A", "B", "C",
                                  "Unnamed: 3", "Unnamed: 4"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_csv_mixed_type(all_parsers):
    data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
    parser = all_parsers
    expected = DataFrame({"A": ["a", "b", "c"],
                          "B": [1, 3, 4],
                          "C": [2, 4, 5]})
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_read_csv_low_memory_no_rows_with_index(all_parsers):
    # see gh-21141
    parser = all_parsers

    if not parser.low_memory:
        pytest.skip("This is a low-memory specific test")

    data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""
    result = parser.read_csv(StringIO(data), low_memory=True,
                             index_col=0, nrows=0)
    expected = DataFrame(columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


def test_read_csv_dataframe(all_parsers, csv1):
    parser = all_parsers
    result = parser.read_csv(csv1, index_col=0, parse_dates=True)

    expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738],
                          [1.047916, -0.041232, -0.16181208307, 0.212549],
                          [0.498581, 0.731168, -0.537677223318, 1.346270],
                          [1.120202, 1.567621, 0.00364077397681, 0.675253],
                          [-0.487094, 0.571455, -1.6116394093, 0.103469],
                          [0.836649, 0.246462, 0.588542635376, 1.062782],
                          [-0.157161, 1.340307, 1.1957779562, -1.097007]],
                         columns=["A", "B", "C", "D"],
                         index=Index([datetime(2000, 1, 3),
                                      datetime(2000, 1, 4),
                                      datetime(2000, 1, 5),
                                      datetime(2000, 1, 6),
                                      datetime(2000, 1, 7),
                                      datetime(2000, 1, 10),
                                      datetime(2000, 1, 11)], name="index"))
    tm.assert_frame_equal(result, expected)


def test_read_csv_no_index_name(all_parsers, csv_dir_path):
    parser = all_parsers
    csv2 = os.path.join(csv_dir_path, "test2.csv")
    result = parser.read_csv(csv2, index_col=0, parse_dates=True)

    expected = DataFrame([[0.980269, 3.685731, -0.364216805298,
                           -1.159738, "foo"],
                          [1.047916, -0.041232, -0.16181208307,
                           0.212549, "bar"],
                          [0.498581, 0.731168, -0.537677223318,
                           1.346270, "baz"],
                          [1.120202, 1.567621, 0.00364077397681,
                           0.675253, "qux"],
                          [-0.487094, 0.571455, -1.6116394093,
                           0.103469, "foo2"]],
                         columns=["A", "B", "C", "D", "E"],
                         index=Index([datetime(2000, 1, 3),
                                      datetime(2000, 1, 4),
                                      datetime(2000, 1, 5),
                                      datetime(2000, 1, 6),
                                      datetime(2000, 1, 7)]))
    tm.assert_frame_equal(result, expected)


def test_read_csv_unicode(all_parsers):
    parser = all_parsers
    data = BytesIO(u("\u0141aski, Jan;1").encode("utf-8"))

    result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
    expected = DataFrame([[u("\u0141aski, Jan"), 1]])
    tm.assert_frame_equal(result, expected)


def test_read_csv_wrong_num_columns(all_parsers):
    # The header declares fewer columns than row 3 contains.
    data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
    parser = all_parsers
    msg = "Expected 6 fields in line 3, saw 7"

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data))


def test_read_duplicate_index_explicit(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10],
                          [12, 13, 14, 15], [12, 13, 14, 15],
                          [12, 13, 14, 15], [12, 13, 14, 15]],
                         columns=["A", "B", "C", "D"],
                         index=Index(["foo", "bar", "baz",
                                      "qux", "foo", "bar"], name="index"))
    tm.assert_frame_equal(result, expected)


def test_read_duplicate_index_implicit(all_parsers):
    data = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10],
                          [12, 13, 14, 15], [12, 13, 14, 15],
                          [12, 13, 14, 15], [12, 13, 14, 15]],
                         columns=["A", "B", "C", "D"],
                         index=Index(["foo", "bar", "baz",
                                      "qux", "foo", "bar"]))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,kwargs,expected", [
    ("A,B\nTrue,1\nFalse,2\nTrue,3", dict(),
     DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])),
    ("A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
     dict(true_values=["yes", "Yes", "YES"],
          false_values=["no", "NO", "No"]),
     DataFrame([[True, 1], [False, 2], [True, 3],
                [False, 3], [True, 3]], columns=["A", "B"])),
    ("A,B\nTRUE,1\nFALSE,2\nTRUE,3", dict(),
     DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])),
    ("A,B\nfoo,bar\nbar,foo", dict(true_values=["foo"],
                                   false_values=["bar"]),
     DataFrame([[True, False], [False, True]], columns=["A", "B"]))
])
def test_parse_bool(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_int_conversion(all_parsers):
    data = """A,B
1.0,1
2.0,2
3.0,3
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
    # see gh-10476
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    expected = DataFrame([["foo", 2, 3, 4, 5],
                          ["bar", 7, 8, 9, 10],
                          ["baz", 12, 13, 14, 15]],
                         columns=["index", "A", "B", "C", "D"])
    parser = all_parsers

    result = parser.read_csv(StringIO(data), nrows=nrows)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    msg = r"'nrows' must be an integer >=0"
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), nrows=nrows)


@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    # Use the parametrized index_col (position 0 and the name "index"
    # refer to the same column).
    reader = parser.read_csv(StringIO(data), index_col=index_col,
                             chunksize=2)
    expected = DataFrame([["foo", 2, 3, 4, 5],
                          ["bar", 7, 8, 9, 10],
                          ["baz", 12, 13, 14, 15],
                          ["qux", 12, 13, 14, 15],
                          ["foo2", 12, 13, 14, 15],
                          ["bar2", 12, 13, 14, 15]],
                         columns=["index", "A", "B", "C", "D"])
    expected = expected.set_index("index")

    chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
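

# The chunking contract checked above, in brief: read_csv(..., chunksize=n)
# returns a TextFileReader that yields successive DataFrames of up to n
# rows. A minimal sketch (illustrative, assuming `import pandas as pd`):
#
#   >>> reader = pd.read_csv(StringIO("a\n1\n2\n3"), chunksize=2)
#   >>> [len(chunk) for chunk in reader]
#   [2, 1]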


@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), chunksize=chunksize)


@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    # see gh-15755
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = dict(index_col=0, nrows=5)

    reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs)
    expected = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(concat(reader), expected)


def test_read_chunksize_and_nrows_changing_size(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = dict(index_col=0, nrows=5)

    reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs)
    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
    tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])

    with pytest.raises(StopIteration, match=""):
        reader.get_chunk(size=3)
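

# get_chunk(size=...) overrides the reader's configured chunksize for a
# single read, which is why sizes 2 and 4 can be mixed above; once the
# nrows budget (5) is exhausted, the next call raises StopIteration.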


def test_get_chunk_passed_chunksize(all_parsers):
    parser = all_parsers
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    reader = parser.read_csv(StringIO(data), chunksize=2)
    result = reader.get_chunk()

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)])
def test_read_chunksize_compat(all_parsers, kwargs):
    # see gh-12185
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs)

    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(concat(reader), result)


def test_read_chunksize_jagged_names(all_parsers):
    # see gh-23509
    parser = all_parsers
    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
    reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4)

    result = concat(reader)
    tm.assert_frame_equal(result, expected)


def test_read_data_list(all_parsers):
    parser = all_parsers
    kwargs = dict(index_col=0)
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"

    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"],
                 ["bar", "4", "5", "6"]]
    expected = parser.read_csv(StringIO(data), **kwargs)

    parser = TextParser(data_list, chunksize=2, **kwargs)
    result = parser.read()

    tm.assert_frame_equal(result, expected)
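

# TextParser also accepts rows that are already split into lists of
# strings, with the first row treated as the header by default, so it
# should produce the same frame as read_csv on the raw text. Rough sketch
# (illustrative only):
#
#   >>> TextParser([["a", "b"], ["1", "2"]], chunksize=1).read()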


def test_iterator(all_parsers):
    # see gh-6607
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = dict(index_col=0)

    expected = parser.read_csv(StringIO(data), **kwargs)
    reader = parser.read_csv(StringIO(data), iterator=True, **kwargs)

    first_chunk = reader.read(3)
    tm.assert_frame_equal(first_chunk, expected[:3])

    last_chunk = reader.read(5)
    tm.assert_frame_equal(last_chunk, expected[3:])
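

# With iterator=True, read(nrows) may be called repeatedly to pull the
# next batch of rows; the two reads above (3 rows, then up to 5) consume
# the six data rows without overlap, mirroring expected[:3] and
# expected[3:].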


def test_iterator2(all_parsers):
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    reader = parser.read_csv(StringIO(data), iterator=True)
    result = list(reader)

    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["foo", "bar", "baz"],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(result[0], expected)


def test_reader_list(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = dict(index_col=0)

    lines = list(csv.reader(StringIO(data)))
    reader = TextParser(lines, chunksize=2, **kwargs)

    expected = parser.read_csv(StringIO(data), **kwargs)
    chunks = list(reader)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


def test_reader_list_skiprows(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = dict(index_col=0)

    lines = list(csv.reader(StringIO(data)))
    reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs)

    expected = parser.read_csv(StringIO(data), **kwargs)
    chunks = list(reader)

    tm.assert_frame_equal(chunks[0], expected[1:3])


def test_iterator_stop_on_chunksize(all_parsers):
    # gh-3967: stopping iteration when chunksize is specified
    parser = all_parsers
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    reader = parser.read_csv(StringIO(data), chunksize=1)
    result = list(reader)

    assert len(result) == 3
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["foo", "bar", "baz"],
                         columns=["A", "B", "C"])
    tm.assert_frame_equal(concat(result), expected)


@pytest.mark.parametrize("kwargs", [
    dict(iterator=True, chunksize=1),
    dict(iterator=True),
    dict(chunksize=1)
])
def test_iterator_skipfooter_errors(all_parsers, kwargs):
    msg = "'skipfooter' not supported for 'iteration'"
    parser = all_parsers
    data = "a\n1\n2"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=1, **kwargs)


def test_nrows_skipfooter_errors(all_parsers):
    msg = "'skipfooter' not supported with 'nrows'"
    data = "a\n1\n2\n3\n4\n5\n6"
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), skipfooter=1, nrows=5)


@pytest.mark.parametrize("data,kwargs,expected", [
    ("""foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""", dict(index_col=0, names=["index", "A", "B", "C", "D"]),
     DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15],
                [12, 13, 14, 15], [12, 13, 14, 15], [12, 13, 14, 15]],
               index=Index(["foo", "bar", "baz", "qux",
                            "foo2", "bar2"], name="index"),
               columns=["A", "B", "C", "D"])),
    ("""foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""", dict(index_col=[0, 1],
          names=["index1", "index2", "A", "B", "C", "D"]),
     DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15],
                [12, 13, 14, 15], [12, 13, 14, 15]],
               index=MultiIndex.from_tuples([
                   ("foo", "one"), ("foo", "two"), ("foo", "three"),
                   ("bar", "one"), ("bar", "two")],
                   names=["index1", "index2"]),
               columns=["A", "B", "C", "D"])),
])
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    headless_data = "\n".join(data.split("\n")[1:])

    names = ["A", "B", "C", "D"]
    parser = all_parsers

    result = parser.read_csv(StringIO(headless_data),
                             index_col=index_col,
                             header=None, names=names)
    expected = parser.read_csv(StringIO(data), index_col=index_col)

    # No index names in the headless data.
    expected.index.names = [None] * 2
    tm.assert_frame_equal(result, expected)


def test_multi_index_no_level_names_implicit(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    result = parser.read_csv(StringIO(data))
    expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15],
                          [12, 13, 14, 15], [12, 13, 14, 15]],
                         columns=["A", "B", "C", "D"],
                         index=MultiIndex.from_tuples([
                             ("foo", "one"), ("foo", "two"),
                             ("foo", "three"), ("bar", "one"),
                             ("bar", "two")]))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,expected,header", [
    ("a,b", DataFrame(columns=["a", "b"]), [0]),
    ("a,b\nc,d", DataFrame(columns=MultiIndex.from_tuples(
        [("a", "c"), ("b", "d")])), [0, 1]),
])
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header,
                              round_trip):
    # see gh-14545
    parser = all_parsers
    data = expected.to_csv(index=False) if round_trip else data

    result = parser.read_csv(StringIO(data), header=header)
    tm.assert_frame_equal(result, expected)


def test_no_unnamed_index(all_parsers):
    parser = all_parsers
    data = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    result = parser.read_csv(StringIO(data), sep=" ")
    expected = DataFrame([[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"],
                          [2, 2, 2, "e", "f"]],
                         columns=["Unnamed: 0", "id", "c0", "c1", "c2"])
    tm.assert_frame_equal(result, expected)


def test_read_csv_parse_simple_list(all_parsers):
    parser = all_parsers
    data = """foo
bar baz
qux foo
foo
bar"""

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
    tm.assert_frame_equal(result, expected)


@tm.network
def test_url(all_parsers, csv_dir_path):
    # TODO: FTP testing
    parser = all_parsers
    kwargs = dict(sep="\t")

    url = ("https://raw.github.com/pandas-dev/pandas/master/"
           "pandas/tests/io/parser/data/salaries.csv")
    url_result = parser.read_csv(url, **kwargs)

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    local_result = parser.read_csv(local_path, **kwargs)
    tm.assert_frame_equal(url_result, local_result)


@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
    parser = all_parsers
    kwargs = dict(sep="\t")

    local_path = os.path.join(csv_dir_path, "salaries.csv")
    local_result = parser.read_csv(local_path, **kwargs)
    url = "file://localhost/" + local_path

    try:
        url_result = parser.read_csv(url, **kwargs)
        tm.assert_frame_equal(url_result, local_result)
    except URLError:
        # Fails on some systems.
        pytest.skip("Failing on: " + " ".join(platform.uname()))


def test_path_path_lib(all_parsers):
    parser = all_parsers
    df = tm.makeDataFrame()
    result = tm.round_trip_pathlib(
        df.to_csv, lambda p: parser.read_csv(p, index_col=0))
    tm.assert_frame_equal(df, result)


def test_path_local_path(all_parsers):
    parser = all_parsers
    df = tm.makeDataFrame()
    result = tm.round_trip_localpath(
        df.to_csv, lambda p: parser.read_csv(p, index_col=0))
    tm.assert_frame_equal(df, result)


def test_nonexistent_path(all_parsers):
    # gh-2428: pls no segfault
    # gh-14086: raise more helpful FileNotFoundError
    parser = all_parsers
    path = "%s.csv" % tm.rands(10)

    msg = ("does not exist" if parser.engine == "c"
           else r"\[Errno 2\]")
    with pytest.raises(compat.FileNotFoundError, match=msg) as e:
        parser.read_csv(path)

    filename = e.value.filename
    filename = filename.decode() if isinstance(
        filename, bytes) else filename

    assert path == filename


def test_missing_trailing_delimiters(all_parsers):
    parser = all_parsers
    data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""

    result = parser.read_csv(StringIO(data))
    expected = DataFrame([[1, 2, 3, 4], [1, 3, 3, np.nan],
                          [1, 4, 5, np.nan]], columns=["A", "B", "C", "D"])
    tm.assert_frame_equal(result, expected)


def test_skip_initial_space(all_parsers):
    data = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
            '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, '
            '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, '
            '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, '
            '0.212036, 14.7674, 41.605, -9999.0, -9999.0, '
            '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128')
    parser = all_parsers

    result = parser.read_csv(StringIO(data), names=lrange(33), header=None,
                             na_values=["-9999.0"], skipinitialspace=True)
    expected = DataFrame([["09-Apr-2012", "01:10:18.300", 2456026.548822908,
                           12849, 1.00361, 1.12551, 330.65659,
                           355626618.16711, 73.48821, 314.11625, 1917.09447,
                           179.71425, 80.0, 240.0, -350, 70.06056, 344.9837,
                           1, 1, -0.689265, -0.692787, 0.212036, 14.7674,
                           41.605, np.nan, np.nan, np.nan, np.nan, np.nan,
                           np.nan, 0, 12, 128]])
    tm.assert_frame_equal(result, expected)
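

# skipinitialspace=True strips the whitespace that follows each delimiter,
# so ' 1.00361' above parses as the float 1.00361. A minimal sketch
# (illustrative, assuming `import pandas as pd`):
#
#   >>> pd.read_csv(StringIO("a, b\n1, 2"), skipinitialspace=True).columns
#   Index(['a', 'b'], dtype='object')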


@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
    # see gh-2298
    parser = all_parsers
    data = u("""skip this
skip this too
A,B,C
1,2,3
4,5,6""").replace(",", sep)
    path = "__%s__.csv" % tm.rands(10)
    kwargs = dict(sep=sep, skiprows=2)
    utf8 = "utf-8"

    with tm.ensure_clean(path) as path:
        bytes_data = data.encode(encoding)

        with open(path, "wb") as f:
            f.write(bytes_data)

        bytes_buffer = BytesIO(data.encode(utf8))

        if compat.PY3:
            from io import TextIOWrapper
            bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)

        result = parser.read_csv(path, encoding=encoding, **kwargs)
        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)

        bytes_buffer.close()
        tm.assert_frame_equal(result, expected)
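

# Background for the round trip above: encoding with the bare "utf-16"
# spelling prepends a BOM, while "utf-16le"/"utf-16be" do not; decoding
# with the matching codec must consume any BOM rather than leak it into
# the parsed data. The UTF-8 buffer provides the BOM-free reference parse.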


@pytest.mark.parametrize("buffer", [
    False,
    pytest.param(True, marks=pytest.mark.skipif(
        compat.PY3, reason="Not supported on PY3"))])
def test_utf16_example(all_parsers, csv_dir_path, buffer):
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers

    src = BytesIO(open(path, "rb").read()) if buffer else path
    result = parser.read_csv(src, encoding="utf-16", sep="\t")
    assert len(result) == 50


def test_unicode_encoding(all_parsers, csv_dir_path):
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers

    result = parser.read_csv(path, header=None, encoding="latin-1")
    result = result.set_index(0)
    got = result[1][1632]

    expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)')
    assert got == expected


def test_trailing_delimiters(all_parsers):
    # see gh-2442
    data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=False)

    expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


def test_escapechar(all_parsers):
    # http://stackoverflow.com/questions/13824840/feature-request-for-
    # pandas-read-csv
    data = '''SEARCH_TERM,ACTUAL_URL
"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''  # noqa

    parser = all_parsers
    result = parser.read_csv(StringIO(data), escapechar='\\',
                             quotechar='"', encoding='utf-8')

    assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", '
                                        'IKEA:s 1700-tals serie')
    tm.assert_index_equal(result.columns,
                          Index(['SEARCH_TERM', 'ACTUAL_URL']))


def test_int64_min_issues(all_parsers):
    # see gh-2599
    parser = all_parsers
    data = "A,B\n0,0\n0,"
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
    tm.assert_frame_equal(result, expected)


def test_parse_integers_above_fp_precision(all_parsers):
    data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"Numbers": [17007000002000191,
                                      17007000002000191,
                                      17007000002000191,
                                      17007000002000191,
                                      17007000002000192,
                                      17007000002000192,
                                      17007000002000192,
                                      17007000002000192,
                                      17007000002000192,
                                      17007000002000194]})
    tm.assert_frame_equal(result, expected)


def test_chunks_have_consistent_numerical_type(all_parsers):
    parser = all_parsers
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with tm.assert_produces_warning(None):
        result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == np.float


def test_warn_if_chunks_have_mismatched_type(all_parsers):
    warning_type = None
    parser = all_parsers
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)

    # see gh-3866: if chunks are different types and can't
    # be coerced using numerical types, then issue warning.
    if parser.engine == "c" and parser.low_memory:
        warning_type = DtypeWarning

    with tm.assert_produces_warning(warning_type):
        df = parser.read_csv(StringIO(data))
    assert df.a.dtype == np.object


@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    # see gh-2601
    data = "65248E10 11\n55555E55 22\n"
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=None, sep=sep)
    expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
    tm.assert_frame_equal(result, expected)


def test_catch_too_many_names(all_parsers):
    # see gh-5156
    data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
    parser = all_parsers
    msg = ("Too many columns specified: "
           "expected 4 and found 3" if parser.engine == "c"
           else "Number of passed names did not match "
                "number of header fields in the file")

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])


def test_ignore_leading_whitespace(all_parsers):
    # see gh-3374, gh-6607
    parser = all_parsers
    data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
    result = parser.read_csv(StringIO(data), sep=r"\s+")

    expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
    tm.assert_frame_equal(result, expected)


def test_chunk_begins_with_newline_whitespace(all_parsers):
    # see gh-10022
    parser = all_parsers
    data = "\n hello\nworld\n"

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(result, expected)


def test_empty_with_index(all_parsers):
    # see gh-10184
    data = "x,y"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=0)

    expected = DataFrame([], columns=["y"], index=Index([], name="x"))
    tm.assert_frame_equal(result, expected)


def test_empty_with_multi_index(all_parsers):
    # see gh-10467
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=["x", "y"])

    expected = DataFrame([], columns=["z"],
                         index=MultiIndex.from_arrays(
                             [[]] * 2, names=["x", "y"]))
    tm.assert_frame_equal(result, expected)


def test_empty_with_reversed_multi_index(all_parsers):
    data = "x,y,z"
    parser = all_parsers
    result = parser.read_csv(StringIO(data), index_col=[1, 0])

    expected = DataFrame([], columns=["z"],
                         index=MultiIndex.from_arrays(
                             [[]] * 2, names=["y", "x"]))
    tm.assert_frame_equal(result, expected)


def test_float_parser(all_parsers):
    # see gh-9565
    parser = all_parsers
    data = "45e-1,4.5,45.,inf,-inf"
    result = parser.read_csv(StringIO(data), header=None)

    expected = DataFrame([[float(s) for s in data.split(",")]])
    tm.assert_frame_equal(result, expected)


def test_scientific_no_exponent(all_parsers):
    # see gh-12215
    df = DataFrame.from_dict(OrderedDict([("w", ["2e"]), ("x", ["3E"]),
                                          ("y", ["42e"]),
                                          ("z", ["632E"])]))
    data = df.to_csv(index=False)
    parser = all_parsers

    for precision in parser.float_precision_choices:
        df_roundtrip = parser.read_csv(StringIO(data),
                                       float_precision=precision)
        tm.assert_frame_equal(df_roundtrip, df)


@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
    parser = all_parsers

    if conv is None:
        # 13007854817840016671868 > UINT64_MAX, so this
        # will overflow and return object as the dtype.
        result = parser.read_csv(StringIO(data))
        expected = DataFrame(["00013007854817840016671868",
                              "00013007854817840016749251",
                              "00013007854817840016754630",
                              "00013007854817840016781876",
                              "00013007854817840017028824",
                              "00013007854817840017963235",
                              "00013007854817840018860166"], columns=["ID"])
        tm.assert_frame_equal(result, expected)
    else:
        # 13007854817840016671868 > UINT64_MAX, so attempts
        # to cast to either int64 or uint64 will result in
        # an OverflowError being raised.
        msg = ("(Python int too large to convert to C long)|"
               "(long too big to convert)|"
               "(int too big to convert)")

        with pytest.raises(OverflowError, match=msg):
            parser.read_csv(StringIO(data), converters={"ID": conv})
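

# The boundary arithmetic behind the comments above (illustrative):
#
#   >>> 2 ** 64 - 1              # UINT64_MAX
#   18446744073709551615
#   >>> 13007854817840016671868 > 2 ** 64 - 1
#   True
#
# so these IDs cannot fit in any numpy integer dtype and must stay as
# strings unless a converter forces (and fails) an integer cast.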


@pytest.mark.parametrize("val", [
    np.iinfo(np.uint64).max,
    np.iinfo(np.int64).max,
    np.iinfo(np.int64).min
])
def test_int64_uint64_range(all_parsers, val):
    # These numbers fall right inside the int64-uint64
    # range, so they should be parsed as numbers.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([val])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("val", [
    np.iinfo(np.uint64).max + 1,
    np.iinfo(np.int64).min - 1
])
def test_outside_int64_uint64_range(all_parsers, val):
    # These numbers fall just outside the int64-uint64
    # range, so they should be parsed as string.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([str(val)])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)],
                                      [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    # No numerical dtype can hold both negative and uint64
    # values, so they should be cast as string.
    parser = all_parsers
    data = "\n".join(exp_data)

    expected = DataFrame(exp_data)
    result = parser.read_csv(StringIO(data), header=None)
    tm.assert_frame_equal(result, expected)
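

# Rationale for the test above: int64 cannot hold 2**63 and uint64 cannot
# hold -1, so no single integer dtype covers both values; rather than
# coerce to float64 and silently lose precision, the parser keeps the
# original strings (object dtype).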


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    # see gh-9535
    parser = all_parsers
    expected = DataFrame([], columns=["foo", "bar"])

    nrows = 10
    data = StringIO("foo,bar\n")

    if iterator:
        result = next(iter(parser.read_csv(data, chunksize=nrows)))
    else:
        result = parser.read_csv(data, nrows=nrows)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,kwargs,expected,msg", [
    # gh-10728: WHITESPACE_LINE
    ("a,b,c\n4,5,6\n ", dict(),
     DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),

    # gh-10548: EAT_LINE_COMMENT
    ("a,b,c\n4,5,6\n#comment", dict(comment="#"),
     DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),

    # EAT_CRNL_NOP
    ("a,b,c\n4,5,6\n\r", dict(),
     DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),

    # EAT_COMMENT
    ("a,b,c\n4,5,6#comment", dict(comment="#"),
     DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),

    # SKIP_LINE
    ("a,b,c\n4,5,6\nskipme", dict(skiprows=[2]),
     DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),

    # EAT_LINE_COMMENT
    ("a,b,c\n4,5,6\n#comment", dict(comment="#", skip_blank_lines=False),
     DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None),

    # IN_FIELD
    ("a,b,c\n4,5,6\n ", dict(skip_blank_lines=False),
     DataFrame([["4", 5, 6], [" ", None, None]],
               columns=["a", "b", "c"]), None),

    # EAT_CRNL
    ("a,b,c\n4,5,6\n\r", dict(skip_blank_lines=False),
     DataFrame([[4, 5, 6], [None, None, None]],
               columns=["a", "b", "c"]), None),

    # ESCAPED_CHAR
    ("a,b,c\n4,5,6\n\\", dict(escapechar="\\"),
     None, "(EOF following escape character)|(unexpected end of data)"),

    # ESCAPE_IN_QUOTED_FIELD
    ('a,b,c\n4,5,6\n"\\', dict(escapechar="\\"),
     None, "(EOF inside string starting at row 2)|(unexpected end of data)"),

    # IN_QUOTED_FIELD
    ('a,b,c\n4,5,6\n"', dict(escapechar="\\"),
     None, "(EOF inside string starting at row 2)|(unexpected end of data)"),
], ids=["whitespace-line", "eat-line-comment", "eat-crnl-nop", "eat-comment",
        "skip-line", "eat-line-comment", "in-field", "eat-crnl",
        "escaped-char", "escape-in-quoted-field", "in-quoted-field"])
def test_eof_states(all_parsers, data, kwargs, expected, msg):
    # see gh-10728, gh-10548
    parser = all_parsers

    if expected is None:
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
    # see gh-12203
    parser = all_parsers
    data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""

    if usecols is None:
        # Make sure that an error is still raised
        # when the "usecols" parameter is not provided.
        msg = r"Expected \d+ fields in line \d+, saw \d+"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
    else:
        expected = DataFrame({
            "a": [0, 3, 8],
            "b": [1, 4, 9]
        })

        result = parser.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data,kwargs,expected", [
    # First, check to see that the response of parser when faced with no
    # provided columns raises the correct error, with or without usecols.
    ("", dict(), None),
    ("", dict(usecols=["X"]), None),
    (",,", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]),
     DataFrame(columns=["X"], index=[0], dtype=np.float64)),
    ("", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]),
     DataFrame(columns=["X"])),
])
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    # see gh-12493
    parser = all_parsers

    if expected is None:
        msg = "No columns to parse from file"
        with pytest.raises(EmptyDataError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs,expected", [
    # gh-8661, gh-8679: this should ignore six lines, including
    # lines with trailing whitespace and blank lines.
    (dict(header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6],
          skip_blank_lines=True),
     DataFrame([[1., 2., 4.], [5.1, np.nan, 10.]])),

    # gh-8983: test skipping set of rows after a row with trailing spaces.
    (dict(delim_whitespace=True, skiprows=[1, 2, 3, 5, 6],
          skip_blank_lines=True),
     DataFrame({"A": [1., 5.1], "B": [2., np.nan], "C": [4., 10]})),
])
def test_trailing_spaces(all_parsers, kwargs, expected):
    data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n"  # noqa
    parser = all_parsers

    result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
    tm.assert_frame_equal(result, expected)
  1144. def test_raise_on_sep_with_delim_whitespace(all_parsers):
  1145. # see gh-6607
  1146. data = "a b c\n1 2 3"
  1147. parser = all_parsers
  1148. with pytest.raises(ValueError, match="you can only specify one"):
  1149. parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
  1150. @pytest.mark.parametrize("delim_whitespace", [True, False])
  1151. def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
  1152. # see gh-9710
  1153. parser = all_parsers
  1154. data = """\
  1155. MyColumn
  1156. a
  1157. b
  1158. a
  1159. b\n"""
  1160. expected = DataFrame({"MyColumn": list("abab")})
  1161. result = parser.read_csv(StringIO(data), skipinitialspace=True,
  1162. delim_whitespace=delim_whitespace)
  1163. tm.assert_frame_equal(result, expected)
  1164. @pytest.mark.parametrize("sep,skip_blank_lines,exp_data", [
  1165. (",", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]),
  1166. (r"\s+", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]),
  1167. (",", False, [[1., 2., 4.], [np.nan, np.nan, np.nan],
  1168. [np.nan, np.nan, np.nan], [5., np.nan, 10.],
  1169. [np.nan, np.nan, np.nan], [-70., .4, 1.]]),
  1170. ])
  1171. def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
  1172. parser = all_parsers
  1173. data = """\
  1174. A,B,C
  1175. 1,2.,4.
  1176. 5.,NaN,10.0
  1177. -70,.4,1
  1178. """
  1179. if sep == r"\s+":
  1180. data = data.replace(",", " ")
  1181. result = parser.read_csv(StringIO(data), sep=sep,
  1182. skip_blank_lines=skip_blank_lines)
  1183. expected = DataFrame(exp_data, columns=["A", "B", "C"])
  1184. tm.assert_frame_equal(result, expected)
  1185. def test_whitespace_lines(all_parsers):
  1186. parser = all_parsers
  1187. data = """
  1188. \t \t\t
  1189. \t
  1190. A,B,C
  1191. \t 1,2.,4.
  1192. 5.,NaN,10.0
  1193. """
  1194. expected = DataFrame([[1, 2., 4.], [5., np.nan, 10.]],
  1195. columns=["A", "B", "C"])
  1196. result = parser.read_csv(StringIO(data))
  1197. tm.assert_frame_equal(result, expected)
  1198. @pytest.mark.parametrize("data,expected", [
  1199. (""" A B C D
  1200. a 1 2 3 4
  1201. b 1 2 3 4
  1202. c 1 2 3 4
  1203. """, DataFrame([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
  1204. columns=["A", "B", "C", "D"], index=["a", "b", "c"])),
  1205. (" a b c\n1 2 3 \n4 5 6\n 7 8 9",
  1206. DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])),
  1207. ])
  1208. def test_whitespace_regex_separator(all_parsers, data, expected):
  1209. # see gh-6607
  1210. parser = all_parsers
  1211. result = parser.read_csv(StringIO(data), sep=r"\s+")
  1212. tm.assert_frame_equal(result, expected)
  1213. def test_verbose_read(all_parsers, capsys):
  1214. parser = all_parsers
  1215. data = """a,b,c,d
  1216. one,1,2,3
  1217. one,1,2,3
  1218. ,1,2,3
  1219. one,1,2,3
  1220. ,1,2,3
  1221. ,1,2,3
  1222. one,1,2,3
  1223. two,1,2,3"""
  1224. # Engines are verbose in different ways.
  1225. parser.read_csv(StringIO(data), verbose=True)
  1226. captured = capsys.readouterr()
  1227. if parser.engine == "c":
  1228. assert "Tokenization took:" in captured.out
  1229. assert "Parser memory cleanup took:" in captured.out
  1230. else: # Python engine
  1231. assert captured.out == "Filled 3 NA values in column a\n"
def test_verbose_read2(all_parsers, capsys):
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

    parser.read_csv(StringIO(data), verbose=True, index_col=0)
    captured = capsys.readouterr()

    # Engines are verbose in different ways.
    if parser.engine == "c":
        assert "Tokenization took:" in captured.out
        assert "Parser memory cleanup took:" in captured.out
    else:  # Python engine
        assert captured.out == "Filled 1 NA values in column a\n"


def test_iteration_open_handle(all_parsers):
    parser = all_parsers
    kwargs = dict(squeeze=True, header=None)

    with tm.ensure_clean() as path:
        with open(path, "wb" if compat.PY2 else "w") as f:
            f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")

        with open(path, "rb" if compat.PY2 else "r") as f:
            for line in f:
                if "CCC" in line:
                    break

            if parser.engine == "c" and compat.PY2:
                msg = "Mixing iteration and read methods would lose data"
                with pytest.raises(ValueError, match=msg):
                    parser.read_csv(f, **kwargs)
            else:
                result = parser.read_csv(f, **kwargs)
                expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0)
                tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("data,thousands,decimal", [
    ("""A|B|C
1|2,334.01|5
10|13|10.
""", ",", "."),
    ("""A|B|C
1|2.334,01|5
10|13|10,
""", ".", ","),
])
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
    parser = all_parsers
    expected = DataFrame({
        "A": [1, 10],
        "B": [2334.01, 13],
        "C": [5, 10.]
    })

    result = parser.read_csv(StringIO(data), sep="|",
                             thousands=thousands,
                             decimal=decimal)
    tm.assert_frame_equal(result, expected)


def test_euro_decimal_format(all_parsers):
    parser = all_parsers
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    result = parser.read_csv(StringIO(data), sep=";", decimal=",")
    expected = DataFrame([
        [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
        [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
        [3, 878.158, 108013.434, "GHI", "rez", 2.735694704]
    ], columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
    parser = all_parsers
    data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
    expected = DataFrame({"A": [float("inf"), float("-inf")] * 5},
                         index=["a", "b", "c", "d", "e",
                                "f", "g", "h", "i", "j"])
    result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
    parser = all_parsers
    data = "\n" * nrows

    msg = "No columns to parse from file"
    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data))


def test_memory_map(all_parsers, csv_dir_path):
    mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
    parser = all_parsers

    expected = DataFrame({
        "a": [1, 2, 3],
        "b": ["one", "two", "three"],
        "c": ["I", "II", "III"]
    })

    result = parser.read_csv(mmap_file, memory_map=True)
    tm.assert_frame_equal(result, expected)


def test_null_byte_char(all_parsers):
    # see gh-2741
    data = "\x00,foo"
    names = ["a", "b"]
    parser = all_parsers

    if parser.engine == "c":
        expected = DataFrame([[np.nan, "foo"]], columns=names)
        out = parser.read_csv(StringIO(data), names=names)
        tm.assert_frame_equal(out, expected)
    else:
        msg = "NULL byte detected"
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), names=names)


@pytest.mark.parametrize("data,kwargs,expected", [
    # Basic test
    ("a\n1", dict(), DataFrame({"a": [1]})),

    # "Regular" quoting
    ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})),

    # Test in a data row instead of header
    ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})),

    # Test in empty data row with skipping
    ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})),

    # Test in empty data row without skipping
    ("\n1", dict(names=["a"], skip_blank_lines=False),
     DataFrame({"a": [np.nan, 1]})),
])
def test_utf8_bom(all_parsers, data, kwargs, expected):
    # see gh-4793
    parser = all_parsers
    bom = u("\ufeff")
    utf8 = "utf-8"

    def _encode_data_with_bom(_data):
        bom_data = (bom + _data).encode(utf8)
        return BytesIO(bom_data)

    result = parser.read_csv(_encode_data_with_bom(data),
                             encoding=utf8, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_temporary_file(all_parsers):
    # see gh-13398
    parser = all_parsers
    data = "0 0"

    new_file = TemporaryFile("w+")
    new_file.write(data)
    new_file.flush()
    new_file.seek(0)

    result = parser.read_csv(new_file, sep=r"\s+", header=None)
    new_file.close()

    expected = DataFrame([[0, 0]])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("byte", [8, 16])
@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}",
                                 "UTF-{0}", "UTF_{0}"])
def test_read_csv_utf_aliases(all_parsers, byte, fmt):
    # see gh-13549
    expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
    parser = all_parsers

    encoding = fmt.format(byte)
    data = "mb_num,multibyte\n4.8,test".encode(encoding)

    result = parser.read_csv(BytesIO(data), encoding=encoding)
    tm.assert_frame_equal(result, expected)


def test_internal_eof_byte(all_parsers):
    # see gh-5500
    parser = all_parsers
    data = "a,b\n1\x1a,2"

    expected = DataFrame([["1\x1a", 2]], columns=["a", "b"])
    result = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(result, expected)


def test_internal_eof_byte_to_file(all_parsers):
    # see gh-16559
    parser = all_parsers
    data = b'c1,c2\r\n"test \x1a test", test\r\n'
    expected = DataFrame([["test \x1a test", " test"]],
                         columns=["c1", "c2"])
    path = "__%s__.csv" % tm.rands(10)

    with tm.ensure_clean(path) as path:
        with open(path, "wb") as f:
            f.write(data)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, expected)


def test_sub_character(all_parsers, csv_dir_path):
    # see gh-16893
    filename = os.path.join(csv_dir_path, "sub_char.csv")
    expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])

    parser = all_parsers
    result = parser.read_csv(filename)
    tm.assert_frame_equal(result, expected)


def test_file_handle_string_io(all_parsers):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = all_parsers
    data = "a,b\n1,2"

    fh = StringIO(data)
    parser.read_csv(fh)
    assert not fh.closed


def test_file_handles_with_open(all_parsers, csv1):
    # gh-14418
    #
    # Don't close user provided file handles.
    parser = all_parsers

    with open(csv1, "r") as f:
        parser.read_csv(f)
        assert not f.closed


def test_invalid_file_buffer_class(all_parsers):
    # see gh-15337
    class InvalidBuffer(object):
        pass

    parser = all_parsers
    msg = "Invalid file path or buffer object type"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(InvalidBuffer())


def test_invalid_file_buffer_mock(all_parsers):
    # see gh-15337
    parser = all_parsers
    msg = "Invalid file path or buffer object type"

    class Foo():
        pass

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(Foo())


def test_valid_file_buffer_seems_invalid(all_parsers):
    # gh-16135: we want to ensure that "tell" and "seek"
    # aren't actually being used when we call `read_csv`
    #
    # Thus, while the object may look "invalid" (these
    # methods are attributes of the `StringIO` class),
    # it is still a valid file-object for our purposes.
    class NoSeekTellBuffer(StringIO):
        def tell(self):
            raise AttributeError("No tell method")

        def seek(self, pos, whence=0):
            raise AttributeError("No seek method")

    data = "a\n1"
    parser = all_parsers
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(NoSeekTellBuffer(data))
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [
    dict(),  # Default is True.
    dict(error_bad_lines=True),  # Explicitly pass in.
])
@pytest.mark.parametrize("warn_kwargs", [
    dict(), dict(warn_bad_lines=True),
    dict(warn_bad_lines=False)
])
def test_error_bad_lines(all_parsers, kwargs, warn_kwargs):
    # see gh-15925
    parser = all_parsers
    kwargs.update(**warn_kwargs)
    data = "a\n1\n1,2,3\n4\n5,6,7"

    msg = "Expected 1 fields in line 3, saw 3"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


def test_warn_bad_lines(all_parsers, capsys):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected = DataFrame({"a": [1, 4]})

    result = parser.read_csv(StringIO(data),
                             error_bad_lines=False,
                             warn_bad_lines=True)
    tm.assert_frame_equal(result, expected)

    captured = capsys.readouterr()
    assert "Skipping line 3" in captured.err
    assert "Skipping line 5" in captured.err


def test_suppress_error_output(all_parsers, capsys):
    # see gh-15925
    parser = all_parsers
    data = "a\n1\n1,2,3\n4\n5,6,7"
    expected = DataFrame({"a": [1, 4]})

    result = parser.read_csv(StringIO(data),
                             error_bad_lines=False,
                             warn_bad_lines=False)
    tm.assert_frame_equal(result, expected)

    captured = capsys.readouterr()
    assert captured.err == ""


def test_filename_with_special_chars(all_parsers):
    # see gh-15086.
    parser = all_parsers
    df = DataFrame({"a": [1, 2, 3]})

    with tm.ensure_clean("sé-es-vé.csv") as path:
        df.to_csv(path, index=False)

        result = parser.read_csv(path)
        tm.assert_frame_equal(result, df)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Let's just make sure that we don't crash
    # as we iteratively process all chunks.
    parser = all_parsers

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            for i in range(1000):
                f.write(str(i) + "\n")

        result = parser.read_csv(path, chunksize=20)

        for _ in result:
            pass


def test_read_table_deprecated(all_parsers):
    # see gh-21948
    parser = all_parsers
    data = "a\tb\n1\t2\n3\t4"
    expected = parser.read_csv(StringIO(data), sep="\t")

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = parser.read_table(StringIO(data))
    tm.assert_frame_equal(result, expected)