test_textreader.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests the TextReader class in parsers.pyx, which
  4. is integral to the C engine in parsers.py
  5. """
  6. import os
  7. import numpy as np
  8. from numpy import nan
  9. import pytest
  10. import pandas._libs.parsers as parser
  11. from pandas._libs.parsers import TextReader
  12. import pandas.compat as compat
  13. from pandas.compat import BytesIO, StringIO, map
  14. from pandas import DataFrame
  15. import pandas.util.testing as tm
  16. from pandas.util.testing import assert_frame_equal
  17. from pandas.io.parsers import TextFileReader, read_csv
  18. class TestTextReader(object):
  19. @pytest.fixture(autouse=True)
  20. def setup_method(self, datapath):
  21. self.dirpath = datapath('io', 'parser', 'data')
  22. self.csv1 = os.path.join(self.dirpath, 'test1.csv')
  23. self.csv2 = os.path.join(self.dirpath, 'test2.csv')
  24. self.xls1 = os.path.join(self.dirpath, 'test.xls')
  25. def test_file_handle(self):
  26. with open(self.csv1, 'rb') as f:
  27. reader = TextReader(f)
  28. reader.read()
  29. def test_string_filename(self):
  30. reader = TextReader(self.csv1, header=None)
  31. reader.read()
  32. def test_file_handle_mmap(self):
  33. with open(self.csv1, 'rb') as f:
  34. reader = TextReader(f, memory_map=True, header=None)
  35. reader.read()
  36. def test_StringIO(self):
  37. with open(self.csv1, 'rb') as f:
  38. text = f.read()
  39. src = BytesIO(text)
  40. reader = TextReader(src, header=None)
  41. reader.read()
  42. def test_string_factorize(self):
  43. # should this be optional?
  44. data = 'a\nb\na\nb\na'
  45. reader = TextReader(StringIO(data), header=None)
  46. result = reader.read()
  47. assert len(set(map(id, result[0]))) == 2
  48. def test_skipinitialspace(self):
  49. data = ('a, b\n'
  50. 'a, b\n'
  51. 'a, b\n'
  52. 'a, b')
  53. reader = TextReader(StringIO(data), skipinitialspace=True,
  54. header=None)
  55. result = reader.read()
  56. tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'],
  57. dtype=np.object_))
  58. tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'],
  59. dtype=np.object_))
  60. def test_parse_booleans(self):
  61. data = 'True\nFalse\nTrue\nTrue'
  62. reader = TextReader(StringIO(data), header=None)
  63. result = reader.read()
  64. assert result[0].dtype == np.bool_
  65. def test_delimit_whitespace(self):
  66. data = 'a b\na\t\t "b"\n"a"\t \t b'
  67. reader = TextReader(StringIO(data), delim_whitespace=True,
  68. header=None)
  69. result = reader.read()
  70. tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'],
  71. dtype=np.object_))
  72. tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'],
  73. dtype=np.object_))
  74. def test_embedded_newline(self):
  75. data = 'a\n"hello\nthere"\nthis'
  76. reader = TextReader(StringIO(data), header=None)
  77. result = reader.read()
  78. expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_)
  79. tm.assert_numpy_array_equal(result[0], expected)
  80. def test_euro_decimal(self):
  81. data = '12345,67\n345,678'
  82. reader = TextReader(StringIO(data), delimiter=':',
  83. decimal=',', header=None)
  84. result = reader.read()
  85. expected = np.array([12345.67, 345.678])
  86. tm.assert_almost_equal(result[0], expected)
  87. def test_integer_thousands(self):
  88. data = '123,456\n12,500'
  89. reader = TextReader(StringIO(data), delimiter=':',
  90. thousands=',', header=None)
  91. result = reader.read()
  92. expected = np.array([123456, 12500], dtype=np.int64)
  93. tm.assert_almost_equal(result[0], expected)
  94. def test_integer_thousands_alt(self):
  95. data = '123.456\n12.500'
  96. reader = TextFileReader(StringIO(data), delimiter=':',
  97. thousands='.', header=None)
  98. result = reader.read()
  99. expected = DataFrame([123456, 12500])
  100. tm.assert_frame_equal(result, expected)
  101. def test_skip_bad_lines(self, capsys):
  102. # too many lines, see #2430 for why
  103. data = ('a:b:c\n'
  104. 'd:e:f\n'
  105. 'g:h:i\n'
  106. 'j:k:l:m\n'
  107. 'l:m:n\n'
  108. 'o:p:q:r')
  109. reader = TextReader(StringIO(data), delimiter=':',
  110. header=None)
  111. msg = (r"Error tokenizing data\. C error: Expected 3 fields in"
  112. " line 4, saw 4")
  113. with pytest.raises(parser.ParserError, match=msg):
  114. reader.read()
  115. reader = TextReader(StringIO(data), delimiter=':',
  116. header=None,
  117. error_bad_lines=False,
  118. warn_bad_lines=False)
  119. result = reader.read()
  120. expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object),
  121. 1: np.array(['b', 'e', 'h', 'm'], dtype=object),
  122. 2: np.array(['c', 'f', 'i', 'n'], dtype=object)}
  123. assert_array_dicts_equal(result, expected)
  124. reader = TextReader(StringIO(data), delimiter=':',
  125. header=None,
  126. error_bad_lines=False,
  127. warn_bad_lines=True)
  128. reader.read()
  129. captured = capsys.readouterr()
  130. assert 'Skipping line 4' in captured.err
  131. assert 'Skipping line 6' in captured.err
  132. def test_header_not_enough_lines(self):
  133. data = ('skip this\n'
  134. 'skip this\n'
  135. 'a,b,c\n'
  136. '1,2,3\n'
  137. '4,5,6')
  138. reader = TextReader(StringIO(data), delimiter=',', header=2)
  139. header = reader.header
  140. expected = [['a', 'b', 'c']]
  141. assert header == expected
  142. recs = reader.read()
  143. expected = {0: np.array([1, 4], dtype=np.int64),
  144. 1: np.array([2, 5], dtype=np.int64),
  145. 2: np.array([3, 6], dtype=np.int64)}
  146. assert_array_dicts_equal(recs, expected)
  147. def test_escapechar(self):
  148. data = ('\\"hello world\"\n'
  149. '\\"hello world\"\n'
  150. '\\"hello world\"')
  151. reader = TextReader(StringIO(data), delimiter=',', header=None,
  152. escapechar='\\')
  153. result = reader.read()
  154. expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
  155. assert_array_dicts_equal(result, expected)
  156. def test_eof_has_eol(self):
  157. # handling of new line at EOF
  158. pass
  159. def test_na_substitution(self):
  160. pass
  161. def test_numpy_string_dtype(self):
  162. data = """\
  163. a,1
  164. aa,2
  165. aaa,3
  166. aaaa,4
  167. aaaaa,5"""
  168. def _make_reader(**kwds):
  169. return TextReader(StringIO(data), delimiter=',', header=None,
  170. **kwds)
  171. reader = _make_reader(dtype='S5,i4')
  172. result = reader.read()
  173. assert result[0].dtype == 'S5'
  174. ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5')
  175. assert (result[0] == ex_values).all()
  176. assert result[1].dtype == 'i4'
  177. reader = _make_reader(dtype='S4')
  178. result = reader.read()
  179. assert result[0].dtype == 'S4'
  180. ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4')
  181. assert (result[0] == ex_values).all()
  182. assert result[1].dtype == 'S4'
  183. def test_pass_dtype(self):
  184. data = """\
  185. one,two
  186. 1,a
  187. 2,b
  188. 3,c
  189. 4,d"""
  190. def _make_reader(**kwds):
  191. return TextReader(StringIO(data), delimiter=',', **kwds)
  192. reader = _make_reader(dtype={'one': 'u1', 1: 'S1'})
  193. result = reader.read()
  194. assert result[0].dtype == 'u1'
  195. assert result[1].dtype == 'S1'
  196. reader = _make_reader(dtype={'one': np.uint8, 1: object})
  197. result = reader.read()
  198. assert result[0].dtype == 'u1'
  199. assert result[1].dtype == 'O'
  200. reader = _make_reader(dtype={'one': np.dtype('u1'),
  201. 1: np.dtype('O')})
  202. result = reader.read()
  203. assert result[0].dtype == 'u1'
  204. assert result[1].dtype == 'O'
  205. def test_usecols(self):
  206. data = """\
  207. a,b,c
  208. 1,2,3
  209. 4,5,6
  210. 7,8,9
  211. 10,11,12"""
  212. def _make_reader(**kwds):
  213. return TextReader(StringIO(data), delimiter=',', **kwds)
  214. reader = _make_reader(usecols=(1, 2))
  215. result = reader.read()
  216. exp = _make_reader().read()
  217. assert len(result) == 2
  218. assert (result[1] == exp[1]).all()
  219. assert (result[2] == exp[2]).all()
  220. def test_cr_delimited(self):
  221. def _test(text, **kwargs):
  222. nice_text = text.replace('\r', '\r\n')
  223. result = TextReader(StringIO(text), **kwargs).read()
  224. expected = TextReader(StringIO(nice_text), **kwargs).read()
  225. assert_array_dicts_equal(result, expected)
  226. data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
  227. _test(data, delimiter=',')
  228. data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
  229. _test(data, delim_whitespace=True)
  230. data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
  231. _test(data, delimiter=',')
  232. sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
  233. 'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
  234. ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
  235. _test(sample, delimiter=',')
  236. data = 'A B C\r 2 3\r4 5 6'
  237. _test(data, delim_whitespace=True)
  238. data = 'A B C\r2 3\r4 5 6'
  239. _test(data, delim_whitespace=True)
  240. def test_empty_field_eof(self):
  241. data = 'a,b,c\n1,2,3\n4,,'
  242. result = TextReader(StringIO(data), delimiter=',').read()
  243. expected = {0: np.array([1, 4], dtype=np.int64),
  244. 1: np.array(['2', ''], dtype=object),
  245. 2: np.array(['3', ''], dtype=object)}
  246. assert_array_dicts_equal(result, expected)
  247. # GH5664
  248. a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
  249. b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
  250. columns=list('abcd'),
  251. index=[1, 1])
  252. c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
  253. [8, 9, 10, 11], [13, 14, nan, nan]],
  254. columns=list('abcd'),
  255. index=[0, 5, 7, 12])
  256. for _ in range(100):
  257. df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
  258. names=['a'], engine='c')
  259. assert_frame_equal(df, a)
  260. df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
  261. names=list("abcd"), engine='c')
  262. assert_frame_equal(df, b)
  263. df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
  264. names=list('abcd'), engine='c')
  265. assert_frame_equal(df, c)
  266. def test_empty_csv_input(self):
  267. # GH14867
  268. df = read_csv(StringIO(), chunksize=20, header=None,
  269. names=['a', 'b', 'c'])
  270. assert isinstance(df, TextFileReader)
  271. def assert_array_dicts_equal(left, right):
  272. for k, v in compat.iteritems(left):
  273. assert tm.assert_numpy_array_equal(np.asarray(v),
  274. np.asarray(right[k]))