test_network.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests parsers' ability to read and parse non-local files,
  4. which hence require a network connection to be read.
  5. """
  6. import logging
  7. import numpy as np
  8. import pytest
  9. from pandas.compat import BytesIO, StringIO
  10. import pandas.util._test_decorators as td
  11. from pandas import DataFrame
  12. import pandas.util.testing as tm
  13. from pandas.io.parsers import read_csv
  @pytest.mark.network
  @pytest.mark.parametrize(
      "compress_type, extension",
      [
          ('gzip', '.gz'),
          ('bz2', '.bz2'),
          ('zip', '.zip'),
          pytest.param('xz', '.xz', marks=td.skip_if_no_lzma),
      ],
  )
  @pytest.mark.parametrize('mode', ['explicit', 'infer'])
  @pytest.mark.parametrize('engine', ['python', 'c'])
  def test_compressed_urls(salaries_table, compress_type, extension, mode,
                           engine):
      """Parametrized entry point for the compressed-URL checks.

      Each combination of compression type / file extension, ``mode`` and
      parser ``engine`` is forwarded unchanged to ``check_compressed_urls``,
      which performs the actual read and comparison.
      """
      check_compressed_urls(
          salaries_table, compress_type, extension, mode, engine)
  @tm.network
  def check_compressed_urls(salaries_table, compression, extension, mode,
                            engine):
      """Read a compressed ``salaries.csv`` from GitHub and compare frames.

      Parameters
      ----------
      salaries_table : DataFrame
          Expected contents; the frame read from the URL must equal it.
      compression : str
          Compression name handed to ``read_csv`` when ``mode`` is
          ``'explicit'``.
      extension : str
          Suffix appended to the base URL to select the compressed file.
      mode : str
          ``'explicit'`` to use ``compression`` as given; any other value is
          itself passed as the ``compression`` argument (e.g. ``'infer'``).
      engine : str
          Parser engine forwarded to ``read_csv``.
      """
      # test reading compressed urls with various engines and
      # extension inference
      salaries_url = ('https://github.com/pandas-dev/pandas/raw/master/'
                      'pandas/tests/io/parser/data/salaries.csv')
      target = salaries_url + extension

      # Outside explicit mode the mode string doubles as the compression
      # argument.
      chosen = compression if mode == 'explicit' else mode

      fetched = read_csv(target, sep='\t', compression=chosen, engine=engine)
      tm.assert_frame_equal(fetched, salaries_table)
  @pytest.fixture
  def tips_df(datapath):
      """Fixture returning the tips dataset parsed from the local CSV file.

      The file location is resolved through the ``datapath`` fixture as
      ``io/parser/data/tips.csv``.
      """
      tips_path = datapath('io', 'parser', 'data', 'tips.csv')
      return read_csv(tips_path)
  @pytest.mark.usefixtures("s3_resource")
  @td.skip_if_not_us_locale()
  class TestS3(object):
      """Tests for ``read_csv`` against ``s3://``, ``s3n://`` and ``s3a://`` URLs.

      The ``s3_resource`` fixture (not defined in the visible part of this
      file) is applied to every test through the ``usefixtures`` mark. Most
      tests compare what is read from the ``pandas-test`` bucket with the
      local ``tips_df`` frame.
      """

      # (URL suffix, ``compression`` argument) pairs for the stored variants
      # of ``tips.csv`` that the explicit-compression tests iterate over.
      _EXT_COMP = (('', None), ('.gz', 'gzip'), ('.bz2', 'bz2'))

      def test_parse_public_s3_bucket(self, tips_df):
          """Read each ``tips.csv`` variant with an explicit compression."""
          pytest.importorskip('s3fs')

          # more of an integration test due to the not-public contents portion
          # can probably mock this though.
          for ext, comp in self._EXT_COMP:
              df = read_csv('s3://pandas-test/tips.csv' +
                            ext, compression=comp)
              assert isinstance(df, DataFrame)
              assert not df.empty
              tm.assert_frame_equal(df, tips_df)

          # Read public file from bucket with not-public contents
          df = read_csv('s3://cant_get_it/tips.csv')
          assert isinstance(df, DataFrame)
          assert not df.empty
          tm.assert_frame_equal(df, tips_df)

      def test_parse_public_s3n_bucket(self, tips_df):
          """Read the first 10 rows through an ``s3n://`` URL."""
          # Read from AWS s3 as "s3n" URL
          df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
          assert isinstance(df, DataFrame)
          assert not df.empty
          tm.assert_frame_equal(tips_df.iloc[:10], df)

      def test_parse_public_s3a_bucket(self, tips_df):
          """Read the first 10 rows through an ``s3a://`` URL."""
          # Read from AWS s3 as "s3a" URL
          df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
          assert isinstance(df, DataFrame)
          assert not df.empty
          tm.assert_frame_equal(tips_df.iloc[:10], df)

      def test_parse_public_s3_bucket_nrows(self, tips_df):
          """Read the first 10 rows of each ``tips.csv`` variant."""
          for ext, comp in self._EXT_COMP:
              df = read_csv('s3://pandas-test/tips.csv' +
                            ext, nrows=10, compression=comp)
              assert isinstance(df, DataFrame)
              assert not df.empty
              tm.assert_frame_equal(tips_df.iloc[:10], df)

      def test_parse_public_s3_bucket_chunked(self, tips_df):
          """Read each variant in chunks of 5 rows and check three chunks."""
          # Read with a chunksize
          chunksize = 5
          for ext, comp in self._EXT_COMP:
              df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                   chunksize=chunksize, compression=comp)
              assert df_reader.chunksize == chunksize
              for i_chunk in [0, 1, 2]:
                  # Read a couple of chunks and make sure we see them
                  # properly.
                  df = df_reader.get_chunk()
                  assert isinstance(df, DataFrame)
                  assert not df.empty
                  # Rows of the local frame that chunk ``i_chunk`` must equal.
                  true_df = tips_df.iloc[
                      chunksize * i_chunk: chunksize * (i_chunk + 1)]
                  tm.assert_frame_equal(true_df, df)

      def test_parse_public_s3_bucket_chunked_python(self, tips_df):
          """Chunked read of each variant using the Python parser engine."""
          # Read with a chunksize using the Python parser
          chunksize = 5
          for ext, comp in self._EXT_COMP:
              df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                   chunksize=chunksize, compression=comp,
                                   engine='python')
              assert df_reader.chunksize == chunksize
              for i_chunk in [0, 1, 2]:
                  # Read a couple of chunks and make sure we see them properly.
                  df = df_reader.get_chunk()
                  assert isinstance(df, DataFrame)
                  assert not df.empty
                  # Rows of the local frame that chunk ``i_chunk`` must equal.
                  true_df = tips_df.iloc[
                      chunksize * i_chunk: chunksize * (i_chunk + 1)]
                  tm.assert_frame_equal(true_df, df)

      def test_parse_public_s3_bucket_python(self, tips_df):
          """Read each ``tips.csv`` variant with the Python parser engine."""
          for ext, comp in self._EXT_COMP:
              df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                            compression=comp)
              assert isinstance(df, DataFrame)
              assert not df.empty
              tm.assert_frame_equal(df, tips_df)

      def test_infer_s3_compression(self, tips_df):
          """Read each variant with ``compression='infer'`` (Python engine)."""
          for ext in ['', '.gz', '.bz2']:
              df = read_csv('s3://pandas-test/tips.csv' + ext,
                            engine='python', compression='infer')
              assert isinstance(df, DataFrame)
              assert not df.empty
              tm.assert_frame_equal(df, tips_df)

      def test_parse_public_s3_bucket_nrows_python(self, tips_df):
          """Read the first 10 rows of each variant with the Python engine."""
          for ext, comp in self._EXT_COMP:
              df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                            nrows=10, compression=comp)
              assert isinstance(df, DataFrame)
              assert not df.empty
              tm.assert_frame_equal(tips_df.iloc[:10], df)

      def test_s3_fails(self):
          """``read_csv`` raises ``IOError`` for the two unreadable URLs."""
          with pytest.raises(IOError):
              read_csv('s3://nyqpug/asdf.csv')

          # Receive a permission error when trying to read a private bucket.
          # It's irrelevant here that this isn't actually a table.
          with pytest.raises(IOError):
              read_csv('s3://cant_get_it/')

      def test_read_csv_handles_boto_s3_object(self,
                                               s3_resource,
                                               tips_file):
          """Parse the bytes of an S3 object body fetched via the client."""
          # see gh-16135
          s3_object = s3_resource.meta.client.get_object(
              Bucket='pandas-test',
              Key='tips.csv')

          result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
          assert isinstance(result, DataFrame)
          assert not result.empty

          expected = read_csv(tips_file)
          tm.assert_frame_equal(result, expected)

      def test_read_csv_chunked_download(self, s3_resource, caplog):
          """Reading 5 rows of a large file logs a (0, 5505024) range fetch."""
          # 8 MB, S3FS uses 5MB chunks
          df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
          str_buf = StringIO()

          df.to_csv(str_buf)

          # The CSV text is encoded to bytes before being uploaded.
          buf = BytesIO(str_buf.getvalue().encode('utf-8'))

          s3_resource.Bucket("pandas-test").put_object(
              Key="large-file.csv",
              Body=buf)

          with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
              read_csv("s3://pandas-test/large-file.csv", nrows=5)
              # log of fetch_range (start, stop)
              assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})