test_compression.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests compressed data parsing functionality for all
  4. of the parsers defined in parsers.py
  5. """
  6. import os
  7. import zipfile
  8. import pytest
  9. import pandas as pd
  10. import pandas.util.testing as tm
  11. @pytest.fixture(params=[True, False])
  12. def buffer(request):
  13. return request.param
  14. @pytest.fixture
  15. def parser_and_data(all_parsers, csv1):
  16. parser = all_parsers
  17. with open(csv1, "rb") as f:
  18. data = f.read()
  19. expected = parser.read_csv(csv1)
  20. return parser, data, expected
  21. @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
  22. def test_zip(parser_and_data, compression):
  23. parser, data, expected = parser_and_data
  24. with tm.ensure_clean("test_file.zip") as path:
  25. with zipfile.ZipFile(path, mode="w") as tmp:
  26. tmp.writestr("test_file", data)
  27. if compression == "zip2":
  28. with open(path, "rb") as f:
  29. result = parser.read_csv(f, compression="zip")
  30. else:
  31. result = parser.read_csv(path, compression=compression)
  32. tm.assert_frame_equal(result, expected)
  33. @pytest.mark.parametrize("compression", ["zip", "infer"])
  34. def test_zip_error_multiple_files(parser_and_data, compression):
  35. parser, data, expected = parser_and_data
  36. with tm.ensure_clean("combined_zip.zip") as path:
  37. inner_file_names = ["test_file", "second_file"]
  38. with zipfile.ZipFile(path, mode="w") as tmp:
  39. for file_name in inner_file_names:
  40. tmp.writestr(file_name, data)
  41. with pytest.raises(ValueError, match="Multiple files"):
  42. parser.read_csv(path, compression=compression)
  43. def test_zip_error_no_files(parser_and_data):
  44. parser, _, _ = parser_and_data
  45. with tm.ensure_clean() as path:
  46. with zipfile.ZipFile(path, mode="w"):
  47. pass
  48. with pytest.raises(ValueError, match="Zero files"):
  49. parser.read_csv(path, compression="zip")
  50. def test_zip_error_invalid_zip(parser_and_data):
  51. parser, _, _ = parser_and_data
  52. with tm.ensure_clean() as path:
  53. with open(path, "wb") as f:
  54. with pytest.raises(zipfile.BadZipfile,
  55. match="File is not a zip file"):
  56. parser.read_csv(f, compression="zip")
  57. @pytest.mark.parametrize("filename", [None, "test.{ext}"])
  58. def test_compression(parser_and_data, compression_only, buffer, filename):
  59. parser, data, expected = parser_and_data
  60. compress_type = compression_only
  61. ext = "gz" if compress_type == "gzip" else compress_type
  62. filename = filename if filename is None else filename.format(ext=ext)
  63. if filename and buffer:
  64. pytest.skip("Cannot deduce compression from "
  65. "buffer of compressed data.")
  66. with tm.ensure_clean(filename=filename) as path:
  67. tm.write_to_compressed(compress_type, path, data)
  68. compression = "infer" if filename else compress_type
  69. if buffer:
  70. with open(path, "rb") as f:
  71. result = parser.read_csv(f, compression=compression)
  72. else:
  73. result = parser.read_csv(path, compression=compression)
  74. tm.assert_frame_equal(result, expected)
  75. @pytest.mark.parametrize("ext", [None, "gz", "bz2"])
  76. def test_infer_compression(all_parsers, csv1, buffer, ext):
  77. # see gh-9770
  78. parser = all_parsers
  79. kwargs = dict(index_col=0, parse_dates=True)
  80. expected = parser.read_csv(csv1, **kwargs)
  81. kwargs["compression"] = "infer"
  82. if buffer:
  83. with open(csv1) as f:
  84. result = parser.read_csv(f, **kwargs)
  85. else:
  86. ext = "." + ext if ext else ""
  87. result = parser.read_csv(csv1 + ext, **kwargs)
  88. tm.assert_frame_equal(result, expected)
  89. def test_compression_utf16_encoding(all_parsers, csv_dir_path):
  90. # see gh-18071
  91. parser = all_parsers
  92. path = os.path.join(csv_dir_path, "utf16_ex_small.zip")
  93. result = parser.read_csv(path, encoding="utf-16",
  94. compression="zip", sep="\t")
  95. expected = pd.DataFrame({
  96. u"Country": [u"Venezuela", u"Venezuela"],
  97. u"Twitter": [u"Hugo Chávez Frías", u"Henrique Capriles R."]
  98. })
  99. tm.assert_frame_equal(result, expected)
  100. @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
  101. def test_invalid_compression(all_parsers, invalid_compression):
  102. parser = all_parsers
  103. compress_kwargs = dict(compression=invalid_compression)
  104. msg = ("Unrecognized compression "
  105. "type: {compression}".format(**compress_kwargs))
  106. with pytest.raises(ValueError, match=msg):
  107. parser.read_csv("test_file.zip", **compress_kwargs)