test_xport.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. import os
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. import pandas.util.testing as tm
  6. from pandas.io.sas.sasreader import read_sas
  7. # CSV versions of test xpt files were obtained using the R foreign library
  8. # Numbers in a SAS xport file are always float64, so need to convert
  9. # before making comparisons.
  10. def numeric_as_float(data):
  11. for v in data.columns:
  12. if data[v].dtype is np.dtype('int64'):
  13. data[v] = data[v].astype(np.float64)
  14. class TestXport(object):
  15. @pytest.fixture(autouse=True)
  16. def setup_method(self, datapath):
  17. self.dirpath = datapath("io", "sas", "data")
  18. self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
  19. self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
  20. self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
  21. self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
  22. def test1_basic(self):
  23. # Tests with DEMO_G.xpt (all numeric file)
  24. # Compare to this
  25. data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
  26. numeric_as_float(data_csv)
  27. # Read full file
  28. data = read_sas(self.file01, format="xport")
  29. tm.assert_frame_equal(data, data_csv)
  30. num_rows = data.shape[0]
  31. # Test reading beyond end of file
  32. reader = read_sas(self.file01, format="xport", iterator=True)
  33. data = reader.read(num_rows + 100)
  34. assert data.shape[0] == num_rows
  35. reader.close()
  36. # Test incremental read with `read` method.
  37. reader = read_sas(self.file01, format="xport", iterator=True)
  38. data = reader.read(10)
  39. reader.close()
  40. tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
  41. # Test incremental read with `get_chunk` method.
  42. reader = read_sas(self.file01, format="xport", chunksize=10)
  43. data = reader.get_chunk()
  44. reader.close()
  45. tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
  46. # Test read in loop
  47. m = 0
  48. reader = read_sas(self.file01, format="xport", chunksize=100)
  49. for x in reader:
  50. m += x.shape[0]
  51. reader.close()
  52. assert m == num_rows
  53. # Read full file with `read_sas` method
  54. data = read_sas(self.file01)
  55. tm.assert_frame_equal(data, data_csv)
  56. def test1_index(self):
  57. # Tests with DEMO_G.xpt using index (all numeric file)
  58. # Compare to this
  59. data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
  60. data_csv = data_csv.set_index("SEQN")
  61. numeric_as_float(data_csv)
  62. # Read full file
  63. data = read_sas(self.file01, index="SEQN", format="xport")
  64. tm.assert_frame_equal(data, data_csv, check_index_type=False)
  65. # Test incremental read with `read` method.
  66. reader = read_sas(self.file01, index="SEQN", format="xport",
  67. iterator=True)
  68. data = reader.read(10)
  69. reader.close()
  70. tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
  71. check_index_type=False)
  72. # Test incremental read with `get_chunk` method.
  73. reader = read_sas(self.file01, index="SEQN", format="xport",
  74. chunksize=10)
  75. data = reader.get_chunk()
  76. reader.close()
  77. tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
  78. check_index_type=False)
  79. def test1_incremental(self):
  80. # Test with DEMO_G.xpt, reading full file incrementally
  81. data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
  82. data_csv = data_csv.set_index("SEQN")
  83. numeric_as_float(data_csv)
  84. reader = read_sas(self.file01, index="SEQN", chunksize=1000)
  85. all_data = [x for x in reader]
  86. data = pd.concat(all_data, axis=0)
  87. tm.assert_frame_equal(data, data_csv, check_index_type=False)
  88. def test2(self):
  89. # Test with SSHSV1_A.xpt
  90. # Compare to this
  91. data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
  92. numeric_as_float(data_csv)
  93. data = read_sas(self.file02)
  94. tm.assert_frame_equal(data, data_csv)
  95. def test_multiple_types(self):
  96. # Test with DRXFCD_G.xpt (contains text and numeric variables)
  97. # Compare to this
  98. data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))
  99. data = read_sas(self.file03, encoding="utf-8")
  100. tm.assert_frame_equal(data, data_csv)
  101. def test_truncated_float_support(self):
  102. # Test with paxraw_d_short.xpt, a shortened version of:
  103. # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
  104. # This file has truncated floats (5 bytes in this case).
  105. # GH 11713
  106. data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
  107. data = read_sas(self.file04, format="xport")
  108. tm.assert_frame_equal(data.astype('int64'), data_csv)