# pandas/io/sas/sas_xport.py
  1. """
  2. Read a SAS XPort format file into a Pandas DataFrame.
  3. Based on code from Jack Cushman (github.com/jcushman/xport).
  4. The file format is defined here:
  5. https://support.sas.com/techsup/technote/ts140.pdf
  6. """
  7. from datetime import datetime
  8. import struct
  9. import warnings
  10. import numpy as np
  11. from pandas.util._decorators import Appender
  12. import pandas as pd
  13. from pandas import compat
  14. from pandas.io.common import BaseIterator, get_filepath_or_buffer
  15. _correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
  16. "000000000000000000000000000000 ")
  17. _correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
  18. "000000000000000001600000000")
  19. _correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
  20. "000000000000000000000000000000 ")
  21. _correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
  22. "000000000000000000000000000000 ")
  23. _fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
  24. 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
  25. 'nifl', 'nifd', 'npos', '_']
  26. _base_params_doc = """\
  27. Parameters
  28. ----------
  29. filepath_or_buffer : string or file-like object
  30. Path to SAS file or object implementing binary read method."""
  31. _params2_doc = """\
  32. index : identifier of index column
  33. Identifier of column that should be used as index of the DataFrame.
  34. encoding : string
  35. Encoding for text data.
  36. chunksize : int
  37. Read file `chunksize` lines at a time, returns iterator."""
  38. _format_params_doc = """\
  39. format : string
  40. File format, only `xport` is currently supported."""
  41. _iterator_doc = """\
  42. iterator : boolean, default False
  43. Return XportReader object for reading file incrementally."""
  44. _read_sas_doc = """Read a SAS file into a DataFrame.
  45. %(_base_params_doc)s
  46. %(_format_params_doc)s
  47. %(_params2_doc)s
  48. %(_iterator_doc)s
  49. Returns
  50. -------
  51. DataFrame or XportReader
  52. Examples
  53. --------
  54. Read a SAS Xport file:
  55. >>> df = pd.read_sas('filename.XPT')
  56. Read a Xport file in 10,000 line chunks:
  57. >>> itr = pd.read_sas('filename.XPT', chunksize=10000)
  58. >>> for chunk in itr:
  59. >>> do_something(chunk)
  60. """ % {"_base_params_doc": _base_params_doc,
  61. "_format_params_doc": _format_params_doc,
  62. "_params2_doc": _params2_doc,
  63. "_iterator_doc": _iterator_doc}
  64. _xport_reader_doc = """\
  65. Class for reading SAS Xport files.
  66. %(_base_params_doc)s
  67. %(_params2_doc)s
  68. Attributes
  69. ----------
  70. member_info : list
  71. Contains information about the file
  72. fields : list
  73. Contains information about the variables in the file
  74. """ % {"_base_params_doc": _base_params_doc,
  75. "_params2_doc": _params2_doc}
  76. _read_method_doc = """\
  77. Read observations from SAS Xport file, returning as data frame.
  78. Parameters
  79. ----------
  80. nrows : int
  81. Number of rows to read from data file; if None, read whole
  82. file.
  83. Returns
  84. -------
  85. A DataFrame.
  86. """
  87. def _parse_date(datestr):
  88. """ Given a date in xport format, return Python date. """
  89. try:
  90. # e.g. "16FEB11:10:07:55"
  91. return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
  92. except ValueError:
  93. return pd.NaT
  94. def _split_line(s, parts):
  95. """
  96. Parameters
  97. ----------
  98. s: string
  99. Fixed-length string to split
  100. parts: list of (name, length) pairs
  101. Used to break up string, name '_' will be filtered from output.
  102. Returns
  103. -------
  104. Dict of name:contents of string at given location.
  105. """
  106. out = {}
  107. start = 0
  108. for name, length in parts:
  109. out[name] = s[start:start + length].strip()
  110. start += length
  111. del out['_']
  112. return out
  113. def _handle_truncated_float_vec(vec, nbytes):
  114. # This feature is not well documented, but some SAS XPORT files
  115. # have 2-7 byte "truncated" floats. To read these truncated
  116. # floats, pad them with zeros on the right to make 8 byte floats.
  117. #
  118. # References:
  119. # https://github.com/jcushman/xport/pull/3
  120. # The R "foreign" library
  121. if nbytes != 8:
  122. vec1 = np.zeros(len(vec), np.dtype('S8'))
  123. dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
  124. vec2 = vec1.view(dtype=dtype)
  125. vec2['f0'] = vec
  126. return vec2
  127. return vec
  128. def _parse_float_vec(vec):
  129. """
  130. Parse a vector of float values representing IBM 8 byte floats into
  131. native 8 byte floats.
  132. """
  133. dtype = np.dtype('>u4,>u4')
  134. vec1 = vec.view(dtype=dtype)
  135. xport1 = vec1['f0']
  136. xport2 = vec1['f1']
  137. # Start by setting first half of ieee number to first half of IBM
  138. # number sans exponent
  139. ieee1 = xport1 & 0x00ffffff
  140. # The fraction bit to the left of the binary point in the ieee
  141. # format was set and the number was shifted 0, 1, 2, or 3
  142. # places. This will tell us how to adjust the ibm exponent to be a
  143. # power of 2 ieee exponent and how to shift the fraction bits to
  144. # restore the correct magnitude.
  145. shift = np.zeros(len(vec), dtype=np.uint8)
  146. shift[np.where(xport1 & 0x00200000)] = 1
  147. shift[np.where(xport1 & 0x00400000)] = 2
  148. shift[np.where(xport1 & 0x00800000)] = 3
  149. # shift the ieee number down the correct number of places then
  150. # set the second half of the ieee number to be the second half
  151. # of the ibm number shifted appropriately, ored with the bits
  152. # from the first half that would have been shifted in if we
  153. # could shift a double. All we are worried about are the low
  154. # order 3 bits of the first half since we're only shifting by
  155. # 1, 2, or 3.
  156. ieee1 >>= shift
  157. ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
  158. # clear the 1 bit to the left of the binary point
  159. ieee1 &= 0xffefffff
  160. # set the exponent of the ieee number to be the actual exponent
  161. # plus the shift count + 1023. Or this into the first half of the
  162. # ieee number. The ibm exponent is excess 64 but is adjusted by 65
  163. # since during conversion to ibm format the exponent is
  164. # incremented by 1 and the fraction bits left 4 positions to the
  165. # right of the radix point. (had to add >> 24 because C treats &
  166. # 0x7f as 0x7f000000 and Python doesn't)
  167. ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
  168. shift + 1023) << 20) | (xport1 & 0x80000000)
  169. ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
  170. ieee['f0'] = ieee1
  171. ieee['f1'] = ieee2
  172. ieee = ieee.view(dtype='>f8')
  173. ieee = ieee.astype('f8')
  174. return ieee
  175. class XportReader(BaseIterator):
  176. __doc__ = _xport_reader_doc
  177. def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
  178. chunksize=None):
  179. self._encoding = encoding
  180. self._lines_read = 0
  181. self._index = index
  182. self._chunksize = chunksize
  183. if isinstance(filepath_or_buffer, str):
  184. (filepath_or_buffer, encoding,
  185. compression, should_close) = get_filepath_or_buffer(
  186. filepath_or_buffer, encoding=encoding)
  187. if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
  188. self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
  189. else:
  190. # Copy to BytesIO, and ensure no encoding
  191. contents = filepath_or_buffer.read()
  192. try:
  193. contents = contents.encode(self._encoding)
  194. except UnicodeEncodeError:
  195. pass
  196. self.filepath_or_buffer = compat.BytesIO(contents)
  197. self._read_header()
  198. def close(self):
  199. self.filepath_or_buffer.close()
  200. def _get_row(self):
  201. return self.filepath_or_buffer.read(80).decode()
  202. def _read_header(self):
  203. self.filepath_or_buffer.seek(0)
  204. # read file header
  205. line1 = self._get_row()
  206. if line1 != _correct_line1:
  207. self.close()
  208. raise ValueError("Header record is not an XPORT file.")
  209. line2 = self._get_row()
  210. fif = [['prefix', 24], ['version', 8], ['OS', 8],
  211. ['_', 24], ['created', 16]]
  212. file_info = _split_line(line2, fif)
  213. if file_info['prefix'] != "SAS SAS SASLIB":
  214. self.close()
  215. raise ValueError("Header record has invalid prefix.")
  216. file_info['created'] = _parse_date(file_info['created'])
  217. self.file_info = file_info
  218. line3 = self._get_row()
  219. file_info['modified'] = _parse_date(line3[:16])
  220. # read member header
  221. header1 = self._get_row()
  222. header2 = self._get_row()
  223. headflag1 = header1.startswith(_correct_header1)
  224. headflag2 = (header2 == _correct_header2)
  225. if not (headflag1 and headflag2):
  226. self.close()
  227. raise ValueError("Member header not found")
  228. # usually 140, could be 135
  229. fieldnamelength = int(header1[-5:-2])
  230. # member info
  231. mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
  232. ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
  233. member_info = _split_line(self._get_row(), mem)
  234. mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
  235. member_info.update(_split_line(self._get_row(), mem))
  236. member_info['modified'] = _parse_date(member_info['modified'])
  237. member_info['created'] = _parse_date(member_info['created'])
  238. self.member_info = member_info
  239. # read field names
  240. types = {1: 'numeric', 2: 'char'}
  241. fieldcount = int(self._get_row()[54:58])
  242. datalength = fieldnamelength * fieldcount
  243. # round up to nearest 80
  244. if datalength % 80:
  245. datalength += 80 - datalength % 80
  246. fielddata = self.filepath_or_buffer.read(datalength)
  247. fields = []
  248. obs_length = 0
  249. while len(fielddata) >= fieldnamelength:
  250. # pull data for one field
  251. field, fielddata = (fielddata[:fieldnamelength],
  252. fielddata[fieldnamelength:])
  253. # rest at end gets ignored, so if field is short, pad out
  254. # to match struct pattern below
  255. field = field.ljust(140)
  256. fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
  257. field = dict(zip(_fieldkeys, fieldstruct))
  258. del field['_']
  259. field['ntype'] = types[field['ntype']]
  260. fl = field['field_length']
  261. if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
  262. self.close()
  263. msg = "Floating field width {0} is not between 2 and 8."
  264. raise TypeError(msg.format(fl))
  265. for k, v in field.items():
  266. try:
  267. field[k] = v.strip()
  268. except AttributeError:
  269. pass
  270. obs_length += field['field_length']
  271. fields += [field]
  272. header = self._get_row()
  273. if not header == _correct_obs_header:
  274. self.close()
  275. raise ValueError("Observation header not found.")
  276. self.fields = fields
  277. self.record_length = obs_length
  278. self.record_start = self.filepath_or_buffer.tell()
  279. self.nobs = self._record_count()
  280. self.columns = [x['name'].decode() for x in self.fields]
  281. # Setup the dtype.
  282. dtypel = [('s' + str(i), "S" + str(field['field_length']))
  283. for i, field in enumerate(self.fields)]
  284. dtype = np.dtype(dtypel)
  285. self._dtype = dtype
  286. def __next__(self):
  287. return self.read(nrows=self._chunksize or 1)
  288. def _record_count(self):
  289. """
  290. Get number of records in file.
  291. This is maybe suboptimal because we have to seek to the end of
  292. the file.
  293. Side effect: returns file position to record_start.
  294. """
  295. self.filepath_or_buffer.seek(0, 2)
  296. total_records_length = (self.filepath_or_buffer.tell() -
  297. self.record_start)
  298. if total_records_length % 80 != 0:
  299. warnings.warn("xport file may be corrupted")
  300. if self.record_length > 80:
  301. self.filepath_or_buffer.seek(self.record_start)
  302. return total_records_length // self.record_length
  303. self.filepath_or_buffer.seek(-80, 2)
  304. last_card = self.filepath_or_buffer.read(80)
  305. last_card = np.frombuffer(last_card, dtype=np.uint64)
  306. # 8 byte blank
  307. ix = np.flatnonzero(last_card == 2314885530818453536)
  308. if len(ix) == 0:
  309. tail_pad = 0
  310. else:
  311. tail_pad = 8 * len(ix)
  312. self.filepath_or_buffer.seek(self.record_start)
  313. return (total_records_length - tail_pad) // self.record_length
  314. def get_chunk(self, size=None):
  315. """
  316. Reads lines from Xport file and returns as dataframe
  317. Parameters
  318. ----------
  319. size : int, defaults to None
  320. Number of lines to read. If None, reads whole file.
  321. Returns
  322. -------
  323. DataFrame
  324. """
  325. if size is None:
  326. size = self._chunksize
  327. return self.read(nrows=size)
  328. def _missing_double(self, vec):
  329. v = vec.view(dtype='u1,u1,u2,u4')
  330. miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
  331. miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
  332. (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
  333. miss &= miss1
  334. return miss
  335. @Appender(_read_method_doc)
  336. def read(self, nrows=None):
  337. if nrows is None:
  338. nrows = self.nobs
  339. read_lines = min(nrows, self.nobs - self._lines_read)
  340. read_len = read_lines * self.record_length
  341. if read_len <= 0:
  342. self.close()
  343. raise StopIteration
  344. raw = self.filepath_or_buffer.read(read_len)
  345. data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
  346. df = pd.DataFrame(index=range(read_lines))
  347. for j, x in enumerate(self.columns):
  348. vec = data['s%d' % j]
  349. ntype = self.fields[j]['ntype']
  350. if ntype == "numeric":
  351. vec = _handle_truncated_float_vec(
  352. vec, self.fields[j]['field_length'])
  353. miss = self._missing_double(vec)
  354. v = _parse_float_vec(vec)
  355. v[miss] = np.nan
  356. elif self.fields[j]['ntype'] == 'char':
  357. v = [y.rstrip() for y in vec]
  358. if compat.PY3:
  359. if self._encoding is not None:
  360. v = [y.decode(self._encoding) for y in v]
  361. df[x] = v
  362. if self._index is None:
  363. df.index = range(self._lines_read, self._lines_read + read_lines)
  364. else:
  365. df = df.set_index(self._index)
  366. self._lines_read += read_lines
  367. return df