  1. """
  2. Read SAS7BDAT files
  3. Based on code written by Jared Hobbs:
  4. https://bitbucket.org/jaredhobbs/sas7bdat
  5. See also:
  6. https://github.com/BioStatMatt/sas7bdat
  7. Partial documentation of the file format:
  8. https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
  9. Reference for binary data compression:
  10. http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
  11. """
from datetime import datetime
import struct

import numpy as np

from pandas.errors import EmptyDataError

import pandas as pd
from pandas import compat

from pandas.io.common import BaseIterator, get_filepath_or_buffer
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const
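
# A SAS7BDAT file consists of a header followed by a sequence of
# equal-sized pages; each page holds subheaders (metadata) and/or packed
# rows of data.  See the format notes linked in the module docstring.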


class _subheader_pointer(object):
    pass


class _column(object):
    pass
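
# The two classes above are simple attribute containers; the parsing
# methods below fill in their fields (offset, length, compression and
# ptype for pointers; col_id, name, label, format, ctype and length for
# columns).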


# SAS7BDATReader represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
  27. """
  28. Read SAS files in SAS7BDAT format.
  29. Parameters
  30. ----------
  31. path_or_buf : path name or buffer
  32. Name of SAS file or file-like object pointing to SAS file
  33. contents.
  34. index : column identifier, defaults to None
  35. Column to use as index.
  36. convert_dates : boolean, defaults to True
  37. Attempt to convert dates to Pandas datetime values. Note that
  38. some rarely used SAS date formats may be unsupported.
  39. blank_missing : boolean, defaults to True
  40. Convert empty strings to missing values (SAS uses blanks to
  41. indicate missing character variables).
  42. chunksize : int, defaults to None
  43. Return SAS7BDATReader object for iterations, returns chunks
  44. with given number of lines.
  45. encoding : string, defaults to None
  46. String encoding.
  47. convert_text : bool, defaults to True
  48. If False, text variables are left as raw bytes.
  49. convert_header_text : bool, defaults to True
  50. If False, header text, including column names, are left as raw
  51. bytes.
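
    Examples
    --------
    A minimal usage sketch (the file name is hypothetical)::

        reader = SAS7BDATReader("example.sas7bdat", chunksize=1000)
        chunk = reader.read()   # DataFrame with up to ``chunksize`` rows
        reader.close()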
  52. """

    def __init__(self, path_or_buf, index=None, convert_dates=True,
                 blank_missing=True, chunksize=None, encoding=None,
                 convert_text=True, convert_header_text=True):

        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = b""
        self.column_names_strings = []
        self.column_names = []
        self.column_formats = []
        self.columns = []

        self._current_page_data_subheader_pointers = []
        self._cached_page = None
        self._column_data_lengths = []
        self._column_data_offsets = []
        self._column_types = []

        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0

        self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
        if isinstance(self._path_or_buf, compat.string_types):
            self._path_or_buf = open(self._path_or_buf, 'rb')
            self.handle = self._path_or_buf

        self._get_properties()
        self._parse_metadata()
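
    # The three accessors below expose column metadata as numpy arrays;
    # they are consumed by the Cython ``Parser`` when decoding row data.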
    def column_data_lengths(self):
        """Return a numpy int64 array of the column data lengths"""
        return np.asarray(self._column_data_lengths, dtype=np.int64)

    def column_data_offsets(self):
        """Return a numpy int64 array of the column offsets"""
        return np.asarray(self._column_data_offsets, dtype=np.int64)

    def column_types(self):
        """Returns a numpy character array of the column types:
           s (string) or d (double)"""
        return np.asarray(self._column_types, dtype=np.dtype('S1'))

    def close(self):
        try:
            self.handle.close()
        except AttributeError:
            pass

    def _get_properties(self):

        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = self._path_or_buf.read(288)
        if self._cached_page[0:len(const.magic)] != const.magic:
            self.close()
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
        align1, align2 = 0, 0
        buf = self._read_bytes(const.align_1_offset, const.align_1_length)
        if buf == const.u64_byte_checker_value:
            align2 = const.align_2_value
            self.U64 = True
            self._int_length = 8
            self._page_bit_offset = const.page_bit_offset_x64
            self._subheader_pointer_length = const.subheader_pointer_length_x64
        else:
            self.U64 = False
            self._page_bit_offset = const.page_bit_offset_x86
            self._subheader_pointer_length = const.subheader_pointer_length_x86
            self._int_length = 4
        buf = self._read_bytes(const.align_2_offset, const.align_2_length)
        if buf == const.align_1_checker_value:
            align1 = const.align_2_value
        total_align = align1 + align2

        # Get endianness information
        buf = self._read_bytes(const.endianness_offset,
                               const.endianness_length)
        if buf == b'\x01':
            self.byte_order = "<"
        else:
            self.byte_order = ">"

        # Get encoding information
        buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
        if buf in const.encoding_names:
            self.file_encoding = const.encoding_names[buf]
        else:
            self.file_encoding = "unknown (code={name!s})".format(name=buf)

        # Get platform information
        buf = self._read_bytes(const.platform_offset, const.platform_length)
        if buf == b'1':
            self.platform = "unix"
        elif buf == b'2':
            self.platform = "windows"
        else:
            self.platform = "unknown"

        buf = self._read_bytes(const.dataset_offset, const.dataset_length)
        self.name = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.name = self.name.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.file_type_offset, const.file_type_length)
        self.file_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.file_type = self.file_type.decode(
                self.encoding or self.default_encoding)

        # Timestamp is epoch 01/01/1960
        epoch = datetime(1960, 1, 1)
        x = self._read_float(const.date_created_offset + align1,
                             const.date_created_length)
        self.date_created = epoch + pd.to_timedelta(x, unit='s')
        x = self._read_float(const.date_modified_offset + align1,
                             const.date_modified_length)
        self.date_modified = epoch + pd.to_timedelta(x, unit='s')

        self.header_length = self._read_int(const.header_size_offset + align1,
                                            const.header_size_length)

        # Read the rest of the header into cached_page.
        buf = self._path_or_buf.read(self.header_length - 288)
        self._cached_page += buf
        if len(self._cached_page) != self.header_length:
            self.close()
            raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_int(const.page_size_offset + align1,
                                           const.page_size_length)
        self._page_count = self._read_int(const.page_count_offset + align1,
                                          const.page_count_length)

        buf = self._read_bytes(const.sas_release_offset + total_align,
                               const.sas_release_length)
        self.sas_release = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.sas_release = self.sas_release.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.sas_server_type_offset + total_align,
                               const.sas_server_type_length)
        self.server_type = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.server_type = self.server_type.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_version_number_offset + total_align,
                               const.os_version_number_length)
        self.os_version = buf.rstrip(b'\x00 ')
        if self.convert_header_text:
            self.os_version = self.os_version.decode(
                self.encoding or self.default_encoding)

        buf = self._read_bytes(const.os_name_offset + total_align,
                               const.os_name_length)
        buf = buf.rstrip(b'\x00 ')
        if len(buf) > 0:
            self.os_name = buf.decode(self.encoding or self.default_encoding)
        else:
            buf = self._read_bytes(const.os_maker_offset + total_align,
                                   const.os_maker_length)
            self.os_name = buf.rstrip(b'\x00 ')
            if self.convert_header_text:
                self.os_name = self.os_name.decode(
                    self.encoding or self.default_encoding)

    def __next__(self):
        da = self.read(nrows=self.chunksize or 1)
        if da is None:
            raise StopIteration
        return da

    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset, width):
        if width not in (4, 8):
            self.close()
            raise ValueError("invalid float width")
        buf = self._read_bytes(offset, width)
        fd = "f" if width == 4 else "d"
        return struct.unpack(self.byte_order + fd, buf)[0]

    # Read a single signed integer of the given width (1, 2, 4 or 8).
    def _read_int(self, offset, width):
        if width not in (1, 2, 4, 8):
            self.close()
            raise ValueError("invalid int width")
        buf = self._read_bytes(offset, width)
        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
        iv = struct.unpack(self.byte_order + it, buf)[0]
        return iv
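
    # Both helpers above compose a struct format string from the
    # detected byte order and the field width, e.g. "<d" for a
    # little-endian double or ">h" for a big-endian 16-bit integer.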

    def _read_bytes(self, offset, length):
        if self._cached_page is None:
            self._path_or_buf.seek(offset)
            buf = self._path_or_buf.read(length)
            if len(buf) < length:
                self.close()
                msg = "Unable to read {:d} bytes from file position {:d}."
                raise ValueError(msg.format(length, offset))
            return buf
        else:
            if offset + length > len(self._cached_page):
                self.close()
                raise ValueError("The cached page is too small.")
            return self._cached_page[offset:offset + length]

    def _parse_metadata(self):
        done = False
        while not done:
            self._cached_page = self._path_or_buf.read(self._page_length)
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
                self.close()
                raise ValueError(
                    "Failed to read a metadata page from the SAS file.")
            done = self._process_page_meta()

    def _process_page_meta(self):
        self._read_page_header()
        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
        if self._current_page_type in pt:
            self._process_page_metadata()
        is_data_page = self._current_page_type & const.page_data_type
        is_mix_page = self._current_page_type in const.page_mix_types
        return (is_data_page or is_mix_page
                or self._current_page_data_subheader_pointers != [])
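
    # page_data_type acts as a bit flag, so the bitwise test above also
    # catches data pages whose type value has extra bits set.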

    def _read_page_header(self):
        bit_offset = self._page_bit_offset
        tx = const.page_type_offset + bit_offset
        self._current_page_type = self._read_int(tx, const.page_type_length)
        tx = const.block_count_offset + bit_offset
        self._current_page_block_count = self._read_int(
            tx, const.block_count_length)
        tx = const.subheader_count_offset + bit_offset
        self._current_page_subheaders_count = (
            self._read_int(tx, const.subheader_count_length))

    def _process_page_metadata(self):
        bit_offset = self._page_bit_offset

        for i in range(self._current_page_subheaders_count):
            pointer = self._process_subheader_pointers(
                const.subheader_pointers_offset + bit_offset, i)
            if pointer.length == 0:
                continue
            if pointer.compression == const.truncated_subheader_id:
                continue
            subheader_signature = self._read_subheader_signature(
                pointer.offset)
            subheader_index = (
                self._get_subheader_index(subheader_signature,
                                          pointer.compression, pointer.ptype))
            self._process_subheader(subheader_index, pointer)

    def _get_subheader_index(self, signature, compression, ptype):
        index = const.subheader_signature_to_index.get(signature)
        if index is None:
            f1 = ((compression == const.compressed_subheader_id) or
                  (compression == 0))
            f2 = (ptype == const.compressed_subheader_type)
            if (self.compression != b"") and f1 and f2:
                index = const.SASIndex.data_subheader_index
            else:
                self.close()
                raise ValueError("Unknown subheader signature")
        return index
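
    # Each subheader pointer packs two integer fields (the offset and
    # length of the subheader within the page) followed by two one-byte
    # fields (a compression flag and a pointer type).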
    def _process_subheader_pointers(self, offset, subheader_pointer_index):

        subheader_pointer_length = self._subheader_pointer_length
        total_offset = (offset +
                        subheader_pointer_length * subheader_pointer_index)

        subheader_offset = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_length = self._read_int(total_offset, self._int_length)
        total_offset += self._int_length

        subheader_compression = self._read_int(total_offset, 1)
        total_offset += 1

        subheader_type = self._read_int(total_offset, 1)

        x = _subheader_pointer()
        x.offset = subheader_offset
        x.length = subheader_length
        x.compression = subheader_compression
        x.ptype = subheader_type

        return x

    def _read_subheader_signature(self, offset):
        subheader_signature = self._read_bytes(offset, self._int_length)
        return subheader_signature

    def _process_subheader(self, subheader_index, pointer):
        offset = pointer.offset
        length = pointer.length

        if subheader_index == const.SASIndex.row_size_index:
            processor = self._process_rowsize_subheader
        elif subheader_index == const.SASIndex.column_size_index:
            processor = self._process_columnsize_subheader
        elif subheader_index == const.SASIndex.column_text_index:
            processor = self._process_columntext_subheader
        elif subheader_index == const.SASIndex.column_name_index:
            processor = self._process_columnname_subheader
        elif subheader_index == const.SASIndex.column_attributes_index:
            processor = self._process_columnattributes_subheader
        elif subheader_index == const.SASIndex.format_and_label_index:
            processor = self._process_format_subheader
        elif subheader_index == const.SASIndex.column_list_index:
            processor = self._process_columnlist_subheader
        elif subheader_index == const.SASIndex.subheader_counts_index:
            processor = self._process_subheader_counts
        elif subheader_index == const.SASIndex.data_subheader_index:
            self._current_page_data_subheader_pointers.append(pointer)
            return
        else:
            raise ValueError("unknown subheader index")

        processor(offset, length)
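
    # Per the sas7bdat format notes, _lcs and _lcp below are the lengths
    # of the "creator software" and "creator proc" header strings; their
    # offsets differ between 32-bit and 64-bit (U64) files.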
    def _process_rowsize_subheader(self, offset, length):

        int_len = self._int_length
        lcs_offset = offset
        lcp_offset = offset
        if self.U64:
            lcs_offset += 682
            lcp_offset += 706
        else:
            lcs_offset += 354
            lcp_offset += 378

        self.row_length = self._read_int(
            offset + const.row_length_offset_multiplier * int_len, int_len)
        self.row_count = self._read_int(
            offset + const.row_count_offset_multiplier * int_len, int_len)
        self.col_count_p1 = self._read_int(
            offset + const.col_count_p1_multiplier * int_len, int_len)
        self.col_count_p2 = self._read_int(
            offset + const.col_count_p2_multiplier * int_len, int_len)
        mx = const.row_count_on_mix_page_offset_multiplier * int_len
        self._mix_page_row_count = self._read_int(offset + mx, int_len)
        self._lcs = self._read_int(lcs_offset, 2)
        self._lcp = self._read_int(lcp_offset, 2)

    def _process_columnsize_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        self.column_count = self._read_int(offset, int_len)
        if (self.col_count_p1 + self.col_count_p2 !=
                self.column_count):
            print(
                "Warning: column count mismatch ({p1} + {p2} != "
                "{column_count})\n".format(
                    p1=self.col_count_p1, p2=self.col_count_p2,
                    column_count=self.column_count))

    # Unknown purpose
    def _process_subheader_counts(self, offset, length):
        pass
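
    # The first column-text subheader also embeds the compression
    # literal (e.g. b"SASYZCRL" for RLE compression) and the name of the
    # creator proc, which the branches below extract.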
    def _process_columntext_subheader(self, offset, length):

        offset += self._int_length
        text_block_size = self._read_int(offset, const.text_block_size_length)

        buf = self._read_bytes(offset, text_block_size)
        cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
        cname = cname_raw
        if self.convert_header_text:
            cname = cname.decode(self.encoding or self.default_encoding)
        self.column_names_strings.append(cname)

        if len(self.column_names_strings) == 1:
            compression_literal = b""
            for cl in const.compression_literals:
                if cl in cname_raw:
                    compression_literal = cl
            self.compression = compression_literal
            offset -= self._int_length

            offset1 = offset + 16
            if self.U64:
                offset1 += 4

            buf = self._read_bytes(offset1, self._lcp)
            compression_literal = buf.rstrip(b"\x00")
            if compression_literal == b"":
                self._lcs = 0
                offset1 = offset + 32
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif compression_literal == const.rle_compression:
                offset1 = offset + 40
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0:self._lcp]
            elif self._lcs > 0:
                self._lcp = 0
                offset1 = offset + 16
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcs)
                self.creator_proc = buf[0:self._lcp]

            if self.convert_header_text:
                if hasattr(self, "creator_proc"):
                    self.creator_proc = self.creator_proc.decode(
                        self.encoding or self.default_encoding)

    def _process_columnname_subheader(self, offset, length):
        int_len = self._int_length
        offset += int_len
        column_name_pointers_count = (length - 2 * int_len - 12) // 8
        for i in range(column_name_pointers_count):
            text_subheader = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_text_subheader_offset
            col_name_offset = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_offset_offset
            col_name_length = offset + const.column_name_pointer_length * \
                (i + 1) + const.column_name_length_offset

            idx = self._read_int(
                text_subheader, const.column_name_text_subheader_length)
            col_offset = self._read_int(
                col_name_offset, const.column_name_offset_length)
            col_len = self._read_int(
                col_name_length, const.column_name_length_length)

            name_str = self.column_names_strings[idx]
            self.column_names.append(name_str[col_offset:col_offset + col_len])
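
    # Column attribute vectors give each column's data offset within a
    # row, its byte length, and its type code (1 marks a numeric double;
    # anything else is treated as a string).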
    def _process_columnattributes_subheader(self, offset, length):
        int_len = self._int_length
        column_attributes_vectors_count = (
            length - 2 * int_len - 12) // (int_len + 8)
        for i in range(column_attributes_vectors_count):
            col_data_offset = (offset + int_len +
                               const.column_data_offset_offset +
                               i * (int_len + 8))
            col_data_len = (offset + 2 * int_len +
                            const.column_data_length_offset +
                            i * (int_len + 8))
            col_types = (offset + 2 * int_len +
                         const.column_type_offset + i * (int_len + 8))

            x = self._read_int(col_data_offset, int_len)
            self._column_data_offsets.append(x)

            x = self._read_int(col_data_len, const.column_data_length_length)
            self._column_data_lengths.append(x)

            x = self._read_int(col_types, const.column_type_length)
            self._column_types.append(b'd' if x == 1 else b's')

    def _process_columnlist_subheader(self, offset, length):
        # unknown purpose
        pass

    def _process_format_subheader(self, offset, length):
        int_len = self._int_length
        text_subheader_format = (
            offset +
            const.column_format_text_subheader_index_offset +
            3 * int_len)
        col_format_offset = (offset +
                             const.column_format_offset_offset +
                             3 * int_len)
        col_format_len = (offset +
                          const.column_format_length_offset +
                          3 * int_len)
        text_subheader_label = (
            offset +
            const.column_label_text_subheader_index_offset +
            3 * int_len)
        col_label_offset = (offset +
                            const.column_label_offset_offset +
                            3 * int_len)
        col_label_len = offset + const.column_label_length_offset + 3 * int_len

        x = self._read_int(text_subheader_format,
                           const.column_format_text_subheader_index_length)
        format_idx = min(x, len(self.column_names_strings) - 1)

        format_start = self._read_int(
            col_format_offset, const.column_format_offset_length)
        format_len = self._read_int(
            col_format_len, const.column_format_length_length)

        label_idx = self._read_int(
            text_subheader_label,
            const.column_label_text_subheader_index_length)
        label_idx = min(label_idx, len(self.column_names_strings) - 1)

        label_start = self._read_int(
            col_label_offset, const.column_label_offset_length)
        label_len = self._read_int(col_label_len,
                                   const.column_label_length_length)

        label_names = self.column_names_strings[label_idx]
        column_label = label_names[label_start: label_start + label_len]
        format_names = self.column_names_strings[format_idx]
        column_format = format_names[format_start: format_start + format_len]
        current_column_number = len(self.columns)

        col = _column()
        col.col_id = current_column_number
        col.name = self.column_names[current_column_number]
        col.label = column_label
        col.format = column_format
        col.ctype = self._column_types[current_column_number]
        col.length = self._column_data_lengths[current_column_number]

        self.column_formats.append(column_format)
        self.columns.append(col)

    def read(self, nrows=None):

        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        if len(self._column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        if self._current_row_in_file_index >= self.row_count:
            return None

        m = self.row_count - self._current_row_in_file_index
        if nrows > m:
            nrows = m

        nd = self._column_types.count(b'd')
        ns = self._column_types.count(b's')

        self._string_chunk = np.empty((ns, nrows), dtype=object)
        self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt

    def _read_next_page(self):
        self._current_page_data_subheader_pointers = []
        self._cached_page = self._path_or_buf.read(self._page_length)
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
            self.close()
            msg = ("failed to read complete page from file "
                   "(read {:d} of {:d} bytes)")
            raise ValueError(msg.format(len(self._cached_page),
                                        self._page_length))

        self._read_page_header()
        page_type = self._current_page_type
        if page_type == const.page_meta_type:
            self._process_page_metadata()

        is_data_page = page_type & const.page_data_type
        pt = [const.page_meta_type] + const.page_mix_types
        if not is_data_page and self._current_page_type not in pt:
            return self._read_next_page()

        return False
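
    # Assemble the raw byte and string buffers filled in by the Cython
    # parser into a DataFrame, decoding text and converting SAS
    # date/datetime columns (stored relative to the 1960-01-01 epoch)
    # when requested.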
    def _chunk_to_dataframe(self):

        n = self._current_row_in_chunk_index
        m = self._current_row_in_file_index
        ix = range(m - n, m)
        rslt = pd.DataFrame(index=ix)

        js, jb = 0, 0
        for j in range(self.column_count):

            name = self.column_names[j]

            if self._column_types[j] == b'd':
                rslt[name] = self._byte_chunk[jb, :].view(
                    dtype=self.byte_order + 'd')
                rslt[name] = np.asarray(rslt[name], dtype=np.float64)
                if self.convert_dates:
                    unit = None
                    if self.column_formats[j] in const.sas_date_formats:
                        unit = 'd'
                    elif self.column_formats[j] in const.sas_datetime_formats:
                        unit = 's'
                    if unit:
                        rslt[name] = pd.to_datetime(rslt[name], unit=unit,
                                                    origin="1960-01-01")
                jb += 1
            elif self._column_types[j] == b's':
                rslt[name] = self._string_chunk[js, :]
                if self.convert_text and (self.encoding is not None):
                    rslt[name] = rslt[name].str.decode(
                        self.encoding or self.default_encoding)
                if self.blank_missing:
                    ii = rslt[name].str.len() == 0
                    rslt.loc[ii, name] = np.nan
                js += 1
            else:
                self.close()
                raise ValueError("unknown column type {type}".format(
                    type=self._column_types[j]))

        return rslt
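

if __name__ == "__main__":
    # A minimal usage sketch.  The file name below is hypothetical;
    # substitute any SAS7BDAT data set.
    reader = SAS7BDATReader("example.sas7bdat", chunksize=10000)
    try:
        for chunk in reader:
            # Each chunk is a DataFrame of up to `chunksize` rows.
            print(chunk.shape)
    finally:
        reader.close()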