miobase.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. # Authors: Travis Oliphant, Matthew Brett
  2. """
  3. Base classes for MATLAB file stream reading.
  4. MATLAB is a registered trademark of the Mathworks inc.
  5. """
  6. from __future__ import division, print_function, absolute_import
  7. import sys
  8. import operator
  9. from scipy._lib.six import reduce
  10. import numpy as np
# ``byteord`` turns one indexed element of a byte string into an integer.
# Indexing ``bytes`` on Python 3 already gives an int, so ``int`` is enough;
# on Python 2 indexing gives a one-character ``str`` that needs ``ord``.
if sys.version_info[0] >= 3:
    byteord = int
else:
    byteord = ord
  15. from scipy.misc import doccer
  16. from . import byteordercodes as boc
  17. class MatReadError(Exception):
  18. pass
  19. class MatWriteError(Exception):
  20. pass
  21. class MatReadWarning(UserWarning):
  22. pass
  23. doc_dict = \
  24. {'file_arg':
  25. '''file_name : str
  26. Name of the mat file (do not need .mat extension if
  27. appendmat==True) Can also pass open file-like object.''',
  28. 'append_arg':
  29. '''appendmat : bool, optional
  30. True to append the .mat extension to the end of the given
  31. filename, if not already present.''',
  32. 'load_args':
  33. '''byte_order : str or None, optional
  34. None by default, implying byte order guessed from mat
  35. file. Otherwise can be one of ('native', '=', 'little', '<',
  36. 'BIG', '>').
  37. mat_dtype : bool, optional
  38. If True, return arrays in same dtype as would be loaded into
  39. MATLAB (instead of the dtype with which they are saved).
  40. squeeze_me : bool, optional
  41. Whether to squeeze unit matrix dimensions or not.
  42. chars_as_strings : bool, optional
  43. Whether to convert char arrays to string arrays.
  44. matlab_compatible : bool, optional
  45. Returns matrices as would be loaded by MATLAB (implies
  46. squeeze_me=False, chars_as_strings=False, mat_dtype=True,
  47. struct_as_record=True).''',
  48. 'struct_arg':
  49. '''struct_as_record : bool, optional
  50. Whether to load MATLAB structs as numpy record arrays, or as
  51. old-style numpy arrays with dtype=object. Setting this flag to
  52. False replicates the behavior of scipy version 0.7.x (returning
  53. numpy object arrays). The default setting is True, because it
  54. allows easier round-trip load and save of MATLAB files.''',
  55. 'matstream_arg':
  56. '''mat_stream : file-like
  57. Object with file API, open for reading.''',
  58. 'long_fields':
  59. '''long_field_names : bool, optional
  60. * False - maximum field name length in a structure is 31 characters
  61. which is the documented maximum length. This is the default.
  62. * True - maximum field name length in a structure is 63 characters
  63. which works for MATLAB 7.6''',
  64. 'do_compression':
  65. '''do_compression : bool, optional
  66. Whether to compress matrices on write. Default is False.''',
  67. 'oned_as':
  68. '''oned_as : {'row', 'column'}, optional
  69. If 'column', write 1-D numpy arrays as column vectors.
  70. If 'row', write 1D numpy arrays as row vectors.''',
  71. 'unicode_strings':
  72. '''unicode_strings : bool, optional
  73. If True, write strings as Unicode, else MATLAB usual encoding.'''}
# Decorator built from ``doc_dict``; applied to functions whose docstrings
# contain ``%(name)s`` placeholders (e.g. ``%(load_args)s``).
docfiller = doccer.filldoc(doc_dict)
  75. '''
  76. Note on architecture
  77. ======================
  78. There are three sets of parameters relevant for reading files. The
  79. first are *file read parameters* - containing options that are common
  80. for reading the whole file, and therefore every variable within that
  81. file. At the moment these are:
  82. * mat_stream
  83. * dtypes (derived from byte code)
  84. * byte_order
  85. * chars_as_strings
  86. * squeeze_me
  87. * struct_as_record (MATLAB 5 files)
  88. * class_dtypes (derived from order code, MATLAB 5 files)
  89. * codecs (MATLAB 5 files)
  90. * uint16_codec (MATLAB 5 files)
  91. Another set of parameters are those that apply only to the current
  92. variable being read - the *header*:
  93. * header related variables (different for v4 and v5 mat files)
  94. * is_complex
  95. * mclass
  96. * var_stream
  97. With the header, we need ``next_position`` to tell us where the next
  98. variable in the stream is.
  99. Then, for each element in a matrix, there can be *element read
  100. parameters*. An element is, for example, one element in a MATLAB cell
  101. array. At the moment these are:
  102. * mat_dtype
The file-reading object contains the *file read parameters*. The
*header* is passed around as a data object, or may be read and discarded
in a single function. The *element read parameters* - just ``mat_dtype``
in this instance - are passed into a general post-processing function;
see ``mio_utils`` for details.
  108. '''
  109. def convert_dtypes(dtype_template, order_code):
  110. ''' Convert dtypes in mapping to given order
  111. Parameters
  112. ----------
  113. dtype_template : mapping
  114. mapping with values returning numpy dtype from ``np.dtype(val)``
  115. order_code : str
  116. an order code suitable for using in ``dtype.newbyteorder()``
  117. Returns
  118. -------
  119. dtypes : mapping
  120. mapping where values have been replaced by
  121. ``np.dtype(val).newbyteorder(order_code)``
  122. '''
  123. dtypes = dtype_template.copy()
  124. for k in dtypes:
  125. dtypes[k] = np.dtype(dtypes[k]).newbyteorder(order_code)
  126. return dtypes
  127. def read_dtype(mat_stream, a_dtype):
  128. """
  129. Generic get of byte stream data of known type
  130. Parameters
  131. ----------
  132. mat_stream : file_like object
  133. MATLAB (tm) mat file stream
  134. a_dtype : dtype
  135. dtype of array to read. `a_dtype` is assumed to be correct
  136. endianness.
  137. Returns
  138. -------
  139. arr : ndarray
  140. Array of dtype `a_dtype` read from stream.
  141. """
  142. num_bytes = a_dtype.itemsize
  143. arr = np.ndarray(shape=(),
  144. dtype=a_dtype,
  145. buffer=mat_stream.read(num_bytes),
  146. order='F')
  147. return arr
  148. def get_matfile_version(fileobj):
  149. """
  150. Return major, minor tuple depending on apparent mat file type
  151. Where:
  152. #. 0,x -> version 4 format mat files
  153. #. 1,x -> version 5 format mat files
  154. #. 2,x -> version 7.3 format mat files (HDF format)
  155. Parameters
  156. ----------
  157. fileobj : file_like
  158. object implementing seek() and read()
  159. Returns
  160. -------
  161. major_version : {0, 1, 2}
  162. major MATLAB File format version
  163. minor_version : int
  164. minor MATLAB file format version
  165. Raises
  166. ------
  167. MatReadError
  168. If the file is empty.
  169. ValueError
  170. The matfile version is unknown.
  171. Notes
  172. -----
  173. Has the side effect of setting the file read pointer to 0
  174. """
  175. # Mat4 files have a zero somewhere in first 4 bytes
  176. fileobj.seek(0)
  177. mopt_bytes = fileobj.read(4)
  178. if len(mopt_bytes) == 0:
  179. raise MatReadError("Mat file appears to be empty")
  180. mopt_ints = np.ndarray(shape=(4,), dtype=np.uint8, buffer=mopt_bytes)
  181. if 0 in mopt_ints:
  182. fileobj.seek(0)
  183. return (0,0)
  184. # For 5 format or 7.3 format we need to read an integer in the
  185. # header. Bytes 124 through 128 contain a version integer and an
  186. # endian test string
  187. fileobj.seek(124)
  188. tst_str = fileobj.read(4)
  189. fileobj.seek(0)
  190. maj_ind = int(tst_str[2] == b'I'[0])
  191. maj_val = byteord(tst_str[maj_ind])
  192. min_val = byteord(tst_str[1-maj_ind])
  193. ret = (maj_val, min_val)
  194. if maj_val in (1, 2):
  195. return ret
  196. raise ValueError('Unknown mat file type, version %s, %s' % ret)
  197. def matdims(arr, oned_as='column'):
  198. """
  199. Determine equivalent MATLAB dimensions for given array
  200. Parameters
  201. ----------
  202. arr : ndarray
  203. Input array
  204. oned_as : {'column', 'row'}, optional
  205. Whether 1-D arrays are returned as MATLAB row or column matrices.
  206. Default is 'column'.
  207. Returns
  208. -------
  209. dims : tuple
  210. Shape tuple, in the form MATLAB expects it.
  211. Notes
  212. -----
  213. We had to decide what shape a 1 dimensional array would be by
  214. default. ``np.atleast_2d`` thinks it is a row vector. The
  215. default for a vector in MATLAB (e.g. ``>> 1:12``) is a row vector.
  216. Versions of scipy up to and including 0.11 resulted (accidentally)
  217. in 1-D arrays being read as column vectors. For the moment, we
  218. maintain the same tradition here.
  219. Examples
  220. --------
  221. >>> matdims(np.array(1)) # numpy scalar
  222. (1, 1)
  223. >>> matdims(np.array([1])) # 1d array, 1 element
  224. (1, 1)
  225. >>> matdims(np.array([1,2])) # 1d array, 2 elements
  226. (2, 1)
  227. >>> matdims(np.array([[2],[3]])) # 2d array, column vector
  228. (2, 1)
  229. >>> matdims(np.array([[2,3]])) # 2d array, row vector
  230. (1, 2)
  231. >>> matdims(np.array([[[2,3]]])) # 3d array, rowish vector
  232. (1, 1, 2)
  233. >>> matdims(np.array([])) # empty 1d array
  234. (0, 0)
  235. >>> matdims(np.array([[]])) # empty 2d
  236. (0, 0)
  237. >>> matdims(np.array([[[]]])) # empty 3d
  238. (0, 0, 0)
  239. Optional argument flips 1-D shape behavior.
  240. >>> matdims(np.array([1,2]), 'row') # 1d array, 2 elements
  241. (1, 2)
  242. The argument has to make sense though
  243. >>> matdims(np.array([1,2]), 'bizarre')
  244. Traceback (most recent call last):
  245. ...
  246. ValueError: 1D option "bizarre" is strange
  247. """
  248. shape = arr.shape
  249. if shape == (): # scalar
  250. return (1,1)
  251. if reduce(operator.mul, shape) == 0: # zero elememts
  252. return (0,) * np.max([arr.ndim, 2])
  253. if len(shape) == 1: # 1D
  254. if oned_as == 'column':
  255. return shape + (1,)
  256. elif oned_as == 'row':
  257. return (1,) + shape
  258. else:
  259. raise ValueError('1D option "%s" is strange'
  260. % oned_as)
  261. return shape
  262. class MatVarReader(object):
  263. ''' Abstract class defining required interface for var readers'''
  264. def __init__(self, file_reader):
  265. pass
  266. def read_header(self):
  267. ''' Returns header '''
  268. pass
  269. def array_from_header(self, header):
  270. ''' Reads array given header '''
  271. pass
  272. class MatFileReader(object):
  273. """ Base object for reading mat files
  274. To make this class functional, you will need to override the
  275. following methods:
  276. matrix_getter_factory - gives object to fetch next matrix from stream
  277. guess_byte_order - guesses file byte order from file
  278. """
  279. @docfiller
  280. def __init__(self, mat_stream,
  281. byte_order=None,
  282. mat_dtype=False,
  283. squeeze_me=False,
  284. chars_as_strings=True,
  285. matlab_compatible=False,
  286. struct_as_record=True,
  287. verify_compressed_data_integrity=True
  288. ):
  289. '''
  290. Initializer for mat file reader
  291. mat_stream : file-like
  292. object with file API, open for reading
  293. %(load_args)s
  294. '''
  295. # Initialize stream
  296. self.mat_stream = mat_stream
  297. self.dtypes = {}
  298. if not byte_order:
  299. byte_order = self.guess_byte_order()
  300. else:
  301. byte_order = boc.to_numpy_code(byte_order)
  302. self.byte_order = byte_order
  303. self.struct_as_record = struct_as_record
  304. if matlab_compatible:
  305. self.set_matlab_compatible()
  306. else:
  307. self.squeeze_me = squeeze_me
  308. self.chars_as_strings = chars_as_strings
  309. self.mat_dtype = mat_dtype
  310. self.verify_compressed_data_integrity = verify_compressed_data_integrity
  311. def set_matlab_compatible(self):
  312. ''' Sets options to return arrays as MATLAB loads them '''
  313. self.mat_dtype = True
  314. self.squeeze_me = False
  315. self.chars_as_strings = False
  316. def guess_byte_order(self):
  317. ''' As we do not know what file type we have, assume native '''
  318. return boc.native_code
  319. def end_of_stream(self):
  320. b = self.mat_stream.read(1)
  321. curpos = self.mat_stream.tell()
  322. self.mat_stream.seek(curpos-1)
  323. return len(b) == 0
  324. def arr_dtype_number(arr, num):
  325. ''' Return dtype for given number of items per element'''
  326. return np.dtype(arr.dtype.str[:2] + str(num))
  327. def arr_to_chars(arr):
  328. ''' Convert string array to char array '''
  329. dims = list(arr.shape)
  330. if not dims:
  331. dims = [1]
  332. dims.append(int(arr.dtype.str[2:]))
  333. arr = np.ndarray(shape=dims,
  334. dtype=arr_dtype_number(arr, 1),
  335. buffer=arr)
  336. empties = [arr == '']
  337. if not np.any(empties):
  338. return arr
  339. arr = arr.copy()
  340. arr[tuple(empties)] = ' '
  341. return arr