  1. """
  2. NetCDF reader/writer module.
  3. This module is used to read and create NetCDF files. NetCDF files are
  4. accessed through the `netcdf_file` object. Data written to and from NetCDF
  5. files are contained in `netcdf_variable` objects. Attributes are given
  6. as member variables of the `netcdf_file` and `netcdf_variable` objects.
  7. This module implements the Scientific.IO.NetCDF API to read and create
  8. NetCDF files. The same API is also used in the PyNIO and pynetcdf
  9. modules, allowing these modules to be used interchangeably when working
  10. with NetCDF files.
  11. Only NetCDF3 is supported here; for NetCDF4 see
  12. `netCDF4-python <http://unidata.github.io/netcdf4-python/>`__,
  13. which has a similar API.
  14. """
from __future__ import division, print_function, absolute_import

# TODO:
# * properly implement ``_FillValue``.
# * fix character variables.
# * implement PAGESIZE for Python 2.6?

# The Scientific.IO.NetCDF API allows attributes to be added directly to
# instances of ``netcdf_file`` and ``netcdf_variable``. To differentiate
# between user-set attributes and instance attributes, user-set attributes
# are automatically stored in the ``_attributes`` attribute by overloading
# ``__setattr__``. This is the reason why the code sometimes uses
# ``obj.__dict__['key'] = value``, instead of simply ``obj.key = value``;
# otherwise the key would be inserted into userspace attributes.
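#
# A tiny illustration of the two paths (hypothetical values, not executed
# here): given a ``netcdf_file`` instance ``f``,
#
#     f.history = 'created today'   # user attribute -> stored in f._attributes
#     f.__dict__['_recs'] = 0       # internal state -> bypasses _attributes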

__all__ = ['netcdf_file', 'netcdf_variable']


import sys
import warnings
import weakref
from operator import mul
from collections import OrderedDict
import mmap as mm

import numpy as np
from numpy.compat import asbytes, asstr
from numpy import frombuffer, dtype, empty, array, asarray
from numpy import little_endian as LITTLE_ENDIAN
from functools import reduce

from scipy._lib.six import integer_types, text_type, binary_type

IS_PYPY = ('__pypy__' in sys.modules)
ABSENT = b'\x00\x00\x00\x00\x00\x00\x00\x00'
ZERO = b'\x00\x00\x00\x00'
NC_BYTE = b'\x00\x00\x00\x01'
NC_CHAR = b'\x00\x00\x00\x02'
NC_SHORT = b'\x00\x00\x00\x03'
NC_INT = b'\x00\x00\x00\x04'
NC_FLOAT = b'\x00\x00\x00\x05'
NC_DOUBLE = b'\x00\x00\x00\x06'

NC_DIMENSION = b'\x00\x00\x00\n'
NC_VARIABLE = b'\x00\x00\x00\x0b'
NC_ATTRIBUTE = b'\x00\x00\x00\x0c'

FILL_BYTE = b'\x81'
FILL_CHAR = b'\x00'
FILL_SHORT = b'\x80\x01'
FILL_INT = b'\x80\x00\x00\x01'
FILL_FLOAT = b'\x7C\xF0\x00\x00'
FILL_DOUBLE = b'\x47\x9E\x00\x00\x00\x00\x00\x00'

TYPEMAP = {NC_BYTE: ('b', 1),
           NC_CHAR: ('c', 1),
           NC_SHORT: ('h', 2),
           NC_INT: ('i', 4),
           NC_FLOAT: ('f', 4),
           NC_DOUBLE: ('d', 8)}

FILLMAP = {NC_BYTE: FILL_BYTE,
           NC_CHAR: FILL_CHAR,
           NC_SHORT: FILL_SHORT,
           NC_INT: FILL_INT,
           NC_FLOAT: FILL_FLOAT,
           NC_DOUBLE: FILL_DOUBLE}

REVERSE = {('b', 1): NC_BYTE,
           ('B', 1): NC_CHAR,
           ('c', 1): NC_CHAR,
           ('h', 2): NC_SHORT,
           ('i', 4): NC_INT,
           ('f', 4): NC_FLOAT,
           ('d', 8): NC_DOUBLE,
           # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
           # used when getting the types from generic attributes.
           ('l', 4): NC_INT,
           ('S', 1): NC_CHAR}


class netcdf_file(object):
    """
    A file object for NetCDF data.

    A `netcdf_file` object has two standard attributes: `dimensions` and
    `variables`. The values of both are dictionaries, mapping dimension
    names to their associated lengths and variable names to variables,
    respectively. Application programs should never modify these
    dictionaries.

    All other attributes correspond to global attributes defined in the
    NetCDF file. Global file attributes are created by assigning to an
    attribute of the `netcdf_file` object.

    Parameters
    ----------
    filename : string or file-like
        string -> filename
    mode : {'r', 'w', 'a'}, optional
        read-write-append mode, default is 'r'
    mmap : None or bool, optional
        Whether to mmap `filename` when reading. Default is True
        when `filename` is a file name, False when `filename` is a
        file-like object. Note that when mmap is in use, data arrays
        returned refer directly to the mmapped data on disk, and the
        file cannot be closed as long as references to it exist.
    version : {1, 2}, optional
        version of netcdf to read / write, where 1 means *Classic
        format* and 2 means *64-bit offset format*. Default is 1. See
        `here <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_introduction.html#select_format>`__
        for more info.
    maskandscale : bool, optional
        Whether to automatically scale and/or mask data based on attributes.
        Default is False.

    Notes
    -----
    The major advantage of this module over other modules is that it doesn't
    require the code to be linked to the NetCDF libraries. This module is
    derived from `pupynere <https://bitbucket.org/robertodealmeida/pupynere/>`_.

    NetCDF files are a self-describing binary data format. The file contains
    metadata that describes the dimensions and variables in the file. More
    details about NetCDF files can be found `here
    <https://www.unidata.ucar.edu/software/netcdf/docs/user_guide.html>`__. There
    are three main sections to a NetCDF data structure:

    1. Dimensions
    2. Variables
    3. Attributes

    The dimensions section records the name and length of each dimension used
    by the variables. Each variable then indicates which dimensions it uses
    and any attributes such as data units, and contains the data values for
    the variable. It is good practice to include a variable with the same name
    as a dimension to provide the values for that axis. Lastly, the attributes
    section contains additional information such as the name of the file
    creator or the instrument used to collect the data.

    When writing data to a NetCDF file, there is often the need to indicate the
    'record dimension'. A record dimension is the unbounded dimension for a
    variable. For example, a temperature variable may have dimensions of
    latitude, longitude and time. If one wants to add more temperature data to
    the NetCDF file as time progresses, then the temperature variable should
    have the time dimension flagged as the record dimension.

    In addition, the NetCDF file header contains the position of the data in
    the file, so access can be done in an efficient manner without loading
    unnecessary data into memory. It uses the ``mmap`` module to create
    Numpy arrays mapped to the data on disk, for the same purpose.

    Note that when `netcdf_file` is used to open a file with mmap=True
    (default for read-only), arrays returned by it refer to data
    directly on the disk. The file should not be closed, and cannot be cleanly
    closed when asked, if such arrays are alive. You may want to copy data
    arrays obtained from a mmapped NetCDF file if they are to be processed
    after the file is closed; see the example below.

    Examples
    --------
    To create a NetCDF file:

    >>> from scipy.io import netcdf
    >>> f = netcdf.netcdf_file('simple.nc', 'w')
    >>> f.history = 'Created for a test'
    >>> f.createDimension('time', 10)
    >>> time = f.createVariable('time', 'i', ('time',))
    >>> time[:] = np.arange(10)
    >>> time.units = 'days since 2008-01-01'
    >>> f.close()

    Note the assignment of ``arange(10)`` to ``time[:]``. Exposing the slice
    of the time variable allows for the data to be set in the object, rather
    than letting ``arange(10)`` overwrite the ``time`` variable.

    To read the NetCDF file we just created:

    >>> from scipy.io import netcdf
    >>> f = netcdf.netcdf_file('simple.nc', 'r')
    >>> print(f.history)
    b'Created for a test'
    >>> time = f.variables['time']
    >>> print(time.units)
    b'days since 2008-01-01'
    >>> print(time.shape)
    (10,)
    >>> print(time[-1])
    9

    NetCDF files, when opened read-only, return arrays that refer
    directly to memory-mapped data on disk:

    >>> data = time[:]
    >>> data.base.base
    <mmap.mmap object at 0x7fe753763180>

    If the data is to be processed after the file is closed, it needs
    to be copied to main memory:

    >>> data = time[:].copy()
    >>> f.close()
    >>> data.mean()
    4.5

    A NetCDF file can also be used as context manager:

    >>> from scipy.io import netcdf
    >>> with netcdf.netcdf_file('simple.nc', 'r') as f:
    ...     print(f.history)
    b'Created for a test'
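
    To write along a record (unlimited) dimension, pass ``None`` as its
    length; a minimal sketch (file and variable names are illustrative):

    >>> with netcdf.netcdf_file('rec.nc', 'w') as f:
    ...     f.createDimension('time', None)
    ...     t = f.createVariable('time', 'i', ('time',))
    ...     t[:3] = np.arange(3)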
  191. """
    def __init__(self, filename, mode='r', mmap=None, version=1,
                 maskandscale=False):
        """Initialize netcdf_file from fileobj (str or file-like)."""
        if mode not in 'rwa':
            raise ValueError("Mode must be either 'r', 'w' or 'a'.")

        if hasattr(filename, 'seek'):  # file-like
            self.fp = filename
            self.filename = 'None'
            if mmap is None:
                mmap = False
            elif mmap and not hasattr(filename, 'fileno'):
                raise ValueError('Cannot use file object for mmap')
        else:  # maybe it's a string
            self.filename = filename
            omode = 'r+' if mode == 'a' else mode
            self.fp = open(self.filename, '%sb' % omode)
            if mmap is None:
                # Mmapped files on PyPy usually cannot be closed
                # before the GC runs, so it's better to use mmap=False
                # as the default.
                mmap = (not IS_PYPY)

        if mode != 'r':
            # Cannot read write-only files
            mmap = False

        self.use_mmap = mmap
        self.mode = mode
        self.version_byte = version
        self.maskandscale = maskandscale

        self.dimensions = OrderedDict()
        self.variables = OrderedDict()

        self._dims = []
        self._recs = 0
        self._recsize = 0

        self._mm = None
        self._mm_buf = None
        if self.use_mmap:
            self._mm = mm.mmap(self.fp.fileno(), 0, access=mm.ACCESS_READ)
            self._mm_buf = np.frombuffer(self._mm, dtype=np.int8)

        self._attributes = OrderedDict()

        if mode in 'ra':
            self._read()

    def __setattr__(self, attr, value):
        # Store user defined attributes in a separate dict,
        # so we can save them to file later.
        try:
            self._attributes[attr] = value
        except AttributeError:
            pass
        self.__dict__[attr] = value

    def close(self):
        """Closes the NetCDF file."""
        if hasattr(self, 'fp') and not self.fp.closed:
            try:
                self.flush()
            finally:
                self.variables = OrderedDict()
                if self._mm_buf is not None:
                    ref = weakref.ref(self._mm_buf)
                    self._mm_buf = None
                    if ref() is None:
                        # self._mm_buf is gc'd, and we can close the mmap
                        self._mm.close()
                    else:
                        # we cannot close self._mm, since self._mm_buf is
                        # alive and there may still be arrays referring to it
                        warnings.warn((
                            "Cannot close a netcdf_file opened with mmap=True, when "
                            "netcdf_variables or arrays referring to its data still exist. "
                            "All data arrays obtained from such files refer directly to "
                            "data on disk, and must be copied before the file can be cleanly "
                            "closed. (See netcdf_file docstring for more information on mmap.)"
                        ), category=RuntimeWarning)
                self._mm = None
                self.fp.close()
    __del__ = close

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def createDimension(self, name, length):
        """
        Adds a dimension to the Dimension section of the NetCDF data structure.

        Note that this function merely adds a new dimension that the variables can
        reference. The values for the dimension, if desired, should be added as
        a variable using `createVariable`, referring to this dimension.

        Parameters
        ----------
        name : str
            Name of the dimension (Eg, 'lat' or 'time').
        length : int or None
            Length of the dimension. Pass ``None`` to make this the record
            (unlimited) dimension; only the first dimension created may be
            unlimited.

        See Also
        --------
        createVariable
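
        Examples
        --------
        A minimal sketch (file and dimension names are illustrative):

        >>> from scipy.io import netcdf
        >>> f = netcdf.netcdf_file('example.nc', 'w')
        >>> f.createDimension('time', None)  # record (unlimited) dimension
        >>> f.createDimension('lat', 73)     # fixed-length dimension
        >>> lat = f.createVariable('lat', 'f', ('lat',))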
  286. """
  287. if length is None and self._dims:
  288. raise ValueError("Only first dimension may be unlimited!")
  289. self.dimensions[name] = length
  290. self._dims.append(name)

    def createVariable(self, name, type, dimensions):
        """
        Create an empty variable for the `netcdf_file` object, specifying its data
        type and the dimensions it uses.

        Parameters
        ----------
        name : str
            Name of the new variable.
        type : dtype or str
            Data type of the variable.
        dimensions : sequence of str
            List of the dimension names used by the variable, in the desired order.

        Returns
        -------
        variable : netcdf_variable
            The newly created ``netcdf_variable`` object.
            This object has also been added to the `netcdf_file` object.

        See Also
        --------
        createDimension

        Notes
        -----
        Any dimensions to be used by the variable should already exist in the
        NetCDF data structure or should be created by `createDimension` prior to
        creating the NetCDF variable.
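
        Examples
        --------
        A minimal sketch continuing the `createDimension` example above
        (variable name and attribute are illustrative):

        >>> temp = f.createVariable('temp', 'd', ('time', 'lat'))
        >>> temp.units = 'K'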
  316. """
  317. shape = tuple([self.dimensions[dim] for dim in dimensions])
  318. shape_ = tuple([dim or 0 for dim in shape]) # replace None with 0 for numpy
  319. type = dtype(type)
  320. typecode, size = type.char, type.itemsize
  321. if (typecode, size) not in REVERSE:
  322. raise ValueError("NetCDF 3 does not support type %s" % type)
  323. data = empty(shape_, dtype=type.newbyteorder("B")) # convert to big endian always for NetCDF 3
  324. self.variables[name] = netcdf_variable(
  325. data, typecode, size, shape, dimensions,
  326. maskandscale=self.maskandscale)
  327. return self.variables[name]

    def flush(self):
        """
        Perform a sync-to-disk flush if the `netcdf_file` object is in write mode.

        See Also
        --------
        sync : Identical function
        """
        if hasattr(self, 'mode') and self.mode in 'wa':
            self._write()
    sync = flush

    def _write(self):
        self.fp.seek(0)
        self.fp.write(b'CDF')
        self.fp.write(array(self.version_byte, '>b').tostring())

        # Write headers and data.
        self._write_numrecs()
        self._write_dim_array()
        self._write_gatt_array()
        self._write_var_array()

    def _write_numrecs(self):
        # Get highest record count from all record variables.
        for var in self.variables.values():
            if var.isrec and len(var.data) > self._recs:
                self.__dict__['_recs'] = len(var.data)
        self._pack_int(self._recs)

    def _write_dim_array(self):
        if self.dimensions:
            self.fp.write(NC_DIMENSION)
            self._pack_int(len(self.dimensions))
            for name in self._dims:
                self._pack_string(name)
                length = self.dimensions[name]
                self._pack_int(length or 0)  # replace None with 0 for record dimension
        else:
            self.fp.write(ABSENT)

    def _write_gatt_array(self):
        self._write_att_array(self._attributes)

    def _write_att_array(self, attributes):
        if attributes:
            self.fp.write(NC_ATTRIBUTE)
            self._pack_int(len(attributes))
            for name, values in attributes.items():
                self._pack_string(name)
                self._write_att_values(values)
        else:
            self.fp.write(ABSENT)

    def _write_var_array(self):
        if self.variables:
            self.fp.write(NC_VARIABLE)
            self._pack_int(len(self.variables))

            # Sort variable names non-recs first, then recs.
            def sortkey(n):
                v = self.variables[n]
                if v.isrec:
                    return (-1,)
                return v._shape
            variables = sorted(self.variables, key=sortkey, reverse=True)

            # Set the metadata for all variables.
            for name in variables:
                self._write_var_metadata(name)
            # Now that we have the metadata, we know the vsize of
            # each record variable, so we can calculate recsize.
            self.__dict__['_recsize'] = sum([
                    var._vsize for var in self.variables.values()
                    if var.isrec])
            # Set the data for all variables.
            for name in variables:
                self._write_var_data(name)
        else:
            self.fp.write(ABSENT)

    def _write_var_metadata(self, name):
        var = self.variables[name]

        self._pack_string(name)
        self._pack_int(len(var.dimensions))
        for dimname in var.dimensions:
            dimid = self._dims.index(dimname)
            self._pack_int(dimid)

        self._write_att_array(var._attributes)

        nc_type = REVERSE[var.typecode(), var.itemsize()]
        self.fp.write(asbytes(nc_type))

        if not var.isrec:
            vsize = var.data.size * var.data.itemsize
            vsize += -vsize % 4
        else:  # record variable
            try:
                vsize = var.data[0].size * var.data.itemsize
            except IndexError:
                vsize = 0
            rec_vars = len([v for v in self.variables.values()
                            if v.isrec])
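            # Per the classic-format spec, record slabs are padded to a
            # 4-byte boundary only when there is more than one record
            # variable; a single record variable may remain unpadded.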
            if rec_vars > 1:
                vsize += -vsize % 4
        self.variables[name].__dict__['_vsize'] = vsize
        self._pack_int(vsize)

        # Pack a bogus begin, and set the real value later.
        self.variables[name].__dict__['_begin'] = self.fp.tell()
        self._pack_begin(0)

    def _write_var_data(self, name):
        var = self.variables[name]

        # Set begin in file header.
        the_beguine = self.fp.tell()
        self.fp.seek(var._begin)
        self._pack_begin(the_beguine)
        self.fp.seek(the_beguine)

        # Write data.
        if not var.isrec:
            self.fp.write(var.data.tostring())
            count = var.data.size * var.data.itemsize
            self._write_var_padding(var, var._vsize - count)
        else:  # record variable
            # Handle rec vars with shape[0] < nrecs.
            if self._recs > len(var.data):
                shape = (self._recs,) + var.data.shape[1:]
                # Resize in-place does not always work since
                # the array might not be single-segment
                try:
                    var.data.resize(shape)
                except ValueError:
                    var.__dict__['data'] = np.resize(var.data, shape).astype(var.data.dtype)

            pos0 = pos = self.fp.tell()
            for rec in var.data:
                # Apparently scalars cannot be converted to big endian. If we
                # try to convert a ``=i4`` scalar to, say, '>i4' the dtype
                # will remain as ``=i4``.
                if not rec.shape and (rec.dtype.byteorder == '<' or
                        (rec.dtype.byteorder == '=' and LITTLE_ENDIAN)):
                    rec = rec.byteswap()
                self.fp.write(rec.tostring())
                # Padding
                count = rec.size * rec.itemsize
                self._write_var_padding(var, var._vsize - count)
                pos += self._recsize
                self.fp.seek(pos)
            self.fp.seek(pos0 + var._vsize)

    def _write_var_padding(self, var, size):
        encoded_fill_value = var._get_encoded_fill_value()
        num_fills = size // len(encoded_fill_value)
        self.fp.write(encoded_fill_value * num_fills)

    def _write_att_values(self, values):
        if hasattr(values, 'dtype'):
            nc_type = REVERSE[values.dtype.char, values.dtype.itemsize]
        else:
            types = [(t, NC_INT) for t in integer_types]
            types += [
                    (float, NC_FLOAT),
                    (str, NC_CHAR)
                    ]
            # bytes index into scalars in py3k. Check for "string" types
            if isinstance(values, text_type) or isinstance(values, binary_type):
                sample = values
            else:
                try:
                    sample = values[0]  # subscriptable?
                except TypeError:
                    sample = values  # scalar

            for class_, nc_type in types:
                if isinstance(sample, class_):
                    break

        typecode, size = TYPEMAP[nc_type]
        dtype_ = '>%s' % typecode
        # asarray() dies with bytes and '>c' in py3k. Change to 'S'
        dtype_ = 'S' if dtype_ == '>c' else dtype_

        values = asarray(values, dtype=dtype_)

        self.fp.write(asbytes(nc_type))

        if values.dtype.char == 'S':
            nelems = values.itemsize
        else:
            nelems = values.size
        self._pack_int(nelems)

        if not values.shape and (values.dtype.byteorder == '<' or
                (values.dtype.byteorder == '=' and LITTLE_ENDIAN)):
            values = values.byteswap()
        self.fp.write(values.tostring())
        count = values.size * values.itemsize
        self.fp.write(b'\x00' * (-count % 4))  # pad

    def _read(self):
        # Check magic bytes and version
        magic = self.fp.read(3)
        if not magic == b'CDF':
            raise TypeError("Error: %s is not a valid NetCDF 3 file" %
                            self.filename)
        self.__dict__['version_byte'] = frombuffer(self.fp.read(1), '>b')[0]

        # Read file headers and set data.
        self._read_numrecs()
        self._read_dim_array()
        self._read_gatt_array()
        self._read_var_array()

    def _read_numrecs(self):
        self.__dict__['_recs'] = self._unpack_int()

    def _read_dim_array(self):
        header = self.fp.read(4)
        if header not in [ZERO, NC_DIMENSION]:
            raise ValueError("Unexpected header.")
        count = self._unpack_int()

        for dim in range(count):
            name = asstr(self._unpack_string())
            length = self._unpack_int() or None  # None for record dimension
            self.dimensions[name] = length
            self._dims.append(name)  # preserve order

    def _read_gatt_array(self):
        for k, v in self._read_att_array().items():
            self.__setattr__(k, v)

    def _read_att_array(self):
        header = self.fp.read(4)
        if header not in [ZERO, NC_ATTRIBUTE]:
            raise ValueError("Unexpected header.")
        count = self._unpack_int()

        attributes = OrderedDict()
        for attr in range(count):
            name = asstr(self._unpack_string())
            attributes[name] = self._read_att_values()
        return attributes

    def _read_var_array(self):
        header = self.fp.read(4)
        if header not in [ZERO, NC_VARIABLE]:
            raise ValueError("Unexpected header.")

        begin = 0
        dtypes = {'names': [], 'formats': []}
        rec_vars = []
        count = self._unpack_int()
        for var in range(count):
            (name, dimensions, shape, attributes,
             typecode, size, dtype_, begin_, vsize) = self._read_var()
            # https://www.unidata.ucar.edu/software/netcdf/docs/user_guide.html
            # Note that vsize is the product of the dimension lengths
            # (omitting the record dimension) and the number of bytes
            # per value (determined from the type), increased to the
            # next multiple of 4, for each variable. If a record
            # variable, this is the amount of space per record. The
            # netCDF "record size" is calculated as the sum of the
            # vsize's of all the record variables.
            #
            # The vsize field is actually redundant, because its value
            # may be computed from other information in the header. The
            # 32-bit vsize field is not large enough to contain the size
            # of variables that require more than 2^32 - 4 bytes, so
            # 2^32 - 1 is used in the vsize field for such variables.
            if shape and shape[0] is None:  # record variable
                rec_vars.append(name)
                # The netCDF "record size" is calculated as the sum of
                # the vsize's of all the record variables.
                self.__dict__['_recsize'] += vsize
                if begin == 0:
                    begin = begin_
                dtypes['names'].append(name)
                dtypes['formats'].append(str(shape[1:]) + dtype_)

                # Handle padding with a virtual variable.
                if typecode in 'bch':
                    actual_size = reduce(mul, (1,) + shape[1:]) * size
                    padding = -actual_size % 4
                    if padding:
                        dtypes['names'].append('_padding_%d' % var)
                        dtypes['formats'].append('(%d,)>b' % padding)

                # Data will be set later.
                data = None
            else:  # not a record variable
                # Calculate size to avoid problems with vsize (above)
                a_size = reduce(mul, shape, 1) * size
                if self.use_mmap:
                    data = self._mm_buf[begin_:begin_+a_size].view(dtype=dtype_)
                    data.shape = shape
                else:
                    pos = self.fp.tell()
                    self.fp.seek(begin_)
                    data = frombuffer(self.fp.read(a_size), dtype=dtype_
                                      ).copy()
                    data.shape = shape
                    self.fp.seek(pos)

            # Add variable.
            self.variables[name] = netcdf_variable(
                    data, typecode, size, shape, dimensions, attributes,
                    maskandscale=self.maskandscale)

        if rec_vars:
            # Remove padding when only one record variable.
            if len(rec_vars) == 1:
                dtypes['names'] = dtypes['names'][:1]
                dtypes['formats'] = dtypes['formats'][:1]
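
            # Each record interleaves one slab from every record variable
            # (plus any padding), so the entire record section can be viewed
            # as a single structured array of length ``self._recs``.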
            # Build rec array.
            if self.use_mmap:
                rec_array = self._mm_buf[begin:begin+self._recs*self._recsize].view(dtype=dtypes)
                rec_array.shape = (self._recs,)
            else:
                pos = self.fp.tell()
                self.fp.seek(begin)
                rec_array = frombuffer(self.fp.read(self._recs*self._recsize),
                                       dtype=dtypes).copy()
                rec_array.shape = (self._recs,)
                self.fp.seek(pos)

            for var in rec_vars:
                self.variables[var].__dict__['data'] = rec_array[var]

    def _read_var(self):
        name = asstr(self._unpack_string())
        dimensions = []
        shape = []
        dims = self._unpack_int()

        for i in range(dims):
            dimid = self._unpack_int()
            dimname = self._dims[dimid]
            dimensions.append(dimname)
            dim = self.dimensions[dimname]
            shape.append(dim)
        dimensions = tuple(dimensions)
        shape = tuple(shape)

        attributes = self._read_att_array()
        nc_type = self.fp.read(4)
        vsize = self._unpack_int()
        begin = [self._unpack_int, self._unpack_int64][self.version_byte-1]()

        typecode, size = TYPEMAP[nc_type]
        dtype_ = '>%s' % typecode

        return name, dimensions, shape, attributes, typecode, size, dtype_, begin, vsize

    def _read_att_values(self):
        nc_type = self.fp.read(4)
        n = self._unpack_int()

        typecode, size = TYPEMAP[nc_type]

        count = n*size
        values = self.fp.read(int(count))
        self.fp.read(-count % 4)  # read padding

        if typecode != 'c':
            values = frombuffer(values, dtype='>%s' % typecode).copy()
            if values.shape == (1,):
                values = values[0]
        else:
            values = values.rstrip(b'\x00')
        return values

    def _pack_begin(self, begin):
        if self.version_byte == 1:
            self._pack_int(begin)
        elif self.version_byte == 2:
            self._pack_int64(begin)

    def _pack_int(self, value):
        self.fp.write(array(value, '>i').tostring())
    _pack_int32 = _pack_int

    def _unpack_int(self):
        return int(frombuffer(self.fp.read(4), '>i')[0])
    _unpack_int32 = _unpack_int

    def _pack_int64(self, value):
        self.fp.write(array(value, '>q').tostring())

    def _unpack_int64(self):
        return frombuffer(self.fp.read(8), '>q')[0]

    def _pack_string(self, s):
        count = len(s)
        self._pack_int(count)
        self.fp.write(asbytes(s))
        self.fp.write(b'\x00' * (-count % 4))  # pad

    def _unpack_string(self):
        count = self._unpack_int()
        s = self.fp.read(count).rstrip(b'\x00')
        self.fp.read(-count % 4)  # read padding
        return s


class netcdf_variable(object):
    """
    A data object for the `netcdf` module.

    `netcdf_variable` objects are constructed by calling the method
    `netcdf_file.createVariable` on the `netcdf_file` object. `netcdf_variable`
    objects behave much like array objects defined in numpy, except that their
    data resides in a file. Data is read by indexing and written by assigning
    to an indexed subset; the entire array can be accessed by the index ``[:]``
    or (for scalars) by using the methods `getValue` and `assignValue`.
    `netcdf_variable` objects also have attribute `shape` with the same meaning
    as for arrays, but the shape cannot be modified. There is another read-only
    attribute `dimensions`, whose value is the tuple of dimension names.

    All other attributes correspond to variable attributes defined in
    the NetCDF file. Variable attributes are created by assigning to an
    attribute of the `netcdf_variable` object.

    Parameters
    ----------
    data : array_like
        The data array that holds the values for the variable.
        Typically, this is initialized as empty, but with the proper shape.
    typecode : dtype character code
        Desired data-type for the data array.
    size : int
        Desired element size for the data array.
    shape : sequence of ints
        The shape of the array. This should match the lengths of the
        variable's dimensions.
    dimensions : sequence of strings
        The names of the dimensions used by the variable. Must be in the
        same order of the dimension lengths given by `shape`.
    attributes : dict, optional
        Attribute values (any type) keyed by string names. These attributes
        become attributes for the netcdf_variable object.
    maskandscale : bool, optional
        Whether to automatically scale and/or mask data based on attributes.
        Default is False.

    Attributes
    ----------
    dimensions : list of str
        List of names of dimensions used by the variable object.
    isrec, shape
        Properties

    See also
    --------
    isrec, shape
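
    Examples
    --------
    A minimal sketch, reusing the ``simple.nc`` file created in the
    `netcdf_file` example above:

    >>> from scipy.io import netcdf
    >>> f = netcdf.netcdf_file('simple.nc', 'r')
    >>> time = f.variables['time']
    >>> print(time.dimensions)
    ('time',)
    >>> print(time[0])
    0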
  722. """
    def __init__(self, data, typecode, size, shape, dimensions,
                 attributes=None,
                 maskandscale=False):
        self.data = data
        self._typecode = typecode
        self._size = size
        self._shape = shape
        self.dimensions = dimensions
        self.maskandscale = maskandscale

        self._attributes = attributes or OrderedDict()
        for k, v in self._attributes.items():
            self.__dict__[k] = v

    def __setattr__(self, attr, value):
        # Store user defined attributes in a separate dict,
        # so we can save them to file later.
        try:
            self._attributes[attr] = value
        except AttributeError:
            pass
        self.__dict__[attr] = value

    def isrec(self):
        """Returns whether the variable has a record dimension or not.

        A record dimension is a dimension along which additional data could be
        easily appended in the netcdf data structure without much rewriting of
        the data file. This attribute is a read-only property of the
        `netcdf_variable`.

        """
        return bool(self.data.shape) and not self._shape[0]
    isrec = property(isrec)

    def shape(self):
        """Returns the shape tuple of the data variable.

        This is a read-only attribute and cannot be modified in the
        same manner as other numpy arrays.
        """
        return self.data.shape
    shape = property(shape)

    def getValue(self):
        """
        Retrieve a scalar value from a `netcdf_variable` of length one.

        Raises
        ------
        ValueError
            If the netcdf variable is an array of length greater than one,
            this exception will be raised.

        """
        return self.data.item()

    def assignValue(self, value):
        """
        Assign a scalar value to a `netcdf_variable` of length one.

        Parameters
        ----------
        value : scalar
            Scalar value (of compatible type) to assign to a length-one netcdf
            variable. This value will be written to file.

        Raises
        ------
        ValueError
            If the input is not a scalar, or if the destination is not a length-one
            netcdf variable.

        """
        if not self.data.flags.writeable:
            # Work-around for a bug in NumPy. Calling itemset() on a read-only
            # memory-mapped array causes a seg. fault.
            # See NumPy ticket #1622, and SciPy ticket #1202.
            # This check for `writeable` can be removed when the oldest version
            # of numpy still supported by scipy contains the fix for #1622.
            raise RuntimeError("variable is not writeable")

        self.data.itemset(value)

    def typecode(self):
        """
        Return the typecode of the variable.

        Returns
        -------
        typecode : char
            The character typecode of the variable (eg, 'i' for int).

        """
        return self._typecode

    def itemsize(self):
        """
        Return the itemsize of the variable.

        Returns
        -------
        itemsize : int
            The element size of the variable (eg, 8 for float64).

        """
        return self._size

    def __getitem__(self, index):
        if not self.maskandscale:
            return self.data[index]

        data = self.data[index].copy()
        missing_value = self._get_missing_value()
        data = self._apply_missing_value(data, missing_value)
        scale_factor = self._attributes.get('scale_factor')
        add_offset = self._attributes.get('add_offset')
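        # Unpack following the usual netCDF convention:
        #   true_value = stored_value * scale_factor + add_offset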
        if add_offset is not None or scale_factor is not None:
            data = data.astype(np.float64)
        if scale_factor is not None:
            data = data * scale_factor
        if add_offset is not None:
            data += add_offset

        return data

    def __setitem__(self, index, data):
        if self.maskandscale:
            missing_value = (
                    self._get_missing_value() or
                    getattr(data, 'fill_value', 999999))
            self._attributes.setdefault('missing_value', missing_value)
            self._attributes.setdefault('_FillValue', missing_value)
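            # Pack by inverting the convention applied in __getitem__:
            #   stored_value = (true_value - add_offset) / scale_factor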
            data = ((data - self._attributes.get('add_offset', 0.0)) /
                    self._attributes.get('scale_factor', 1.0))
            data = np.ma.asarray(data).filled(missing_value)
            if self._typecode not in 'fd' and data.dtype.kind == 'f':
                data = np.round(data)

        # Expand data for record vars?
        if self.isrec:
            if isinstance(index, tuple):
                rec_index = index[0]
            else:
                rec_index = index
            if isinstance(rec_index, slice):
                recs = (rec_index.start or 0) + len(data)
            else:
                recs = rec_index + 1
            if recs > len(self.data):
                shape = (recs,) + self._shape[1:]
                # Resize in-place does not always work since
                # the array might not be single-segment
                try:
                    self.data.resize(shape)
                except ValueError:
                    self.__dict__['data'] = np.resize(self.data, shape).astype(self.data.dtype)
        self.data[index] = data

    def _default_encoded_fill_value(self):
        """
        The default encoded fill-value for this Variable's data type.
        """
        nc_type = REVERSE[self.typecode(), self.itemsize()]
        return FILLMAP[nc_type]

    def _get_encoded_fill_value(self):
        """
        Returns the encoded fill value for this variable as bytes.

        This is taken from either the _FillValue attribute, or the default fill
        value for this variable's data type.
        """
        if '_FillValue' in self._attributes:
            fill_value = np.array(self._attributes['_FillValue'],
                                  dtype=self.data.dtype).tostring()
            if len(fill_value) == self.itemsize():
                return fill_value
            else:
                return self._default_encoded_fill_value()
        else:
            return self._default_encoded_fill_value()

    def _get_missing_value(self):
        """
        Returns the value denoting "no data" for this variable.

        If this variable does not have a missing/fill value, returns None.

        If both _FillValue and missing_value are given, give precedence to
        _FillValue. The netCDF standard gives special meaning to _FillValue;
        missing_value is just used for compatibility with old datasets.
        """
        if '_FillValue' in self._attributes:
            missing_value = self._attributes['_FillValue']
        elif 'missing_value' in self._attributes:
            missing_value = self._attributes['missing_value']
        else:
            missing_value = None

        return missing_value

    @staticmethod
    def _apply_missing_value(data, missing_value):
        """
        Applies the given missing value to the data array.

        Returns a numpy.ma array, with any value equal to missing_value masked
        out (unless missing_value is None, in which case the original array is
        returned).
        """
        if missing_value is None:
            newdata = data
        else:
            try:
                missing_value_isnan = np.isnan(missing_value)
            except (TypeError, NotImplementedError):
                # some data types (e.g., characters) cannot be tested for NaN
                missing_value_isnan = False

            if missing_value_isnan:
                mymask = np.isnan(data)
            else:
                mymask = (data == missing_value)

            newdata = np.ma.masked_where(mymask, data)

        return newdata


NetCDFFile = netcdf_file
NetCDFVariable = netcdf_variable