arrayterator.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. """
  2. A buffered iterator for big arrays.
  3. This module solves the problem of iterating over a big file-based array
  4. without having to read it into memory. The `Arrayterator` class wraps
  5. an array object, and when iterated it will return sub-arrays with at most
  6. a user-specified number of elements.
  7. """
  8. from __future__ import division, absolute_import, print_function
  9. from operator import mul
  10. from functools import reduce
  11. from numpy.compat import long
  12. __all__ = ['Arrayterator']
  13. class Arrayterator(object):
  14. """
  15. Buffered iterator for big arrays.
  16. `Arrayterator` creates a buffered iterator for reading big arrays in small
  17. contiguous blocks. The class is useful for objects stored in the
  18. file system. It allows iteration over the object *without* reading
  19. everything in memory; instead, small blocks are read and iterated over.
  20. `Arrayterator` can be used with any object that supports multidimensional
  21. slices. This includes NumPy arrays, but also variables from
  22. Scientific.IO.NetCDF or pynetcdf for example.
  23. Parameters
  24. ----------
  25. var : array_like
  26. The object to iterate over.
  27. buf_size : int, optional
  28. The buffer size. If `buf_size` is supplied, the maximum amount of
  29. data that will be read into memory is `buf_size` elements.
  30. Default is None, which will read as many element as possible
  31. into memory.
  32. Attributes
  33. ----------
  34. var
  35. buf_size
  36. start
  37. stop
  38. step
  39. shape
  40. flat
  41. See Also
  42. --------
  43. ndenumerate : Multidimensional array iterator.
  44. flatiter : Flat array iterator.
  45. memmap : Create a memory-map to an array stored in a binary file on disk.
  46. Notes
  47. -----
  48. The algorithm works by first finding a "running dimension", along which
  49. the blocks will be extracted. Given an array of dimensions
  50. ``(d1, d2, ..., dn)``, e.g. if `buf_size` is smaller than ``d1``, the
  51. first dimension will be used. If, on the other hand,
  52. ``d1 < buf_size < d1*d2`` the second dimension will be used, and so on.
  53. Blocks are extracted along this dimension, and when the last block is
  54. returned the process continues from the next dimension, until all
  55. elements have been read.
  56. Examples
  57. --------
  58. >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
  59. >>> a_itor = np.lib.Arrayterator(a, 2)
  60. >>> a_itor.shape
  61. (3, 4, 5, 6)
  62. Now we can iterate over ``a_itor``, and it will return arrays of size
  63. two. Since `buf_size` was smaller than any dimension, the first
  64. dimension will be iterated over first:
  65. >>> for subarr in a_itor:
  66. ... if not subarr.all():
  67. ... print(subarr, subarr.shape)
  68. ...
  69. [[[[0 1]]]] (1, 1, 1, 2)
  70. """
  71. def __init__(self, var, buf_size=None):
  72. self.var = var
  73. self.buf_size = buf_size
  74. self.start = [0 for dim in var.shape]
  75. self.stop = [dim for dim in var.shape]
  76. self.step = [1 for dim in var.shape]
  77. def __getattr__(self, attr):
  78. return getattr(self.var, attr)
  79. def __getitem__(self, index):
  80. """
  81. Return a new arrayterator.
  82. """
  83. # Fix index, handling ellipsis and incomplete slices.
  84. if not isinstance(index, tuple):
  85. index = (index,)
  86. fixed = []
  87. length, dims = len(index), self.ndim
  88. for slice_ in index:
  89. if slice_ is Ellipsis:
  90. fixed.extend([slice(None)] * (dims-length+1))
  91. length = len(fixed)
  92. elif isinstance(slice_, (int, long)):
  93. fixed.append(slice(slice_, slice_+1, 1))
  94. else:
  95. fixed.append(slice_)
  96. index = tuple(fixed)
  97. if len(index) < dims:
  98. index += (slice(None),) * (dims-len(index))
  99. # Return a new arrayterator object.
  100. out = self.__class__(self.var, self.buf_size)
  101. for i, (start, stop, step, slice_) in enumerate(
  102. zip(self.start, self.stop, self.step, index)):
  103. out.start[i] = start + (slice_.start or 0)
  104. out.step[i] = step * (slice_.step or 1)
  105. out.stop[i] = start + (slice_.stop or stop-start)
  106. out.stop[i] = min(stop, out.stop[i])
  107. return out
  108. def __array__(self):
  109. """
  110. Return corresponding data.
  111. """
  112. slice_ = tuple(slice(*t) for t in zip(
  113. self.start, self.stop, self.step))
  114. return self.var[slice_]
  115. @property
  116. def flat(self):
  117. """
  118. A 1-D flat iterator for Arrayterator objects.
  119. This iterator returns elements of the array to be iterated over in
  120. `Arrayterator` one by one. It is similar to `flatiter`.
  121. See Also
  122. --------
  123. Arrayterator
  124. flatiter
  125. Examples
  126. --------
  127. >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
  128. >>> a_itor = np.lib.Arrayterator(a, 2)
  129. >>> for subarr in a_itor.flat:
  130. ... if not subarr:
  131. ... print(subarr, type(subarr))
  132. ...
  133. 0 <type 'numpy.int32'>
  134. """
  135. for block in self:
  136. for value in block.flat:
  137. yield value
  138. @property
  139. def shape(self):
  140. """
  141. The shape of the array to be iterated over.
  142. For an example, see `Arrayterator`.
  143. """
  144. return tuple(((stop-start-1)//step+1) for start, stop, step in
  145. zip(self.start, self.stop, self.step))
  146. def __iter__(self):
  147. # Skip arrays with degenerate dimensions
  148. if [dim for dim in self.shape if dim <= 0]:
  149. return
  150. start = self.start[:]
  151. stop = self.stop[:]
  152. step = self.step[:]
  153. ndims = self.var.ndim
  154. while True:
  155. count = self.buf_size or reduce(mul, self.shape)
  156. # iterate over each dimension, looking for the
  157. # running dimension (ie, the dimension along which
  158. # the blocks will be built from)
  159. rundim = 0
  160. for i in range(ndims-1, -1, -1):
  161. # if count is zero we ran out of elements to read
  162. # along higher dimensions, so we read only a single position
  163. if count == 0:
  164. stop[i] = start[i]+1
  165. elif count <= self.shape[i]:
  166. # limit along this dimension
  167. stop[i] = start[i] + count*step[i]
  168. rundim = i
  169. else:
  170. # read everything along this dimension
  171. stop[i] = self.stop[i]
  172. stop[i] = min(self.stop[i], stop[i])
  173. count = count//self.shape[i]
  174. # yield a block
  175. slice_ = tuple(slice(*t) for t in zip(start, stop, step))
  176. yield self.var[slice_]
  177. # Update start position, taking care of overflow to
  178. # other dimensions
  179. start[rundim] = stop[rundim] # start where we stopped
  180. for i in range(ndims-1, 0, -1):
  181. if start[i] >= self.stop[i]:
  182. start[i] = self.start[i]
  183. start[i-1] += self.step[i-1]
  184. if start[0] >= self.stop[0]:
  185. return