recfunctions.py

  1. """
  2. Collection of utilities to manipulate structured arrays.
  3. Most of these functions were initially implemented by John Hunter for
  4. matplotlib. They have been rewritten and extended for convenience.
  5. """
  6. from __future__ import division, absolute_import, print_function
  7. import sys
  8. import itertools
  9. import numpy as np
  10. import numpy.ma as ma
  11. from numpy import ndarray, recarray
  12. from numpy.ma import MaskedArray
  13. from numpy.ma.mrecords import MaskedRecords
  14. from numpy.core.overrides import array_function_dispatch
  15. from numpy.lib._iotools import _is_string_like
  16. from numpy.compat import basestring
  17. from numpy.testing import suppress_warnings
  18. if sys.version_info[0] < 3:
  19. from future_builtins import zip
  20. _check_fill_value = np.ma.core._check_fill_value
  21. __all__ = [
  22. 'append_fields', 'apply_along_fields', 'assign_fields_by_name',
  23. 'drop_fields', 'find_duplicates', 'flatten_descr',
  24. 'get_fieldstructure', 'get_names', 'get_names_flat',
  25. 'join_by', 'merge_arrays', 'rec_append_fields',
  26. 'rec_drop_fields', 'rec_join', 'recursive_fill_fields',
  27. 'rename_fields', 'repack_fields', 'require_fields',
  28. 'stack_arrays', 'structured_to_unstructured', 'unstructured_to_structured',
  29. ]
  30. def _recursive_fill_fields_dispatcher(input, output):
  31. return (input, output)
  32. @array_function_dispatch(_recursive_fill_fields_dispatcher)
  33. def recursive_fill_fields(input, output):
  34. """
  35. Fills the fields of `output` with the corresponding fields of `input`,
  36. with support for nested structures.
  37. Parameters
  38. ----------
  39. input : ndarray
  40. Input array.
  41. output : ndarray
  42. Output array.
  43. Notes
  44. -----
  45. * `output` should be at least the same size as `input`
  46. Examples
  47. --------
  48. >>> from numpy.lib import recfunctions as rfn
  49. >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', np.int64), ('B', np.float64)])
  50. >>> b = np.zeros((3,), dtype=a.dtype)
  51. >>> rfn.recursive_fill_fields(a, b)
  52. array([(1, 10.), (2, 20.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])
  53. """
  54. newdtype = output.dtype
  55. for field in newdtype.names:
  56. try:
  57. current = input[field]
  58. except ValueError:
  59. continue
  60. if current.dtype.names is not None:
  61. recursive_fill_fields(current, output[field])
  62. else:
  63. output[field][:len(current)] = current
  64. return output
  65. def _get_fieldspec(dtype):
  66. """
  67. Produce a list of name/dtype pairs corresponding to the dtype fields
  68. Similar to dtype.descr, but the second item of each tuple is a dtype, not a
  69. string. As a result, this handles subarray dtypes
  70. Can be passed to the dtype constructor to reconstruct the dtype, noting that
  71. this (deliberately) discards field offsets.
  72. Examples
  73. --------
  74. >>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
  75. >>> dt.descr
  76. [(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
  77. >>> _get_fieldspec(dt)
  78. [(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
  79. """
  80. if dtype.names is None:
  81. # .descr returns a nameless field, so we should too
  82. return [('', dtype)]
  83. else:
  84. fields = ((name, dtype.fields[name]) for name in dtype.names)
  85. # keep any titles, if present
  86. return [
  87. (name if len(f) == 2 else (f[2], name), f[0])
  88. for name, f in fields
  89. ]
  90. def get_names(adtype):
  91. """
  92. Returns the field names of the input datatype as a tuple.
  93. Parameters
  94. ----------
  95. adtype : dtype
  96. Input datatype
  97. Examples
  98. --------
  99. >>> from numpy.lib import recfunctions as rfn
  100. >>> rfn.get_names(np.empty((1,), dtype=int))
  101. Traceback (most recent call last):
  102. ...
  103. AttributeError: 'numpy.ndarray' object has no attribute 'names'
  104. >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]))
  105. Traceback (most recent call last):
  106. ...
  107. AttributeError: 'numpy.ndarray' object has no attribute 'names'
  108. >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
  109. >>> rfn.get_names(adtype)
  110. ('a', ('b', ('ba', 'bb')))
  111. """
  112. listnames = []
  113. names = adtype.names
  114. for name in names:
  115. current = adtype[name]
  116. if current.names is not None:
  117. listnames.append((name, tuple(get_names(current))))
  118. else:
  119. listnames.append(name)
  120. return tuple(listnames)
  121. def get_names_flat(adtype):
  122. """
  123. Returns the field names of the input datatype as a tuple. Nested structures
  124. are flattened beforehand.
  125. Parameters
  126. ----------
  127. adtype : dtype
  128. Input datatype
  129. Examples
  130. --------
  131. >>> from numpy.lib import recfunctions as rfn
  132. >>> rfn.get_names_flat(np.empty((1,), dtype=int)) is None
  133. Traceback (most recent call last):
  134. ...
  135. AttributeError: 'numpy.ndarray' object has no attribute 'names'
  136. >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)]))
  137. Traceback (most recent call last):
  138. ...
  139. AttributeError: 'numpy.ndarray' object has no attribute 'names'
  140. >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
  141. >>> rfn.get_names_flat(adtype)
  142. ('a', 'b', 'ba', 'bb')
  143. """
  144. listnames = []
  145. names = adtype.names
  146. for name in names:
  147. listnames.append(name)
  148. current = adtype[name]
  149. if current.names is not None:
  150. listnames.extend(get_names_flat(current))
  151. return tuple(listnames)
  152. def flatten_descr(ndtype):
  153. """
  154. Flatten a structured data-type description.
  155. Examples
  156. --------
  157. >>> from numpy.lib import recfunctions as rfn
  158. >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
  159. >>> rfn.flatten_descr(ndtype)
  160. (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))
  161. """
  162. names = ndtype.names
  163. if names is None:
  164. return (('', ndtype),)
  165. else:
  166. descr = []
  167. for field in names:
  168. (typ, _) = ndtype.fields[field]
  169. if typ.names:
  170. descr.extend(flatten_descr(typ))
  171. else:
  172. descr.append((field, typ))
  173. return tuple(descr)
  174. def _zip_dtype(seqarrays, flatten=False):
  175. newdtype = []
  176. if flatten:
  177. for a in seqarrays:
  178. newdtype.extend(flatten_descr(a.dtype))
  179. else:
  180. for a in seqarrays:
  181. current = a.dtype
  182. if current.names is not None and len(current.names) == 1:
  183. # special case - dtypes of 1 field are flattened
  184. newdtype.extend(_get_fieldspec(current))
  185. else:
  186. newdtype.append(('', current))
  187. return np.dtype(newdtype)
  188. def _zip_descr(seqarrays, flatten=False):
  189. """
  190. Combine the dtype description of a series of arrays.
  191. Parameters
  192. ----------
  193. seqarrays : sequence of arrays
  194. Sequence of arrays
  195. flatten : {boolean}, optional
  196. Whether to collapse nested descriptions.
  197. """
  198. return _zip_dtype(seqarrays, flatten=flatten).descr
  199. def get_fieldstructure(adtype, lastname=None, parents=None,):
  200. """
  201. Returns a dictionary with fields indexing lists of their parent fields.
  202. This function is used to simplify access to fields nested in other fields.
  203. Parameters
  204. ----------
  205. adtype : np.dtype
  206. Input datatype
  207. lastname : optional
  208. Last processed field name (used internally during recursion).
  209. parents : dictionary
  210. Dictionary of parent fields (used internally during recursion).
  211. Examples
  212. --------
  213. >>> from numpy.lib import recfunctions as rfn
  214. >>> ndtype = np.dtype([('A', int),
  215. ... ('B', [('BA', int),
  216. ... ('BB', [('BBA', int), ('BBB', int)])])])
  217. >>> rfn.get_fieldstructure(ndtype)
  218. ... # XXX: possible regression, order of BBA and BBB is swapped
  219. {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}
  220. """
  221. if parents is None:
  222. parents = {}
  223. names = adtype.names
  224. for name in names:
  225. current = adtype[name]
  226. if current.names is not None:
  227. if lastname:
  228. parents[name] = [lastname, ]
  229. else:
  230. parents[name] = []
  231. parents.update(get_fieldstructure(current, name, parents))
  232. else:
  233. lastparent = [_ for _ in (parents.get(lastname, []) or [])]
  234. if lastparent:
  235. lastparent.append(lastname)
  236. elif lastname:
  237. lastparent = [lastname, ]
  238. parents[name] = lastparent or []
  239. return parents
  240. def _izip_fields_flat(iterable):
  241. """
  242. Returns an iterator of concatenated fields from a sequence of arrays,
  243. collapsing any nested structure.
  244. """
  245. for element in iterable:
  246. if isinstance(element, np.void):
  247. for f in _izip_fields_flat(tuple(element)):
  248. yield f
  249. else:
  250. yield element
  251. def _izip_fields(iterable):
  252. """
  253. Returns an iterator of concatenated fields from a sequence of arrays.
  254. """
  255. for element in iterable:
  256. if (hasattr(element, '__iter__') and
  257. not isinstance(element, basestring)):
  258. for f in _izip_fields(element):
  259. yield f
  260. elif isinstance(element, np.void) and len(tuple(element)) == 1:
  261. for f in _izip_fields(element):
  262. yield f
  263. else:
  264. yield element
  265. def _izip_records(seqarrays, fill_value=None, flatten=True):
  266. """
  267. Returns an iterator of concatenated items from a sequence of arrays.
  268. Parameters
  269. ----------
  270. seqarrays : sequence of arrays
  271. Sequence of arrays.
  272. fill_value : {None, integer}
  273. Value used to pad shorter iterables.
  274. flatten : {True, False}, optional
  275. Whether to collapse nested fields (True) or keep the nested structure (False).
  276. """
  277. # Should we flatten the items, or just use a nested approach
  278. if flatten:
  279. zipfunc = _izip_fields_flat
  280. else:
  281. zipfunc = _izip_fields
  282. if sys.version_info[0] >= 3:
  283. zip_longest = itertools.zip_longest
  284. else:
  285. zip_longest = itertools.izip_longest
  286. for tup in zip_longest(*seqarrays, fillvalue=fill_value):
  287. yield tuple(zipfunc(tup))
  288. def _fix_output(output, usemask=True, asrecarray=False):
  289. """
  290. Private function: return a recarray, a ndarray, a MaskedArray
  291. or a MaskedRecords depending on the input parameters
  292. """
  293. if not isinstance(output, MaskedArray):
  294. usemask = False
  295. if usemask:
  296. if asrecarray:
  297. output = output.view(MaskedRecords)
  298. else:
  299. output = ma.filled(output)
  300. if asrecarray:
  301. output = output.view(recarray)
  302. return output
  303. def _fix_defaults(output, defaults=None):
  304. """
  305. Update the fill_value and masked data of `output`
  306. from the default given in a dictionary defaults.
  307. """
  308. names = output.dtype.names
  309. (data, mask, fill_value) = (output.data, output.mask, output.fill_value)
  310. for (k, v) in (defaults or {}).items():
  311. if k in names:
  312. fill_value[k] = v
  313. data[k][mask[k]] = v
  314. return output
  315. def _merge_arrays_dispatcher(seqarrays, fill_value=None, flatten=None,
  316. usemask=None, asrecarray=None):
  317. return seqarrays
  318. @array_function_dispatch(_merge_arrays_dispatcher)
  319. def merge_arrays(seqarrays, fill_value=-1, flatten=False,
  320. usemask=False, asrecarray=False):
  321. """
  322. Merge arrays field by field.
  323. Parameters
  324. ----------
  325. seqarrays : sequence of ndarrays
  326. Sequence of arrays
  327. fill_value : {float}, optional
  328. Filling value used to pad missing data on the shorter arrays.
  329. flatten : {False, True}, optional
  330. Whether to collapse nested fields.
  331. usemask : {False, True}, optional
  332. Whether to return a masked array or not.
  333. asrecarray : {False, True}, optional
  334. Whether to return a recarray (MaskedRecords) or not.
  335. Examples
  336. --------
  337. >>> from numpy.lib import recfunctions as rfn
  338. >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
  339. array([( 1, 10.), ( 2, 20.), (-1, 30.)],
  340. dtype=[('f0', '<i8'), ('f1', '<f8')])
  341. >>> rfn.merge_arrays((np.array([1, 2], dtype=np.int64),
  342. ... np.array([10., 20., 30.])), usemask=False)
  343. array([(1, 10.0), (2, 20.0), (-1, 30.0)],
  344. dtype=[('f0', '<i8'), ('f1', '<f8')])
  345. >>> rfn.merge_arrays((np.array([1, 2]).view([('a', np.int64)]),
  346. ... np.array([10., 20., 30.])),
  347. ... usemask=False, asrecarray=True)
  348. rec.array([( 1, 10.), ( 2, 20.), (-1, 30.)],
  349. dtype=[('a', '<i8'), ('f1', '<f8')])
  350. Notes
  351. -----
  352. * Without a mask, the missing value will be filled with something,
  353. depending on its corresponding type:
  354. * ``-1`` for integers
  355. * ``-1.0`` for floating point numbers
  356. * ``'-'`` for characters
  357. * ``'-1'`` for strings
  358. * ``True`` for boolean values
  359. * These default values were determined empirically.
  360. """
  361. # Only one item in the input sequence ?
  362. if (len(seqarrays) == 1):
  363. seqarrays = np.asanyarray(seqarrays[0])
  364. # Do we have a single ndarray as input ?
  365. if isinstance(seqarrays, (ndarray, np.void)):
  366. seqdtype = seqarrays.dtype
  367. # Make sure we have named fields
  368. if seqdtype.names is None:
  369. seqdtype = np.dtype([('', seqdtype)])
  370. if not flatten or _zip_dtype((seqarrays,), flatten=True) == seqdtype:
  371. # Minimal processing needed: just make sure everything's a-ok
  372. seqarrays = seqarrays.ravel()
  373. # Find what type of array we must return
  374. if usemask:
  375. if asrecarray:
  376. seqtype = MaskedRecords
  377. else:
  378. seqtype = MaskedArray
  379. elif asrecarray:
  380. seqtype = recarray
  381. else:
  382. seqtype = ndarray
  383. return seqarrays.view(dtype=seqdtype, type=seqtype)
  384. else:
  385. seqarrays = (seqarrays,)
  386. else:
  387. # Make sure we have arrays in the input sequence
  388. seqarrays = [np.asanyarray(_m) for _m in seqarrays]
  389. # Find the sizes of the inputs and their maximum
  390. sizes = tuple(a.size for a in seqarrays)
  391. maxlength = max(sizes)
  392. # Get the dtype of the output (flattening if needed)
  393. newdtype = _zip_dtype(seqarrays, flatten=flatten)
  394. # Initialize the sequences for data and mask
  395. seqdata = []
  396. seqmask = []
  397. # If we expect some kind of MaskedArray, make a special loop.
  398. if usemask:
  399. for (a, n) in zip(seqarrays, sizes):
  400. nbmissing = (maxlength - n)
  401. # Get the data and mask
  402. data = a.ravel().__array__()
  403. mask = ma.getmaskarray(a).ravel()
  404. # Get the filling value (if needed)
  405. if nbmissing:
  406. fval = _check_fill_value(fill_value, a.dtype)
  407. if isinstance(fval, (ndarray, np.void)):
  408. if len(fval.dtype) == 1:
  409. fval = fval.item()[0]
  410. fmsk = True
  411. else:
  412. fval = np.array(fval, dtype=a.dtype, ndmin=1)
  413. fmsk = np.ones((1,), dtype=mask.dtype)
  414. else:
  415. fval = None
  416. fmsk = True
  417. # Store an iterator padding the input to the expected length
  418. seqdata.append(itertools.chain(data, [fval] * nbmissing))
  419. seqmask.append(itertools.chain(mask, [fmsk] * nbmissing))
  420. # Create an iterator for the data
  421. data = tuple(_izip_records(seqdata, flatten=flatten))
  422. output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength),
  423. mask=list(_izip_records(seqmask, flatten=flatten)))
  424. if asrecarray:
  425. output = output.view(MaskedRecords)
  426. else:
  427. # Same as before, without the mask we don't need...
  428. for (a, n) in zip(seqarrays, sizes):
  429. nbmissing = (maxlength - n)
  430. data = a.ravel().__array__()
  431. if nbmissing:
  432. fval = _check_fill_value(fill_value, a.dtype)
  433. if isinstance(fval, (ndarray, np.void)):
  434. if len(fval.dtype) == 1:
  435. fval = fval.item()[0]
  436. else:
  437. fval = np.array(fval, dtype=a.dtype, ndmin=1)
  438. else:
  439. fval = None
  440. seqdata.append(itertools.chain(data, [fval] * nbmissing))
  441. output = np.fromiter(tuple(_izip_records(seqdata, flatten=flatten)),
  442. dtype=newdtype, count=maxlength)
  443. if asrecarray:
  444. output = output.view(recarray)
  445. # And we're done...
  446. return output
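# A minimal sketch of ``flatten=True`` with already-structured inputs: the
# fields of all input arrays are collapsed into one flat dtype. The field
# names 'a', 'b' and 'c' below are illustrative, not taken from the examples
# in the docstring above.
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> x = np.array([(1, 2.)], dtype=[('a', 'i8'), ('b', 'f8')])
#     >>> y = np.array([(3,)], dtype=[('c', 'i8')])
#     >>> rfn.merge_arrays((x, y), flatten=True)
#     array([(1, 2., 3)], dtype=[('a', '<i8'), ('b', '<f8'), ('c', '<i8')])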
  447. def _drop_fields_dispatcher(base, drop_names, usemask=None, asrecarray=None):
  448. return (base,)
  449. @array_function_dispatch(_drop_fields_dispatcher)
  450. def drop_fields(base, drop_names, usemask=True, asrecarray=False):
  451. """
  452. Return a new array with fields in `drop_names` dropped.
  453. Nested fields are supported.
  454. Parameters
  455. ----------
  456. base : array
  457. Input array
  458. drop_names : string or sequence
  459. String or sequence of strings corresponding to the names of the
  460. fields to drop.
  461. usemask : {False, True}, optional
  462. Whether to return a masked array or not.
  463. asrecarray : bool, optional
  464. Whether to return a recarray or a mrecarray (`asrecarray=True`) or
  465. a plain ndarray or masked array with flexible dtype. The default
  466. is False.
  467. Examples
  468. --------
  469. >>> from numpy.lib import recfunctions as rfn
  470. >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
  471. ... dtype=[('a', np.int64), ('b', [('ba', np.double), ('bb', np.int64)])])
  472. >>> rfn.drop_fields(a, 'a')
  473. array([((2., 3),), ((5., 6),)],
  474. dtype=[('b', [('ba', '<f8'), ('bb', '<i8')])])
  475. >>> rfn.drop_fields(a, 'ba')
  476. array([(1, (3,)), (4, (6,))], dtype=[('a', '<i8'), ('b', [('bb', '<i8')])])
  477. >>> rfn.drop_fields(a, ['ba', 'bb'])
  478. array([(1,), (4,)], dtype=[('a', '<i8')])
  479. """
  480. if _is_string_like(drop_names):
  481. drop_names = [drop_names]
  482. else:
  483. drop_names = set(drop_names)
  484. def _drop_descr(ndtype, drop_names):
  485. names = ndtype.names
  486. newdtype = []
  487. for name in names:
  488. current = ndtype[name]
  489. if name in drop_names:
  490. continue
  491. if current.names:
  492. descr = _drop_descr(current, drop_names)
  493. if descr:
  494. newdtype.append((name, descr))
  495. else:
  496. newdtype.append((name, current))
  497. return newdtype
  498. newdtype = _drop_descr(base.dtype, drop_names)
  499. if not newdtype:
  500. return None
  501. output = np.empty(base.shape, dtype=newdtype)
  502. output = recursive_fill_fields(base, output)
  503. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  504. def _keep_fields(base, keep_names, usemask=True, asrecarray=False):
  505. """
  506. Return a new array keeping only the fields in `keep_names`,
  507. and preserving the order of those fields.
  508. Parameters
  509. ----------
  510. base : array
  511. Input array
  512. keep_names : string or sequence
  513. String or sequence of strings corresponding to the names of the
  514. fields to keep. Order of the names will be preserved.
  515. usemask : {False, True}, optional
  516. Whether to return a masked array or not.
  517. asrecarray : bool, optional
  518. Whether to return a recarray or a mrecarray (`asrecarray=True`) or
  519. a plain ndarray or masked array with flexible dtype. The default
  520. is False.
  521. """
  522. newdtype = [(n, base.dtype[n]) for n in keep_names]
  523. output = np.empty(base.shape, dtype=newdtype)
  524. output = recursive_fill_fields(base, output)
  525. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  526. def _rec_drop_fields_dispatcher(base, drop_names):
  527. return (base,)
  528. @array_function_dispatch(_rec_drop_fields_dispatcher)
  529. def rec_drop_fields(base, drop_names):
  530. """
  531. Returns a new numpy.recarray with fields in `drop_names` dropped.
  532. """
  533. return drop_fields(base, drop_names, usemask=False, asrecarray=True)
  534. def _rename_fields_dispatcher(base, namemapper):
  535. return (base,)
  536. @array_function_dispatch(_rename_fields_dispatcher)
  537. def rename_fields(base, namemapper):
  538. """
  539. Rename the fields from a flexible-datatype ndarray or recarray.
  540. Nested fields are supported.
  541. Parameters
  542. ----------
  543. base : ndarray
  544. Input array whose fields must be modified.
  545. namemapper : dictionary
  546. Dictionary mapping old field names to their new version.
  547. Examples
  548. --------
  549. >>> from numpy.lib import recfunctions as rfn
  550. >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
  551. ... dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
  552. >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
  553. array([(1, (2., [ 3., 30.])), (4, (5., [ 6., 60.]))],
  554. dtype=[('A', '<i8'), ('b', [('ba', '<f8'), ('BB', '<f8', (2,))])])
  555. """
  556. def _recursive_rename_fields(ndtype, namemapper):
  557. newdtype = []
  558. for name in ndtype.names:
  559. newname = namemapper.get(name, name)
  560. current = ndtype[name]
  561. if current.names is not None:
  562. newdtype.append(
  563. (newname, _recursive_rename_fields(current, namemapper))
  564. )
  565. else:
  566. newdtype.append((newname, current))
  567. return newdtype
  568. newdtype = _recursive_rename_fields(base.dtype, namemapper)
  569. return base.view(newdtype)
  570. def _append_fields_dispatcher(base, names, data, dtypes=None,
  571. fill_value=None, usemask=None, asrecarray=None):
  572. yield base
  573. for d in data:
  574. yield d
  575. @array_function_dispatch(_append_fields_dispatcher)
  576. def append_fields(base, names, data, dtypes=None,
  577. fill_value=-1, usemask=True, asrecarray=False):
  578. """
  579. Add new fields to an existing array.
  580. The names of the fields are given with the `names` arguments,
  581. the corresponding values with the `data` arguments.
  582. If a single field is appended, `names`, `data` and `dtypes` do not have
  583. to be lists but just values.
  584. Parameters
  585. ----------
  586. base : array
  587. Input array to extend.
  588. names : string, sequence
  589. String or sequence of strings corresponding to the names
  590. of the new fields.
  591. data : array or sequence of arrays
  592. Array or sequence of arrays storing the fields to add to the base.
  593. dtypes : sequence of datatypes, optional
  594. Datatype or sequence of datatypes.
  595. If None, the datatypes are estimated from the `data`.
  596. fill_value : {float}, optional
  597. Filling value used to pad missing data on the shorter arrays.
  598. usemask : {False, True}, optional
  599. Whether to return a masked array or not.
  600. asrecarray : {False, True}, optional
  601. Whether to return a recarray (MaskedRecords) or not.
  602. """
  603. # Check the names
  604. if isinstance(names, (tuple, list)):
  605. if len(names) != len(data):
  606. msg = "The number of arrays does not match the number of names"
  607. raise ValueError(msg)
  608. elif isinstance(names, basestring):
  609. names = [names, ]
  610. data = [data, ]
  611. #
  612. if dtypes is None:
  613. data = [np.array(a, copy=False, subok=True) for a in data]
  614. data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
  615. else:
  616. if not isinstance(dtypes, (tuple, list)):
  617. dtypes = [dtypes, ]
  618. if len(data) != len(dtypes):
  619. if len(dtypes) == 1:
  620. dtypes = dtypes * len(data)
  621. else:
  622. msg = "The dtypes argument must be None, a dtype, or a list."
  623. raise ValueError(msg)
  624. data = [np.array(a, copy=False, subok=True, dtype=d).view([(n, d)])
  625. for (a, n, d) in zip(data, names, dtypes)]
  626. #
  627. base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
  628. if len(data) > 1:
  629. data = merge_arrays(data, flatten=True, usemask=usemask,
  630. fill_value=fill_value)
  631. else:
  632. data = data.pop()
  633. #
  634. output = ma.masked_all(
  635. max(len(base), len(data)),
  636. dtype=_get_fieldspec(base.dtype) + _get_fieldspec(data.dtype))
  637. output = recursive_fill_fields(base, output)
  638. output = recursive_fill_fields(data, output)
  639. #
  640. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
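# A usage sketch for append_fields on a small example array; the field names
# 'id', 'price' and 'qty' are illustrative. With ``usemask=False`` a plain
# ndarray is returned instead of a masked array.
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('id', 'i8'), ('price', 'f8')])
#     >>> rfn.append_fields(a, 'qty', [3, 4], dtypes='i8', usemask=False)
#     array([(1, 10., 3), (2, 20., 4)],
#           dtype=[('id', '<i8'), ('price', '<f8'), ('qty', '<i8')])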
  641. def _rec_append_fields_dispatcher(base, names, data, dtypes=None):
  642. yield base
  643. for d in data:
  644. yield d
  645. @array_function_dispatch(_rec_append_fields_dispatcher)
  646. def rec_append_fields(base, names, data, dtypes=None):
  647. """
  648. Add new fields to an existing array.
  649. The names of the fields are given with the `names` arguments,
  650. the corresponding values with the `data` arguments.
  651. If a single field is appended, `names`, `data` and `dtypes` do not have
  652. to be lists but just values.
  653. Parameters
  654. ----------
  655. base : array
  656. Input array to extend.
  657. names : string, sequence
  658. String or sequence of strings corresponding to the names
  659. of the new fields.
  660. data : array or sequence of arrays
  661. Array or sequence of arrays storing the fields to add to the base.
  662. dtypes : sequence of datatypes, optional
  663. Datatype or sequence of datatypes.
  664. If None, the datatypes are estimated from the `data`.
  665. See Also
  666. --------
  667. append_fields
  668. Returns
  669. -------
  670. appended_array : np.recarray
  671. """
  672. return append_fields(base, names, data=data, dtypes=dtypes,
  673. asrecarray=True, usemask=False)
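# rec_append_fields returns a recarray instead, so appended fields are also
# attribute-accessible (field names here are illustrative):
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> a = np.array([(1,), (2,)], dtype=[('id', 'i8')])
#     >>> rfn.rec_append_fields(a, 'qty', [3, 4], dtypes='i8').qty
#     array([3, 4])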
  674. def _repack_fields_dispatcher(a, align=None, recurse=None):
  675. return (a,)
  676. @array_function_dispatch(_repack_fields_dispatcher)
  677. def repack_fields(a, align=False, recurse=False):
  678. """
  679. Re-pack the fields of a structured array or dtype in memory.
  680. The memory layout of structured datatypes allows fields at arbitrary
  681. byte offsets. This means the fields can be separated by padding bytes,
  682. their offsets can be non-monotonically increasing, and they can overlap.
  683. This method removes any overlaps and reorders the fields in memory so they
  684. have increasing byte offsets, and adds or removes padding bytes depending
  685. on the `align` option, which behaves like the `align` option to `np.dtype`.
  686. If `align=False`, this method produces a "packed" memory layout in which
  687. each field starts at the byte the previous field ended, and any padding
  688. bytes are removed.
  689. If `align=True`, this method produces an "aligned" memory layout in which
  690. each field's offset is a multiple of its alignment, and the total itemsize
  691. is a multiple of the largest alignment, by adding padding bytes as needed.
  692. Parameters
  693. ----------
  694. a : ndarray or dtype
  695. array or dtype for which to repack the fields.
  696. align : boolean
  697. If true, use an "aligned" memory layout, otherwise use a "packed" layout.
  698. recurse : boolean
  699. If True, also repack nested structures.
  700. Returns
  701. -------
  702. repacked : ndarray or dtype
  703. Copy of `a` with fields repacked, or `a` itself if no repacking was
  704. needed.
  705. Examples
  706. --------
  707. >>> from numpy.lib import recfunctions as rfn
  708. >>> def print_offsets(d):
  709. ... print("offsets:", [d.fields[name][1] for name in d.names])
  710. ... print("itemsize:", d.itemsize)
  711. ...
  712. >>> dt = np.dtype('u1, <i8, <f8', align=True)
  713. >>> dt
  714. dtype({'names':['f0','f1','f2'], 'formats':['u1','<i8','<f8'], 'offsets':[0,8,16], 'itemsize':24}, align=True)
  715. >>> print_offsets(dt)
  716. offsets: [0, 8, 16]
  717. itemsize: 24
  718. >>> packed_dt = rfn.repack_fields(dt)
  719. >>> packed_dt
  720. dtype([('f0', 'u1'), ('f1', '<i8'), ('f2', '<f8')])
  721. >>> print_offsets(packed_dt)
  722. offsets: [0, 1, 9]
  723. itemsize: 17
  724. """
  725. if not isinstance(a, np.dtype):
  726. dt = repack_fields(a.dtype, align=align, recurse=recurse)
  727. return a.astype(dt, copy=False)
  728. if a.names is None:
  729. return a
  730. fieldinfo = []
  731. for name in a.names:
  732. tup = a.fields[name]
  733. if recurse:
  734. fmt = repack_fields(tup[0], align=align, recurse=True)
  735. else:
  736. fmt = tup[0]
  737. if len(tup) == 3:
  738. name = (tup[2], name)
  739. fieldinfo.append((name, fmt))
  740. dt = np.dtype(fieldinfo, align=align)
  741. return np.dtype((a.type, dt))
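# repack_fields also accepts arrays, not just dtypes; repacking then copies
# the data into the packed layout. A small sketch reusing the aligned dtype
# from the docstring example above:
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> a = np.zeros(3, dtype=np.dtype('u1, <i8, <f8', align=True))
#     >>> a.itemsize, rfn.repack_fields(a).itemsize
#     (24, 17)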
  742. def _get_fields_and_offsets(dt, offset=0):
  743. """
  744. Returns a flat list of (dtype, count, offset) tuples of all the
  745. scalar fields in the dtype "dt", including nested fields, in left
  746. to right order.
  747. """
  748. # counts up elements in subarrays, including nested subarrays, and returns
  749. # base dtype and count
  750. def count_elem(dt):
  751. count = 1
  752. while dt.shape != ():
  753. for size in dt.shape:
  754. count *= size
  755. dt = dt.base
  756. return dt, count
  757. fields = []
  758. for name in dt.names:
  759. field = dt.fields[name]
  760. f_dt, f_offset = field[0], field[1]
  761. f_dt, n = count_elem(f_dt)
  762. if f_dt.names is None:
  763. fields.append((np.dtype((f_dt, (n,))), n, f_offset + offset))
  764. else:
  765. subfields = _get_fields_and_offsets(f_dt, f_offset + offset)
  766. size = f_dt.itemsize
  767. for i in range(n):
  768. if i == 0:
  769. # optimization: avoid list comprehension if no subarray
  770. fields.extend(subfields)
  771. else:
  772. fields.extend([(d, c, o + i*size) for d, c, o in subfields])
  773. return fields
  774. def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
  775. casting=None):
  776. return (arr,)
  777. @array_function_dispatch(_structured_to_unstructured_dispatcher)
  778. def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
  779. """
  780. Converts an n-D structured array into an (n+1)-D unstructured array.
  781. The new array will have a new last dimension equal in size to the
  782. number of field-elements of the input array. If not supplied, the output
  783. datatype is determined from the numpy type promotion rules applied to all
  784. the field datatypes.
  785. Nested fields, as well as each element of any subarray fields, each
  786. count as a single field-element.
  787. Parameters
  788. ----------
  789. arr : ndarray
  790. Structured array or dtype to convert. Cannot contain object datatype.
  791. dtype : dtype, optional
  792. The dtype of the output unstructured array.
  793. copy : bool, optional
  794. See copy argument to `ndarray.astype`. If true, always return a copy.
  795. If false, and `dtype` requirements are satisfied, a view is returned.
  796. casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
  797. See casting argument of `ndarray.astype`. Controls what kind of data
  798. casting may occur.
  799. Returns
  800. -------
  801. unstructured : ndarray
  802. Unstructured array with one more dimension.
  803. Examples
  804. --------
  805. >>> from numpy.lib import recfunctions as rfn
  806. >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
  807. >>> a
  808. array([(0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.]),
  809. (0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.])],
  810. dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
  811. >>> rfn.structured_to_unstructured(a)
  812. array([[0., 0., 0., 0., 0.],
  813. [0., 0., 0., 0., 0.],
  814. [0., 0., 0., 0., 0.],
  815. [0., 0., 0., 0., 0.]])
  816. >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
  817. ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
  818. >>> np.mean(rfn.structured_to_unstructured(b[['x', 'z']]), axis=-1)
  819. array([ 3. , 5.5, 9. , 11. ])
  820. """
  821. if arr.dtype.names is None:
  822. raise ValueError('arr must be a structured array')
  823. fields = _get_fields_and_offsets(arr.dtype)
  824. n_fields = len(fields)
  825. if n_fields == 0 and dtype is None:
  826. raise ValueError("arr has no fields. Unable to guess dtype")
  827. elif n_fields == 0:
  828. # too many bugs elsewhere for this to work now
  829. raise NotImplementedError("arr with no fields is not supported")
  830. dts, counts, offsets = zip(*fields)
  831. names = ['f{}'.format(n) for n in range(n_fields)]
  832. if dtype is None:
  833. out_dtype = np.result_type(*[dt.base for dt in dts])
  834. else:
  835. out_dtype = dtype
  836. # Use a series of views and casts to convert to an unstructured array:
  837. # first view using flattened fields (doesn't work for object arrays)
  838. # Note: dts may include a shape for subarrays
  839. flattened_fields = np.dtype({'names': names,
  840. 'formats': dts,
  841. 'offsets': offsets,
  842. 'itemsize': arr.dtype.itemsize})
  843. with suppress_warnings() as sup: # until 1.16 (gh-12447)
  844. sup.filter(FutureWarning, "Numpy has detected")
  845. arr = arr.view(flattened_fields)
  846. # next cast to a packed format with all fields converted to new dtype
  847. packed_fields = np.dtype({'names': names,
  848. 'formats': [(out_dtype, dt.shape) for dt in dts]})
  849. arr = arr.astype(packed_fields, copy=copy, casting=casting)
  850. # finally it is safe to view the packed fields as the unstructured type
  851. return arr.view((out_dtype, (sum(counts),)))
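# A sketch of the optional ``dtype`` argument: the result can be cast to a
# chosen output type instead of the promoted one (field names 'x' and 'y'
# and the values are illustrative):
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> a = np.array([(1, 2.5), (3, 4.5)], dtype=[('x', 'i4'), ('y', 'f8')])
#     >>> rfn.structured_to_unstructured(a, dtype=np.float32)
#     array([[1. , 2.5],
#            [3. , 4.5]], dtype=float32)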
  852. def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
  853. align=None, copy=None, casting=None):
  854. return (arr,)
  855. @array_function_dispatch(_unstructured_to_structured_dispatcher)
  856. def unstructured_to_structured(arr, dtype=None, names=None, align=False,
  857. copy=False, casting='unsafe'):
  858. """
  859. Converts an n-D unstructured array into an (n-1)-D structured array.
  860. The last dimension of the input array is converted into a structure, with
  861. number of field-elements equal to the size of the last dimension of the
  862. input array. By default all output fields have the input array's dtype, but
  863. an output structured dtype with an equal number of field-elements can be
  864. supplied instead.
  865. Nested fields, as well as each element of any subarray fields, all count
  866. towards the number of field-elements.
  867. Parameters
  868. ----------
  869. arr : ndarray
  870. Unstructured array or dtype to convert.
  871. dtype : dtype, optional
  872. The structured dtype of the output array
  873. names : list of strings, optional
  874. If dtype is not supplied, this specifies the field names for the output
  875. dtype, in order. The field dtypes will be the same as the input array.
  876. align : boolean, optional
  877. Whether to create an aligned memory layout.
  878. copy : bool, optional
  879. See copy argument to `ndarray.astype`. If true, always return a copy.
  880. If false, and `dtype` requirements are satisfied, a view is returned.
  881. casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
  882. See casting argument of `ndarray.astype`. Controls what kind of data
  883. casting may occur.
  884. Returns
  885. -------
  886. structured : ndarray
  887. Structured array with fewer dimensions.
  888. Examples
  889. --------
  890. >>> from numpy.lib import recfunctions as rfn
  891. >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
  892. >>> a = np.arange(20).reshape((4,5))
  893. >>> a
  894. array([[ 0, 1, 2, 3, 4],
  895. [ 5, 6, 7, 8, 9],
  896. [10, 11, 12, 13, 14],
  897. [15, 16, 17, 18, 19]])
  898. >>> rfn.unstructured_to_structured(a, dt)
  899. array([( 0, ( 1., 2), [ 3., 4.]), ( 5, ( 6., 7), [ 8., 9.]),
  900. (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])],
  901. dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
  902. """
  903. if arr.shape == ():
  904. raise ValueError('arr must have at least one dimension')
  905. n_elem = arr.shape[-1]
  906. if n_elem == 0:
  907. # too many bugs elsewhere for this to work now
  908. raise NotImplementedError("last axis with size 0 is not supported")
  909. if dtype is None:
  910. if names is None:
  911. names = ['f{}'.format(n) for n in range(n_elem)]
  912. out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align)
  913. fields = _get_fields_and_offsets(out_dtype)
  914. dts, counts, offsets = zip(*fields)
  915. else:
  916. if names is not None:
  917. raise ValueError("don't supply both dtype and names")
  918. # sanity check of the input dtype
  919. fields = _get_fields_and_offsets(dtype)
  920. if len(fields) == 0:
  921. dts, counts, offsets = [], [], []
  922. else:
  923. dts, counts, offsets = zip(*fields)
  924. if n_elem != sum(counts):
  925. raise ValueError('The length of the last dimension of arr must '
  926. 'be equal to the number of fields in dtype')
  927. out_dtype = dtype
  928. if align and not out_dtype.isalignedstruct:
  929. raise ValueError("align was True but dtype is not aligned")
  930. names = ['f{}'.format(n) for n in range(len(fields))]
  931. # Use a series of views and casts to convert to a structured array:
  932. # first view as a packed structured array of one dtype
  933. packed_fields = np.dtype({'names': names,
  934. 'formats': [(arr.dtype, dt.shape) for dt in dts]})
  935. arr = np.ascontiguousarray(arr).view(packed_fields)
  936. # next cast to an unpacked but flattened format with varied dtypes
  937. flattened_fields = np.dtype({'names': names,
  938. 'formats': dts,
  939. 'offsets': offsets,
  940. 'itemsize': out_dtype.itemsize})
  941. arr = arr.astype(flattened_fields, copy=copy, casting=casting)
  942. # finally view as the final nested dtype and remove the last axis
  943. return arr.view(out_dtype)[..., 0]
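# A round-trip sketch: converting to unstructured and back with the original
# dtype recovers the original records (the dtype and values are illustrative):
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> dt = np.dtype([('x', 'i4'), ('y', 'f8')])
#     >>> a = np.array([(1, 2.5), (3, 4.5)], dtype=dt)
#     >>> u = rfn.structured_to_unstructured(a)
#     >>> rfn.unstructured_to_structured(u, dt)
#     array([(1, 2.5), (3, 4.5)], dtype=[('x', '<i4'), ('y', '<f8')])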
  944. def _apply_along_fields_dispatcher(func, arr):
  945. return (arr,)
  946. @array_function_dispatch(_apply_along_fields_dispatcher)
  947. def apply_along_fields(func, arr):
  948. """
  949. Apply function 'func' as a reduction across fields of a structured array.
  950. This is similar to `apply_along_axis`, but treats the fields of a
  951. structured array as an extra axis. The fields are all first cast to a
  952. common type following the type-promotion rules from `numpy.result_type`
  953. applied to the field's dtypes.
  954. Parameters
  955. ----------
  956. func : function
  957. Function to apply on the "field" dimension. This function must
  958. support an `axis` argument, like np.mean, np.sum, etc.
  959. arr : ndarray
  960. Structured array for which to apply func.
  961. Returns
  962. -------
  963. out : ndarray
  964. Result of the reduction operation
  965. Examples
  966. --------
  967. >>> from numpy.lib import recfunctions as rfn
  968. >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
  969. ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
  970. >>> rfn.apply_along_fields(np.mean, b)
  971. array([ 2.66666667, 5.33333333, 8.66666667, 11. ])
  972. >>> rfn.apply_along_fields(np.mean, b[['x', 'z']])
  973. array([ 3. , 5.5, 9. , 11. ])
  974. """
  975. if arr.dtype.names is None:
  976. raise ValueError('arr must be a structured array')
  977. uarr = structured_to_unstructured(arr)
  978. return func(uarr, axis=-1)
  979. # works and avoids axis requirement, but very, very slow:
  980. #return np.apply_along_axis(func, -1, uarr)
  981. def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None):
  982. return dst, src
  983. @array_function_dispatch(_assign_fields_by_name_dispatcher)
  984. def assign_fields_by_name(dst, src, zero_unassigned=True):
  985. """
  986. Assigns values from one structured array to another by field name.
  987. Normally in numpy >= 1.14, assignment of one structured array to another
  988. copies fields "by position", meaning that the first field from the src is
  989. copied to the first field of the dst, and so on, regardless of field name.
  990. This function instead copies "by field name", such that fields in the dst
  991. are assigned from the identically named field in the src. This applies
  992. recursively for nested structures. This is how structure assignment worked
  993. in numpy 1.6 through 1.13.
  994. Parameters
  995. ----------
  996. dst : ndarray
  997. src : ndarray
  998. The source and destination arrays during assignment.
  999. zero_unassigned : bool, optional
  1000. If True, fields in the dst for which there was no matching
  1001. field in the src are filled with the value 0 (zero). This
  1002. was the behavior of numpy <= 1.13. If False, those fields
  1003. are not modified.
  1004. """
  1005. if dst.dtype.names is None:
  1006. dst[...] = src
  1007. return
  1008. for name in dst.dtype.names:
  1009. if name not in src.dtype.names:
  1010. if zero_unassigned:
  1011. dst[name] = 0
  1012. else:
  1013. assign_fields_by_name(dst[name], src[name],
  1014. zero_unassigned)
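# A usage sketch: fields are matched by name, and a destination field with no
# counterpart in the source is zeroed by default (field names 'a', 'b' and 'c'
# are illustrative):
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> src = np.array([(1, 2.0)], dtype=[('a', 'i4'), ('b', 'f8')])
#     >>> dst = np.ones(1, dtype=[('b', 'f8'), ('c', 'i4')])
#     >>> rfn.assign_fields_by_name(dst, src)
#     >>> dst
#     array([(2., 0)], dtype=[('b', '<f8'), ('c', '<i4')])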
  1015. def _require_fields_dispatcher(array, required_dtype):
  1016. return (array,)
  1017. @array_function_dispatch(_require_fields_dispatcher)
  1018. def require_fields(array, required_dtype):
  1019. """
  1020. Casts a structured array to a new dtype using assignment by field-name.
  1021. This function assigns from the old to the new array by name, so the
  1022. value of a field in the output array is the value of the field with the
  1023. same name in the source array. This has the effect of creating a new
  1024. ndarray containing only the fields "required" by the required_dtype.
  1025. If a field name in the required_dtype does not exist in the
  1026. input array, that field is created and set to 0 in the output array.
  1027. Parameters
  1028. ----------
  1029. array : ndarray
  1030. array to cast
  1031. required_dtype : dtype
  1032. datatype for output array
  1033. Returns
  1034. -------
  1035. out : ndarray
  1036. array with the new dtype, with field values copied from the fields in
  1037. the input array with the same name
  1038. Examples
  1039. --------
  1040. >>> from numpy.lib import recfunctions as rfn
  1041. >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
  1042. >>> rfn.require_fields(a, [('b', 'f4'), ('c', 'u1')])
  1043. array([(1., 1), (1., 1), (1., 1), (1., 1)],
  1044. dtype=[('b', '<f4'), ('c', 'u1')])
  1045. >>> rfn.require_fields(a, [('b', 'f4'), ('newf', 'u1')])
  1046. array([(1., 0), (1., 0), (1., 0), (1., 0)],
  1047. dtype=[('b', '<f4'), ('newf', 'u1')])
  1048. """
  1049. out = np.empty(array.shape, dtype=required_dtype)
  1050. assign_fields_by_name(out, array)
  1051. return out
  1052. def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None,
  1053. asrecarray=None, autoconvert=None):
  1054. return arrays
  1055. @array_function_dispatch(_stack_arrays_dispatcher)
  1056. def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
  1057. autoconvert=False):
  1058. """
  1059. Superposes arrays field by field.
  1060. Parameters
  1061. ----------
  1062. arrays : array or sequence
  1063. Sequence of input arrays.
  1064. defaults : dictionary, optional
  1065. Dictionary mapping field names to the corresponding default values.
  1066. usemask : {True, False}, optional
  1067. Whether to return a MaskedArray (or MaskedRecords if
  1068. `asrecarray==True`) or a ndarray.
  1069. asrecarray : {False, True}, optional
  1070. Whether to return a recarray (or MaskedRecords if `usemask==True`)
  1071. or just a flexible-type ndarray.
  1072. autoconvert : {False, True}, optional
  1073. Whether to automatically cast the type of the field to the maximum.
  1074. Examples
  1075. --------
  1076. >>> from numpy.lib import recfunctions as rfn
  1077. >>> x = np.array([1, 2,])
  1078. >>> rfn.stack_arrays(x) is x
  1079. True
  1080. >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
  1081. >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
  1082. ... dtype=[('A', '|S3'), ('B', np.double), ('C', np.double)])
  1083. >>> test = rfn.stack_arrays((z,zz))
  1084. >>> test
  1085. masked_array(data=[(b'A', 1.0, --), (b'B', 2.0, --), (b'a', 10.0, 100.0),
  1086. (b'b', 20.0, 200.0), (b'c', 30.0, 300.0)],
  1087. mask=[(False, False, True), (False, False, True),
  1088. (False, False, False), (False, False, False),
  1089. (False, False, False)],
  1090. fill_value=(b'N/A', 1.e+20, 1.e+20),
  1091. dtype=[('A', 'S3'), ('B', '<f8'), ('C', '<f8')])
  1092. """
  1093. if isinstance(arrays, ndarray):
  1094. return arrays
  1095. elif len(arrays) == 1:
  1096. return arrays[0]
  1097. seqarrays = [np.asanyarray(a).ravel() for a in arrays]
  1098. nrecords = [len(a) for a in seqarrays]
  1099. ndtype = [a.dtype for a in seqarrays]
  1100. fldnames = [d.names for d in ndtype]
  1101. #
  1102. dtype_l = ndtype[0]
  1103. newdescr = _get_fieldspec(dtype_l)
  1104. names = [n for n, d in newdescr]
  1105. for dtype_n in ndtype[1:]:
  1106. for fname, fdtype in _get_fieldspec(dtype_n):
  1107. if fname not in names:
  1108. newdescr.append((fname, fdtype))
  1109. names.append(fname)
  1110. else:
  1111. nameidx = names.index(fname)
  1112. _, cdtype = newdescr[nameidx]
  1113. if autoconvert:
  1114. newdescr[nameidx] = (fname, max(fdtype, cdtype))
  1115. elif fdtype != cdtype:
  1116. raise TypeError("Incompatible type '%s' <> '%s'" %
  1117. (cdtype, fdtype))
  1118. # Only one field: use concatenate
  1119. if len(newdescr) == 1:
  1120. output = ma.concatenate(seqarrays)
  1121. else:
  1122. #
  1123. output = ma.masked_all((np.sum(nrecords),), newdescr)
  1124. offset = np.cumsum(np.r_[0, nrecords])
  1125. seen = []
  1126. for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
  1127. names = a.dtype.names
  1128. if names is None:
  1129. output['f%i' % len(seen)][i:j] = a
  1130. else:
  1131. for name in n:
  1132. output[name][i:j] = a[name]
  1133. if name not in seen:
  1134. seen.append(name)
  1135. #
  1136. return _fix_output(_fix_defaults(output, defaults),
  1137. usemask=usemask, asrecarray=asrecarray)
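# A sketch of ``autoconvert=True``: fields that share a name but differ in
# dtype are widened to the larger type instead of raising a TypeError (field
# names 'A' and 'B' are illustrative):
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> x = np.array([(1, 1)], dtype=[('A', 'i4'), ('B', 'i4')])
#     >>> y = np.array([(2.5, 2)], dtype=[('A', 'f8'), ('B', 'i4')])
#     >>> rfn.stack_arrays((x, y), autoconvert=True, usemask=False)
#     array([(1. , 1), (2.5, 2)], dtype=[('A', '<f8'), ('B', '<i4')])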
  1138. def _find_duplicates_dispatcher(
  1139. a, key=None, ignoremask=None, return_index=None):
  1140. return (a,)
  1141. @array_function_dispatch(_find_duplicates_dispatcher)
  1142. def find_duplicates(a, key=None, ignoremask=True, return_index=False):
  1143. """
  1144. Find the duplicates in a structured array along a given key
  1145. Parameters
  1146. ----------
  1147. a : array-like
  1148. Input array
  1149. key : {string, None}, optional
  1150. Name of the fields along which to check the duplicates.
  1151. If None, the search is performed by records
  1152. ignoremask : {True, False}, optional
  1153. Whether masked data should be discarded or considered as duplicates.
  1154. return_index : {False, True}, optional
  1155. Whether to return the indices of the duplicated values.
  1156. Examples
  1157. --------
  1158. >>> from numpy.lib import recfunctions as rfn
  1159. >>> ndtype = [('a', int)]
  1160. >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
  1161. ... mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
  1162. >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
  1163. (masked_array(data=[(1,), (1,), (2,), (2,)],
  1164. mask=[(False,), (False,), (False,), (False,)],
  1165. fill_value=(999999,),
  1166. dtype=[('a', '<i8')]), array([0, 1, 3, 4]))
  1167. """
  1168. a = np.asanyarray(a).ravel()
  1169. # Get a dictionary of fields
  1170. fields = get_fieldstructure(a.dtype)
  1171. # Get the sorting data (by selecting the corresponding field)
  1172. base = a
  1173. if key:
  1174. for f in fields[key]:
  1175. base = base[f]
  1176. base = base[key]
  1177. # Get the sorting indices and the sorted data
  1178. sortidx = base.argsort()
  1179. sortedbase = base[sortidx]
  1180. sorteddata = sortedbase.filled()
  1181. # Compare the sorting data
  1182. flag = (sorteddata[:-1] == sorteddata[1:])
  1183. # If masked data must be ignored, set the flag to false where needed
  1184. if ignoremask:
  1185. sortedmask = sortedbase.recordmask
  1186. flag[sortedmask[1:]] = False
  1187. flag = np.concatenate(([False], flag))
  1188. # We need to take the point on the left as well (else we're missing it)
  1189. flag[:-1] = flag[:-1] + flag[1:]
  1190. duplicates = a[sortidx][flag]
  1191. if return_index:
  1192. return (duplicates, sortidx[flag])
  1193. else:
  1194. return duplicates
  1195. def _join_by_dispatcher(
  1196. key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
  1197. defaults=None, usemask=None, asrecarray=None):
  1198. return (r1, r2)
  1199. @array_function_dispatch(_join_by_dispatcher)
  1200. def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
  1201. defaults=None, usemask=True, asrecarray=False):
  1202. """
  1203. Join arrays `r1` and `r2` on key `key`.
  1204. The key should be either a string or a sequence of strings corresponding
  1205. to the fields used to join the arrays. An exception is raised if the
  1206. `key` field cannot be found in the two input arrays. Neither `r1` nor
  1207. `r2` should have any duplicates along `key`: the presence of duplicates
  1208. will make the output quite unreliable. Note that duplicates are not
  1209. looked for by the algorithm.
  1210. Parameters
  1211. ----------
  1212. key : {string, sequence}
  1213. A string or a sequence of strings corresponding to the fields used
  1214. for comparison.
  1215. r1, r2 : arrays
  1216. Structured arrays.
  1217. jointype : {'inner', 'outer', 'leftouter'}, optional
  1218. If 'inner', returns the elements common to both r1 and r2.
  1219. If 'outer', returns the common elements as well as the elements of
  1220. r1 not in r2 and the elements of r2 not in r1.
  1221. If 'leftouter', returns the common elements and the elements of r1
  1222. not in r2.
  1223. r1postfix : string, optional
  1224. String appended to the names of the fields of r1 that are present
  1225. in r2 but absent from the key.
  1226. r2postfix : string, optional
  1227. String appended to the names of the fields of r2 that are present
  1228. in r1 but absent from the key.
  1229. defaults : {dictionary}, optional
  1230. Dictionary mapping field names to the corresponding default values.
  1231. usemask : {True, False}, optional
  1232. Whether to return a MaskedArray (or MaskedRecords if
  1233. `asrecarray==True`) or a ndarray.
  1234. asrecarray : {False, True}, optional
  1235. Whether to return a recarray (or MaskedRecords if `usemask==True`)
  1236. or just a flexible-type ndarray.
  1237. Notes
  1238. -----
  1239. * The output is sorted along the key.
  1240. * A temporary array is formed by dropping the fields not in the key for
  1241. the two arrays and concatenating the result. This array is then
  1242. sorted, and the common entries selected. The output is constructed by
  1243. filling the fields with the selected entries. Matching is not
  1244. preserved if there are some duplicates...
  1245. """
  1246. # Check jointype
  1247. if jointype not in ('inner', 'outer', 'leftouter'):
  1248. raise ValueError(
  1249. "The 'jointype' argument should be in 'inner', "
  1250. "'outer' or 'leftouter' (got '%s' instead)" % jointype
  1251. )
  1252. # If we have a single key, put it in a tuple
  1253. if isinstance(key, basestring):
  1254. key = (key,)
  1255. # Check the keys
  1256. if len(set(key)) != len(key):
  1257. dup = next(x for n,x in enumerate(key) if x in key[n+1:])
  1258. raise ValueError("duplicate join key %r" % dup)
  1259. for name in key:
  1260. if name not in r1.dtype.names:
  1261. raise ValueError('r1 does not have key field %r' % name)
  1262. if name not in r2.dtype.names:
  1263. raise ValueError('r2 does not have key field %r' % name)
  1264. # Make sure we work with ravelled arrays
  1265. r1 = r1.ravel()
  1266. r2 = r2.ravel()
  1267. # Fixme: nb2 below is never used. Commenting out for pyflakes.
  1268. # (nb1, nb2) = (len(r1), len(r2))
  1269. nb1 = len(r1)
  1270. (r1names, r2names) = (r1.dtype.names, r2.dtype.names)
  1271. # Check the names for collision
  1272. collisions = (set(r1names) & set(r2names)) - set(key)
  1273. if collisions and not (r1postfix or r2postfix):
  1274. msg = "r1 and r2 contain common names, r1postfix and r2postfix "
  1275. msg += "can't both be empty"
  1276. raise ValueError(msg)
  1277. # Make temporary arrays of just the keys
  1278. # (use order of keys in `r1` for back-compatibility)
  1279. key1 = [ n for n in r1names if n in key ]
  1280. r1k = _keep_fields(r1, key1)
  1281. r2k = _keep_fields(r2, key1)
  1282. # Concatenate the two arrays for comparison
  1283. aux = ma.concatenate((r1k, r2k))
  1284. idx_sort = aux.argsort(order=key)
  1285. aux = aux[idx_sort]
  1286. #
  1287. # Get the common keys
  1288. flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
  1289. flag_in[:-1] = flag_in[1:] + flag_in[:-1]
  1290. idx_in = idx_sort[flag_in]
  1291. idx_1 = idx_in[(idx_in < nb1)]
  1292. idx_2 = idx_in[(idx_in >= nb1)] - nb1
  1293. (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
  1294. if jointype == 'inner':
  1295. (r1spc, r2spc) = (0, 0)
  1296. elif jointype == 'outer':
  1297. idx_out = idx_sort[~flag_in]
  1298. idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
  1299. idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
  1300. (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
  1301. elif jointype == 'leftouter':
  1302. idx_out = idx_sort[~flag_in]
  1303. idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
  1304. (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
  1305. # Select the entries from each input
  1306. (s1, s2) = (r1[idx_1], r2[idx_2])
  1307. #
  1308. # Build the new description of the output array .......
  1309. # Start with the key fields
  1310. ndtype = _get_fieldspec(r1k.dtype)
  1311. # Add the fields from r1
  1312. for fname, fdtype in _get_fieldspec(r1.dtype):
  1313. if fname not in key:
  1314. ndtype.append((fname, fdtype))
  1315. # Add the fields from r2
  1316. for fname, fdtype in _get_fieldspec(r2.dtype):
  1317. # Have we seen the current name already ?
  1318. # we need to rebuild this list every time
  1319. names = list(name for name, dtype in ndtype)
  1320. try:
  1321. nameidx = names.index(fname)
  1322. except ValueError:
  1323. #... we haven't: just add the description to the current list
  1324. ndtype.append((fname, fdtype))
  1325. else:
  1326. # collision
  1327. _, cdtype = ndtype[nameidx]
  1328. if fname in key:
  1329. # The current field is part of the key: take the largest dtype
  1330. ndtype[nameidx] = (fname, max(fdtype, cdtype))
  1331. else:
  1332. # The current field is not part of the key: add the suffixes,
  1333. # and place the new field adjacent to the old one
  1334. ndtype[nameidx:nameidx + 1] = [
  1335. (fname + r1postfix, cdtype),
  1336. (fname + r2postfix, fdtype)
  1337. ]
  1338. # Rebuild a dtype from the new fields
  1339. ndtype = np.dtype(ndtype)
  1340. # Find the largest nb of common fields :
  1341. # r1cmn and r2cmn should be equal, but...
  1342. cmn = max(r1cmn, r2cmn)
  1343. # Construct an empty array
  1344. output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
  1345. names = output.dtype.names
  1346. for f in r1names:
  1347. selected = s1[f]
  1348. if f not in names or (f in r2names and not r2postfix and f not in key):
  1349. f += r1postfix
  1350. current = output[f]
  1351. current[:r1cmn] = selected[:r1cmn]
  1352. if jointype in ('outer', 'leftouter'):
  1353. current[cmn:cmn + r1spc] = selected[r1cmn:]
  1354. for f in r2names:
  1355. selected = s2[f]
  1356. if f not in names or (f in r1names and not r1postfix and f not in key):
  1357. f += r2postfix
  1358. current = output[f]
  1359. current[:r2cmn] = selected[:r2cmn]
  1360. if (jointype == 'outer') and r2spc:
  1361. current[-r2spc:] = selected[r2cmn:]
  1362. # Sort and finalize the output
  1363. output.sort(order=key)
  1364. kwargs = dict(usemask=usemask, asrecarray=asrecarray)
  1365. return _fix_output(_fix_defaults(output, defaults), **kwargs)
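# A usage sketch of an inner join on a shared key; the key name 'k' and the
# field names 'a' and 'b' are illustrative:
#
#     >>> import numpy as np
#     >>> from numpy.lib import recfunctions as rfn
#     >>> r1 = np.array([(1, 10.), (2, 20.)], dtype=[('k', 'i8'), ('a', 'f8')])
#     >>> r2 = np.array([(2, 200.), (3, 300.)], dtype=[('k', 'i8'), ('b', 'f8')])
#     >>> rfn.join_by('k', r1, r2, jointype='inner', usemask=False)
#     array([(2, 20., 200.)], dtype=[('k', '<i8'), ('a', '<f8'), ('b', '<f8')])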
  1366. def _rec_join_dispatcher(
  1367. key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
  1368. defaults=None):
  1369. return (r1, r2)
  1370. @array_function_dispatch(_rec_join_dispatcher)
  1371. def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
  1372. defaults=None):
  1373. """
  1374. Join arrays `r1` and `r2` on keys.
  1375. Alternative to join_by that always returns a np.recarray.
  1376. See Also
  1377. --------
  1378. join_by : equivalent function
  1379. """
  1380. kwargs = dict(jointype=jointype, r1postfix=r1postfix, r2postfix=r2postfix,
  1381. defaults=defaults, usemask=False, asrecarray=True)
  1382. return join_by(key, r1, r2, **kwargs)
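# rec_join wraps join_by with ``usemask=False, asrecarray=True``, so joined
# fields are attribute-accessible (a sketch reusing the illustrative r1/r2
# arrays from the join_by sketch above):
#
#     >>> rfn.rec_join('k', r1, r2).b
#     array([200.])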