# ---------------------------------------------------------------------
# JSON normalization routines

from collections import defaultdict
import copy

import numpy as np

from pandas._libs.writers import convert_json_to_lines
from pandas import DataFrame, compat


def _convert_to_line_delimits(s):
    """
    Helper function that converts JSON lists to line-delimited JSON.
    """
    # Determine whether we have a JSON list to turn into lines; otherwise
    # just return the JSON object unchanged, since only lists can be
    # converted.
    if not (s[0] == '[' and s[-1] == ']'):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
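
# Illustrative sketch (not part of the original module): given a JSON array
# string such as '[{"a": 1}, {"a": 2}]', the outer brackets are stripped and
# convert_json_to_lines splits the top-level records onto separate lines,
# yielding roughly '{"a": 1}\n{"a": 2}'. Anything that is not a bracketed
# list, e.g. '{"a": 1}', is returned unchanged.
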
def nested_to_record(ds, prefix="", sep=".", level=0):
    """
    A simplified json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : string, optional, default: ""
        The prefix prepended to flattened key names.
    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar

        .. versionadded:: 0.20.0

    level : int, optional, default: 0
        The current level of nesting in the JSON structure
        (used internally during recursion).

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Examples
    --------
    IN[52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
                                  nested=dict(e=dict(c=1, d=2), d=2)))
    Out[52]:
    {'dict1.c': 1,
     'dict1.d': 2,
     'flat1': 1,
     'nested.d': 2,
     'nested.e.c': 1,
     'nested.e.d': 2}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True

    new_ds = []
    for d in ds:

        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, compat.string_types):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # only dicts get recurse-flattened
            # only at level > 0 do we rename the rest of the keys
            if not isinstance(v, dict):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue
            else:
                v = new_d.pop(k)
                new_d.update(nested_to_record(v, newkey, sep, level + 1))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
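
# Illustrative sketch (not part of the original module): the separator is
# applied at every nesting level, so a custom sep changes the flattened key
# names, e.g.
#
#   nested_to_record({'flat1': 1, 'dict1': {'c': 1, 'd': 2}}, sep='_')
#   -> {'flat1': 1, 'dict1_c': 1, 'dict1_d': 2}
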
def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
                   sep='.'):
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table
    meta_prefix : string, default None
        String to prepend to the names of the metadata columns.
    record_prefix : string, default None
        String to prepend to the record column names, e.g. with
        record_prefix='foo.bar.' a record field 'field' becomes the
        column 'foo.bar.field'
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar

        .. versionadded:: 0.20.0

    Returns
    -------
    frame : DataFrame

    Examples
    --------
    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                       {'name': 'Broward', 'population': 40000},
    ...                       {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                            ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
       Prefix.0
    0         1
    1         2
    """
    def _pull_field(js, spec):
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result
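
    # _pull_field walks a path into one JSON object. Illustrative example:
    # _pull_field({'info': {'governor': 'Rick Scott'}}, ['info', 'governor'])
    # returns 'Rick Scott', while a plain string spec is a single key lookup.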

    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        if any(any(isinstance(x, dict)
                   for x in compat.itervalues(y)) for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record values which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    if not isinstance(sep, compat.string_types):
        sep = str(sep)
    meta_keys = [sep.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:],
                                   seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                raise KeyError("Try running with "
                                               "errors='ignore' as key "
                                               "{err} is not always present"
                                               .format(err=e))
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(
            columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name {name}, '
                             'need distinguishing prefix'.format(name=k))

        result[k] = np.array(v).repeat(lengths)

    return result
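

# Illustrative usage sketch (not part of the original module); the demo data
# and expected columns below are assumptions made for demonstration only.
# It exercises the meta_prefix and errors options, which the docstring
# examples above do not cover.
if __name__ == "__main__":
    demo = [{'state': 'Ohio', 'governor': 'John Kasich',
             'counties': [{'name': 'Summit', 'population': 1234}]},
            {'state': 'Texas',  # note: no 'governor' key in this record
             'counties': [{'name': 'Travis', 'population': 5678}]}]
    # errors='ignore' fills the missing 'governor' meta key with NaN instead
    # of raising KeyError, and meta_prefix='meta.' prepends 'meta.' to the
    # metadata columns, giving roughly:
    # name, population, meta.state, meta.governor
    frame = json_normalize(demo, 'counties', ['state', 'governor'],
                           meta_prefix='meta.', errors='ignore')
    print(frame)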