# test_normalize.py -- tests for pandas json_normalize / nested_to_record
  1. import json
  2. import numpy as np
  3. import pytest
  4. from pandas import DataFrame, Index, compat
  5. import pandas.util.testing as tm
  6. from pandas.io.json import json_normalize
  7. from pandas.io.json.normalize import nested_to_record
  8. @pytest.fixture
  9. def deep_nested():
  10. # deeply nested data
  11. return [{'country': 'USA',
  12. 'states': [{'name': 'California',
  13. 'cities': [{'name': 'San Francisco',
  14. 'pop': 12345},
  15. {'name': 'Los Angeles',
  16. 'pop': 12346}]
  17. },
  18. {'name': 'Ohio',
  19. 'cities': [{'name': 'Columbus',
  20. 'pop': 1234},
  21. {'name': 'Cleveland',
  22. 'pop': 1236}]}
  23. ]
  24. },
  25. {'country': 'Germany',
  26. 'states': [{'name': 'Bayern',
  27. 'cities': [{'name': 'Munich', 'pop': 12347}]
  28. },
  29. {'name': 'Nordrhein-Westfalen',
  30. 'cities': [{'name': 'Duesseldorf', 'pop': 1238},
  31. {'name': 'Koeln', 'pop': 1239}]}
  32. ]
  33. }
  34. ]
  35. @pytest.fixture
  36. def state_data():
  37. return [
  38. {'counties': [{'name': 'Dade', 'population': 12345},
  39. {'name': 'Broward', 'population': 40000},
  40. {'name': 'Palm Beach', 'population': 60000}],
  41. 'info': {'governor': 'Rick Scott'},
  42. 'shortname': 'FL',
  43. 'state': 'Florida'},
  44. {'counties': [{'name': 'Summit', 'population': 1234},
  45. {'name': 'Cuyahoga', 'population': 1337}],
  46. 'info': {'governor': 'John Kasich'},
  47. 'shortname': 'OH',
  48. 'state': 'Ohio'}]
  49. @pytest.fixture
  50. def author_missing_data():
  51. return [
  52. {'info': None},
  53. {'info':
  54. {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
  55. 'author_name':
  56. {'first': 'Jane', 'last_name': 'Doe'}
  57. }]
  58. class TestJSONNormalize(object):
  59. def test_simple_records(self):
  60. recs = [{'a': 1, 'b': 2, 'c': 3},
  61. {'a': 4, 'b': 5, 'c': 6},
  62. {'a': 7, 'b': 8, 'c': 9},
  63. {'a': 10, 'b': 11, 'c': 12}]
  64. result = json_normalize(recs)
  65. expected = DataFrame(recs)
  66. tm.assert_frame_equal(result, expected)
  67. def test_simple_normalize(self, state_data):
  68. result = json_normalize(state_data[0], 'counties')
  69. expected = DataFrame(state_data[0]['counties'])
  70. tm.assert_frame_equal(result, expected)
  71. result = json_normalize(state_data, 'counties')
  72. expected = []
  73. for rec in state_data:
  74. expected.extend(rec['counties'])
  75. expected = DataFrame(expected)
  76. tm.assert_frame_equal(result, expected)
  77. result = json_normalize(state_data, 'counties', meta='state')
  78. expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
  79. tm.assert_frame_equal(result, expected)
  80. def test_empty_array(self):
  81. result = json_normalize([])
  82. expected = DataFrame()
  83. tm.assert_frame_equal(result, expected)
  84. def test_simple_normalize_with_separator(self, deep_nested):
  85. # GH 14883
  86. result = json_normalize({'A': {'A': 1, 'B': 2}})
  87. expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
  88. tm.assert_frame_equal(result.reindex_like(expected), expected)
  89. result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
  90. expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
  91. tm.assert_frame_equal(result.reindex_like(expected), expected)
  92. result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
  93. expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
  94. tm.assert_frame_equal(result.reindex_like(expected), expected)
  95. result = json_normalize(deep_nested, ['states', 'cities'],
  96. meta=['country', ['states', 'name']],
  97. sep='_')
  98. expected = Index(['name', 'pop',
  99. 'country', 'states_name']).sort_values()
  100. assert result.columns.sort_values().equals(expected)
  101. def test_value_array_record_prefix(self):
  102. # GH 21536
  103. result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
  104. expected = DataFrame([[1], [2]], columns=['Prefix.0'])
  105. tm.assert_frame_equal(result, expected)
  106. def test_nested_object_record_path(self):
  107. # GH 22706
  108. data = {'state': 'Florida',
  109. 'info': {
  110. 'governor': 'Rick Scott',
  111. 'counties': [{'name': 'Dade', 'population': 12345},
  112. {'name': 'Broward', 'population': 40000},
  113. {'name': 'Palm Beach', 'population': 60000}]}}
  114. result = json_normalize(data, record_path=["info", "counties"])
  115. expected = DataFrame([['Dade', 12345],
  116. ['Broward', 40000],
  117. ['Palm Beach', 60000]],
  118. columns=['name', 'population'])
  119. tm.assert_frame_equal(result, expected)
  120. def test_more_deeply_nested(self, deep_nested):
  121. result = json_normalize(deep_nested, ['states', 'cities'],
  122. meta=['country', ['states', 'name']])
  123. # meta_prefix={'states': 'state_'})
  124. ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
  125. 'states.name': ['California', 'California', 'Ohio', 'Ohio',
  126. 'Bayern', 'Nordrhein-Westfalen',
  127. 'Nordrhein-Westfalen'],
  128. 'name': ['San Francisco', 'Los Angeles', 'Columbus',
  129. 'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
  130. 'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
  131. expected = DataFrame(ex_data, columns=result.columns)
  132. tm.assert_frame_equal(result, expected)
  133. def test_shallow_nested(self):
  134. data = [{'state': 'Florida',
  135. 'shortname': 'FL',
  136. 'info': {
  137. 'governor': 'Rick Scott'
  138. },
  139. 'counties': [{'name': 'Dade', 'population': 12345},
  140. {'name': 'Broward', 'population': 40000},
  141. {'name': 'Palm Beach', 'population': 60000}]},
  142. {'state': 'Ohio',
  143. 'shortname': 'OH',
  144. 'info': {
  145. 'governor': 'John Kasich'
  146. },
  147. 'counties': [{'name': 'Summit', 'population': 1234},
  148. {'name': 'Cuyahoga', 'population': 1337}]}]
  149. result = json_normalize(data, 'counties',
  150. ['state', 'shortname',
  151. ['info', 'governor']])
  152. ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
  153. 'Cuyahoga'],
  154. 'state': ['Florida'] * 3 + ['Ohio'] * 2,
  155. 'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
  156. 'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
  157. 'population': [12345, 40000, 60000, 1234, 1337]}
  158. expected = DataFrame(ex_data, columns=result.columns)
  159. tm.assert_frame_equal(result, expected)
  160. def test_meta_name_conflict(self):
  161. data = [{'foo': 'hello',
  162. 'bar': 'there',
  163. 'data': [{'foo': 'something', 'bar': 'else'},
  164. {'foo': 'something2', 'bar': 'else2'}]}]
  165. msg = (r"Conflicting metadata name (foo|bar),"
  166. " need distinguishing prefix")
  167. with pytest.raises(ValueError, match=msg):
  168. json_normalize(data, 'data', meta=['foo', 'bar'])
  169. result = json_normalize(data, 'data', meta=['foo', 'bar'],
  170. meta_prefix='meta')
  171. for val in ['metafoo', 'metabar', 'foo', 'bar']:
  172. assert val in result
  173. def test_meta_parameter_not_modified(self):
  174. # GH 18610
  175. data = [{'foo': 'hello',
  176. 'bar': 'there',
  177. 'data': [{'foo': 'something', 'bar': 'else'},
  178. {'foo': 'something2', 'bar': 'else2'}]}]
  179. COLUMNS = ['foo', 'bar']
  180. result = json_normalize(data, 'data', meta=COLUMNS,
  181. meta_prefix='meta')
  182. assert COLUMNS == ['foo', 'bar']
  183. for val in ['metafoo', 'metabar', 'foo', 'bar']:
  184. assert val in result
  185. def test_record_prefix(self, state_data):
  186. result = json_normalize(state_data[0], 'counties')
  187. expected = DataFrame(state_data[0]['counties'])
  188. tm.assert_frame_equal(result, expected)
  189. result = json_normalize(state_data, 'counties',
  190. meta='state',
  191. record_prefix='county_')
  192. expected = []
  193. for rec in state_data:
  194. expected.extend(rec['counties'])
  195. expected = DataFrame(expected)
  196. expected = expected.rename(columns=lambda x: 'county_' + x)
  197. expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
  198. tm.assert_frame_equal(result, expected)
  199. def test_non_ascii_key(self):
  200. if compat.PY3:
  201. testjson = (
  202. b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
  203. b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
  204. ).decode('utf8')
  205. else:
  206. testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
  207. '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
  208. testdata = {
  209. u'sub.A': [1, 3],
  210. u'sub.B': [2, 4],
  211. b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
  212. }
  213. expected = DataFrame(testdata)
  214. result = json_normalize(json.loads(testjson))
  215. tm.assert_frame_equal(result, expected)
  216. def test_missing_field(self, author_missing_data):
  217. # GH20030:
  218. result = json_normalize(author_missing_data)
  219. ex_data = [
  220. {'info': np.nan,
  221. 'author_name.first': np.nan,
  222. 'author_name.last_name': np.nan,
  223. 'info.created_at': np.nan,
  224. 'info.last_updated': np.nan},
  225. {'info': None,
  226. 'author_name.first': 'Jane',
  227. 'author_name.last_name': 'Doe',
  228. 'info.created_at': '11/08/1993',
  229. 'info.last_updated': '26/05/2012'}
  230. ]
  231. expected = DataFrame(ex_data)
  232. tm.assert_frame_equal(result, expected)
  233. class TestNestedToRecord(object):
  234. def test_flat_stays_flat(self):
  235. recs = [dict(flat1=1, flat2=2),
  236. dict(flat1=3, flat2=4),
  237. ]
  238. result = nested_to_record(recs)
  239. expected = recs
  240. assert result == expected
  241. def test_one_level_deep_flattens(self):
  242. data = dict(flat1=1,
  243. dict1=dict(c=1, d=2))
  244. result = nested_to_record(data)
  245. expected = {'dict1.c': 1,
  246. 'dict1.d': 2,
  247. 'flat1': 1}
  248. assert result == expected
  249. def test_nested_flattens(self):
  250. data = dict(flat1=1,
  251. dict1=dict(c=1, d=2),
  252. nested=dict(e=dict(c=1, d=2),
  253. d=2))
  254. result = nested_to_record(data)
  255. expected = {'dict1.c': 1,
  256. 'dict1.d': 2,
  257. 'flat1': 1,
  258. 'nested.d': 2,
  259. 'nested.e.c': 1,
  260. 'nested.e.d': 2}
  261. assert result == expected
  262. def test_json_normalize_errors(self):
  263. # GH14583: If meta keys are not always present
  264. # a new option to set errors='ignore' has been implemented
  265. i = {
  266. "Trades": [{
  267. "general": {
  268. "tradeid": 100,
  269. "trade_version": 1,
  270. "stocks": [{
  271. "symbol": "AAPL",
  272. "name": "Apple",
  273. "price": "0"
  274. }, {
  275. "symbol": "GOOG",
  276. "name": "Google",
  277. "price": "0"
  278. }
  279. ]
  280. }
  281. }, {
  282. "general": {
  283. "tradeid": 100,
  284. "stocks": [{
  285. "symbol": "AAPL",
  286. "name": "Apple",
  287. "price": "0"
  288. }, {
  289. "symbol": "GOOG",
  290. "name": "Google",
  291. "price": "0"
  292. }
  293. ]
  294. }
  295. }
  296. ]
  297. }
  298. j = json_normalize(data=i['Trades'],
  299. record_path=[['general', 'stocks']],
  300. meta=[['general', 'tradeid'],
  301. ['general', 'trade_version']],
  302. errors='ignore')
  303. expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
  304. 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
  305. 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
  306. 'price': {0: '0', 1: '0', 2: '0', 3: '0'},
  307. 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
  308. assert j.fillna('').to_dict() == expected
  309. msg = ("Try running with errors='ignore' as key 'trade_version'"
  310. " is not always present")
  311. with pytest.raises(KeyError, match=msg):
  312. json_normalize(
  313. data=i['Trades'],
  314. record_path=[['general', 'stocks']],
  315. meta=[['general', 'tradeid'],
  316. ['general', 'trade_version']],
  317. errors='raise')
  318. def test_donot_drop_nonevalues(self):
  319. # GH21356
  320. data = [
  321. {'info': None,
  322. 'author_name':
  323. {'first': 'Smith', 'last_name': 'Appleseed'}
  324. },
  325. {'info':
  326. {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
  327. 'author_name':
  328. {'first': 'Jane', 'last_name': 'Doe'}
  329. }
  330. ]
  331. result = nested_to_record(data)
  332. expected = [
  333. {'info': None,
  334. 'author_name.first': 'Smith',
  335. 'author_name.last_name': 'Appleseed'},
  336. {'author_name.first': 'Jane',
  337. 'author_name.last_name': 'Doe',
  338. 'info.created_at': '11/08/1993',
  339. 'info.last_updated': '26/05/2012'}]
  340. assert result == expected
  341. def test_nonetype_top_level_bottom_level(self):
  342. # GH21158: If inner level json has a key with a null value
  343. # make sure it doesnt do a new_d.pop twice and except
  344. data = {
  345. "id": None,
  346. "location": {
  347. "country": {
  348. "state": {
  349. "id": None,
  350. "town.info": {
  351. "id": None,
  352. "region": None,
  353. "x": 49.151580810546875,
  354. "y": -33.148521423339844,
  355. "z": 27.572303771972656}}}
  356. }
  357. }
  358. result = nested_to_record(data)
  359. expected = {
  360. 'id': None,
  361. 'location.country.state.id': None,
  362. 'location.country.state.town.info.id': None,
  363. 'location.country.state.town.info.region': None,
  364. 'location.country.state.town.info.x': 49.151580810546875,
  365. 'location.country.state.town.info.y': -33.148521423339844,
  366. 'location.country.state.town.info.z': 27.572303771972656}
  367. assert result == expected
  368. def test_nonetype_multiple_levels(self):
  369. # GH21158: If inner level json has a key with a null value
  370. # make sure it doesnt do a new_d.pop twice and except
  371. data = {
  372. "id": None,
  373. "location": {
  374. "id": None,
  375. "country": {
  376. "id": None,
  377. "state": {
  378. "id": None,
  379. "town.info": {
  380. "region": None,
  381. "x": 49.151580810546875,
  382. "y": -33.148521423339844,
  383. "z": 27.572303771972656}}}
  384. }
  385. }
  386. result = nested_to_record(data)
  387. expected = {
  388. 'id': None,
  389. 'location.id': None,
  390. 'location.country.id': None,
  391. 'location.country.state.id': None,
  392. 'location.country.state.town.info.region': None,
  393. 'location.country.state.town.info.x': 49.151580810546875,
  394. 'location.country.state.town.info.y': -33.148521423339844,
  395. 'location.country.state.town.info.z': 27.572303771972656}
  396. assert result == expected