123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462 |
- import json
- import numpy as np
- import pytest
- from pandas import DataFrame, Index, compat
- import pandas.util.testing as tm
- from pandas.io.json import json_normalize
- from pandas.io.json.normalize import nested_to_record
@pytest.fixture
def deep_nested():
    """Return records nested three levels deep.

    Each record has a ``country`` and a list of ``states``; each state has a
    ``name`` and a list of ``cities``; each city has a ``name`` and a ``pop``.
    """
    return [
        {'country': 'USA',
         'states': [
             {'name': 'California',
              'cities': [{'name': 'San Francisco', 'pop': 12345},
                         {'name': 'Los Angeles', 'pop': 12346}]},
             {'name': 'Ohio',
              'cities': [{'name': 'Columbus', 'pop': 1234},
                         {'name': 'Cleveland', 'pop': 1236}]},
         ]},
        {'country': 'Germany',
         'states': [
             {'name': 'Bayern',
              'cities': [{'name': 'Munich', 'pop': 12347}]},
             {'name': 'Nordrhein-Westfalen',
              'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                         {'name': 'Koeln', 'pop': 1239}]},
         ]},
    ]
@pytest.fixture
def state_data():
    """Return two state records, each with a ``counties`` list.

    Every record also carries a nested ``info`` dict plus flat ``shortname``
    and ``state`` entries that the tests use as metadata.
    """
    return [
        {'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}],
         'info': {'governor': 'Rick Scott'},
         'shortname': 'FL',
         'state': 'Florida'},
        {'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}],
         'info': {'governor': 'John Kasich'},
         'shortname': 'OH',
         'state': 'Ohio'},
    ]
@pytest.fixture
def author_missing_data():
    """Return two records where the first lacks most fields.

    The first record only has ``info`` set to ``None``; the second has nested
    ``info`` and ``author_name`` dicts.
    """
    return [
        {'info': None},
        {'info': {'created_at': '11/08/1993',
                  'last_updated': '26/05/2012'},
         'author_name': {'first': 'Jane', 'last_name': 'Doe'}},
    ]
class TestJSONNormalize(object):
    """Tests for ``json_normalize``."""

    def test_simple_records(self):
        """Flat records give the same frame as ``DataFrame(recs)``."""
        recs = [{'a': 1, 'b': 2, 'c': 3},
                {'a': 4, 'b': 5, 'c': 6},
                {'a': 7, 'b': 8, 'c': 9},
                {'a': 10, 'b': 11, 'c': 12}]

        result = json_normalize(recs)
        expected = DataFrame(recs)

        tm.assert_frame_equal(result, expected)

    def test_simple_normalize(self, state_data):
        """A record path works on one record, many records, and with meta."""
        # single record
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)

        # list of records: the county rows are concatenated in order
        result = json_normalize(state_data, 'counties')
        county_rows = []
        for rec in state_data:
            county_rows.extend(rec['counties'])
        expected = DataFrame(county_rows)
        tm.assert_frame_equal(result, expected)

        # meta column: the state is repeated once per county (3 FL, 2 OH)
        result = json_normalize(state_data, 'counties', meta='state')
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
        tm.assert_frame_equal(result, expected)

    def test_empty_array(self):
        """An empty list normalizes to an empty ``DataFrame``."""
        result = json_normalize([])
        expected = DataFrame()
        tm.assert_frame_equal(result, expected)

    def test_simple_normalize_with_separator(self, deep_nested):
        """Nested keys are joined with ``sep`` (default ``'.'``)."""
        # GH 14883
        result = json_normalize({'A': {'A': 1, 'B': 2}})
        expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
        expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        # non-ASCII separator
        result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
        expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        # the separator is also used for nested meta paths
        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']],
                                sep='_')
        expected = Index(['name', 'pop',
                          'country', 'states_name']).sort_values()
        assert result.columns.sort_values().equals(expected)

    def test_value_array_record_prefix(self):
        """``record_prefix`` is applied when the records are plain values."""
        # GH 21536
        result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
        expected = DataFrame([[1], [2]], columns=['Prefix.0'])
        tm.assert_frame_equal(result, expected)

    def test_nested_object_record_path(self):
        """A record path may traverse a nested dict to reach the records."""
        # GH 22706
        data = {'state': 'Florida',
                'info': {
                    'governor': 'Rick Scott',
                    'counties': [{'name': 'Dade', 'population': 12345},
                                 {'name': 'Broward', 'population': 40000},
                                 {'name': 'Palm Beach', 'population': 60000}]}}

        result = json_normalize(data, record_path=["info", "counties"])
        expected = DataFrame([['Dade', 12345],
                              ['Broward', 40000],
                              ['Palm Beach', 60000]],
                             columns=['name', 'population'])
        tm.assert_frame_equal(result, expected)

    def test_more_deeply_nested(self, deep_nested):
        """A two-level record path with meta taken from both outer levels."""
        result = json_normalize(deep_nested, ['states', 'cities'],
                                meta=['country', ['states', 'name']])

        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
                                   'Bayern', 'Nordrhein-Westfalen',
                                   'Nordrhein-Westfalen'],
                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
                            'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
                   'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}

        # column order is taken from the result; only the values are compared
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_shallow_nested(self):
        """One-level record path with flat and nested meta fields."""
        data = [{'state': 'Florida',
                 'shortname': 'FL',
                 'info': {
                     'governor': 'Rick Scott'
                 },
                 'counties': [{'name': 'Dade', 'population': 12345},
                              {'name': 'Broward', 'population': 40000},
                              {'name': 'Palm Beach', 'population': 60000}]},
                {'state': 'Ohio',
                 'shortname': 'OH',
                 'info': {
                     'governor': 'John Kasich'
                 },
                 'counties': [{'name': 'Summit', 'population': 1234},
                              {'name': 'Cuyahoga', 'population': 1337}]}]

        result = json_normalize(data, 'counties',
                                ['state', 'shortname',
                                 ['info', 'governor']])
        ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
                            'Cuyahoga'],
                   'state': ['Florida'] * 3 + ['Ohio'] * 2,
                   'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
                   'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
                   'population': [12345, 40000, 60000, 1234, 1337]}

        # column order is taken from the result; only the values are compared
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_meta_name_conflict(self):
        """Meta names that clash with record keys raise unless prefixed."""
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]

        msg = (r"Conflicting metadata name (foo|bar),"
               " need distinguishing prefix")
        with pytest.raises(ValueError, match=msg):
            json_normalize(data, 'data', meta=['foo', 'bar'])

        result = json_normalize(data, 'data', meta=['foo', 'bar'],
                                meta_prefix='meta')

        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result

    def test_meta_parameter_not_modified(self):
        """The list passed as ``meta`` is left unchanged by the call."""
        # GH 18610
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]

        COLUMNS = ['foo', 'bar']
        result = json_normalize(data, 'data', meta=COLUMNS,
                                meta_prefix='meta')

        assert COLUMNS == ['foo', 'bar']
        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result

    def test_record_prefix(self, state_data):
        """``record_prefix`` renames record columns but not meta columns."""
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties',
                                meta='state',
                                record_prefix='county_')

        county_rows = []
        for rec in state_data:
            county_rows.extend(rec['counties'])
        expected = DataFrame(county_rows)
        expected = expected.rename(columns=lambda x: 'county_' + x)
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])

        tm.assert_frame_equal(result, expected)

    def test_non_ascii_key(self):
        """A UTF-8 encoded non-ASCII key survives as a column name."""
        if compat.PY3:
            testjson = (
                b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
                b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
            ).decode('utf8')
        else:
            testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
                        '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')

        testdata = {
            u'sub.A': [1, 3],
            u'sub.B': [2, 4],
            b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
        }
        expected = DataFrame(testdata)

        result = json_normalize(json.loads(testjson))
        tm.assert_frame_equal(result, expected)

    def test_missing_field(self, author_missing_data):
        """Records with missing or ``None`` fields still normalize."""
        # GH20030:
        result = json_normalize(author_missing_data)
        ex_data = [
            {'info': np.nan,
             'author_name.first': np.nan,
             'author_name.last_name': np.nan,
             'info.created_at': np.nan,
             'info.last_updated': np.nan},
            {'info': None,
             'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)
class TestNestedToRecord(object):
    """Tests for ``nested_to_record`` (plus one ``json_normalize`` case)."""

    def test_flat_stays_flat(self):
        """Records without nested dicts come back equal to the input."""
        recs = [dict(flat1=1, flat2=2),
                dict(flat1=3, flat2=4),
                ]

        result = nested_to_record(recs)
        expected = recs
        assert result == expected

    def test_one_level_deep_flattens(self):
        """A single nested dict is flattened into dotted keys."""
        data = dict(flat1=1,
                    dict1=dict(c=1, d=2))

        result = nested_to_record(data)
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1}

        assert result == expected

    def test_nested_flattens(self):
        """Dicts nested two levels deep are flattened into dotted keys."""
        data = dict(flat1=1,
                    dict1=dict(c=1, d=2),
                    nested=dict(e=dict(c=1, d=2),
                                d=2))

        result = nested_to_record(data)
        expected = {'dict1.c': 1,
                    'dict1.d': 2,
                    'flat1': 1,
                    'nested.d': 2,
                    'nested.e.c': 1,
                    'nested.e.d': 2}

        assert result == expected

    def test_json_normalize_errors(self):
        """``errors='ignore'`` tolerates a missing meta key; 'raise' fails."""
        # GH14583: If meta keys are not always present
        # a new option to set errors='ignore' has been implemented
        trades_doc = {
            "Trades": [{
                "general": {
                    "tradeid": 100,
                    "trade_version": 1,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }, {
                # this trade has no "trade_version" key
                "general": {
                    "tradeid": 100,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }
            ]
        }
        # arguments shared by both calls below; only ``errors`` differs
        normalize_kwargs = dict(data=trades_doc['Trades'],
                                record_path=[['general', 'stocks']],
                                meta=[['general', 'tradeid'],
                                      ['general', 'trade_version']])

        result = json_normalize(errors='ignore', **normalize_kwargs)
        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}

        # missing values are filled with '' so plain dicts can be compared
        assert result.fillna('').to_dict() == expected

        msg = ("Try running with errors='ignore' as key 'trade_version'"
               " is not always present")
        with pytest.raises(KeyError, match=msg):
            json_normalize(errors='raise', **normalize_kwargs)

    def test_donot_drop_nonevalues(self):
        """A top-level ``None`` value is kept in the flattened record."""
        # GH21356
        data = [
            {'info': None,
             'author_name':
             {'first': 'Smith', 'last_name': 'Appleseed'}
             },
            {'info':
                {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
             'author_name':
                {'first': 'Jane', 'last_name': 'Doe'}
             }
        ]
        result = nested_to_record(data)
        expected = [
            {'info': None,
             'author_name.first': 'Smith',
             'author_name.last_name': 'Appleseed'},
            {'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}]

        assert result == expected

    def test_nonetype_top_level_bottom_level(self):
        """``None`` values at the top and innermost levels are preserved."""
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and raise
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": {
                            "id": None,
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656}}}
            }
        }
        result = nested_to_record(data)
        expected = {
            'id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert result == expected

    def test_nonetype_multiple_levels(self):
        """``None`` values at every nesting level are preserved."""
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and raise
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": {
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656}}}
            }
        }
        result = nested_to_record(data)
        expected = {
            'id': None,
            'location.id': None,
            'location.country.id': None,
            'location.country.state.id': None,
            'location.country.state.town.info.region': None,
            'location.country.state.town.info.x': 49.151580810546875,
            'location.country.state.town.info.y': -33.148521423339844,
            'location.country.state.town.info.z': 27.572303771972656}
        assert result == expected
|