# test_normalize.py -- tests for pandas json_normalize / nested_to_record
  1. import json
  2. import numpy as np
  3. import pytest
  4. from pandas import DataFrame, Index, compat
  5. import pandas.util.testing as tm
  6. from pandas.io.json import json_normalize
  7. from pandas.io.json.normalize import nested_to_record
  8. @pytest.fixture
  9. def deep_nested():
  10. # deeply nested data
  11. return [{'country': 'USA',
  12. 'states': [{'name': 'California',
  13. 'cities': [{'name': 'San Francisco',
  14. 'pop': 12345},
  15. {'name': 'Los Angeles',
  16. 'pop': 12346}]
  17. },
  18. {'name': 'Ohio',
  19. 'cities': [{'name': 'Columbus',
  20. 'pop': 1234},
  21. {'name': 'Cleveland',
  22. 'pop': 1236}]}
  23. ]
  24. },
  25. {'country': 'Germany',
  26. 'states': [{'name': 'Bayern',
  27. 'cities': [{'name': 'Munich', 'pop': 12347}]
  28. },
  29. {'name': 'Nordrhein-Westfalen',
  30. 'cities': [{'name': 'Duesseldorf', 'pop': 1238},
  31. {'name': 'Koeln', 'pop': 1239}]}
  32. ]
  33. }
  34. ]
  35. @pytest.fixture
  36. def state_data():
  37. return [
  38. {'counties': [{'name': 'Dade', 'population': 12345},
  39. {'name': 'Broward', 'population': 40000},
  40. {'name': 'Palm Beach', 'population': 60000}],
  41. 'info': {'governor': 'Rick Scott'},
  42. 'shortname': 'FL',
  43. 'state': 'Florida'},
  44. {'counties': [{'name': 'Summit', 'population': 1234},
  45. {'name': 'Cuyahoga', 'population': 1337}],
  46. 'info': {'governor': 'John Kasich'},
  47. 'shortname': 'OH',
  48. 'state': 'Ohio'}]
  49. @pytest.fixture
  50. def author_missing_data():
  51. return [
  52. {'info': None},
  53. {'info':
  54. {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
  55. 'author_name':
  56. {'first': 'Jane', 'last_name': 'Doe'}
  57. }]
  58. class TestJSONNormalize(object):
  59. def test_simple_records(self):
  60. recs = [{'a': 1, 'b': 2, 'c': 3},
  61. {'a': 4, 'b': 5, 'c': 6},
  62. {'a': 7, 'b': 8, 'c': 9},
  63. {'a': 10, 'b': 11, 'c': 12}]
  64. result = json_normalize(recs)
  65. expected = DataFrame(recs)
  66. tm.assert_frame_equal(result, expected)
  67. def test_simple_normalize(self, state_data):
  68. result = json_normalize(state_data[0], 'counties')
  69. expected = DataFrame(state_data[0]['counties'])
  70. tm.assert_frame_equal(result, expected)
  71. result = json_normalize(state_data, 'counties')
  72. expected = []
  73. for rec in state_data:
  74. expected.extend(rec['counties'])
  75. expected = DataFrame(expected)
  76. tm.assert_frame_equal(result, expected)
  77. result = json_normalize(state_data, 'counties', meta='state')
  78. expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
  79. tm.assert_frame_equal(result, expected)
  80. def test_empty_array(self):
  81. result = json_normalize([])
  82. expected = DataFrame()
  83. tm.assert_frame_equal(result, expected)
  84. def test_simple_normalize_with_separator(self, deep_nested):
  85. # GH 14883
  86. result = json_normalize({'A': {'A': 1, 'B': 2}})
  87. expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
  88. tm.assert_frame_equal(result.reindex_like(expected), expected)
  89. result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
  90. expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
  91. tm.assert_frame_equal(result.reindex_like(expected), expected)
  92. result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
  93. expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
  94. tm.assert_frame_equal(result.reindex_like(expected), expected)
  95. result = json_normalize(deep_nested, ['states', 'cities'],
  96. meta=['country', ['states', 'name']],
  97. sep='_')
  98. expected = Index(['name', 'pop',
  99. 'country', 'states_name']).sort_values()
  100. assert result.columns.sort_values().equals(expected)
  101. def test_value_array_record_prefix(self):
  102. # GH 21536
  103. result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
  104. expected = DataFrame([[1], [2]], columns=['Prefix.0'])
  105. tm.assert_frame_equal(result, expected)
  106. def test_nested_object_record_path(self):
  107. # GH 22706
  108. data = {'state': 'Florida',
  109. 'info': {
  110. 'governor': 'Rick Scott',
  111. 'counties': [{'name': 'Dade', 'population': 12345},
  112. {'name': 'Broward', 'population': 40000},
  113. {'name': 'Palm Beach', 'population': 60000}]}}
  114. result = json_normalize(data, record_path=["info", "counties"])
  115. expected = DataFrame([['Dade', 12345],
  116. ['Broward', 40000],
  117. ['Palm Beach', 60000]],
  118. columns=['name', 'population'])
  119. tm.assert_frame_equal(result, expected)
  120. def test_more_deeply_nested(self, deep_nested):
  121. result = json_normalize(deep_nested, ['states', 'cities'],
  122. meta=['country', ['states', 'name']])
  123. # meta_prefix={'states': 'state_'})
  124. ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
  125. 'states.name': ['California', 'California', 'Ohio', 'Ohio',
  126. 'Bayern', 'Nordrhein-Westfalen',
  127. 'Nordrhein-Westfalen'],
  128. 'name': ['San Francisco', 'Los Angeles', 'Columbus',
  129. 'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
  130. 'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
  131. expected = DataFrame(ex_data, columns=result.columns)
  132. tm.assert_frame_equal(result, expected)
  133. def test_shallow_nested(self):
  134. data = [{'state': 'Florida',
  135. 'shortname': 'FL',
  136. 'info': {
  137. 'governor': 'Rick Scott'
  138. },
  139. 'counties': [{'name': 'Dade', 'population': 12345},
  140. {'name': 'Broward', 'population': 40000},
  141. {'name': 'Palm Beach', 'population': 60000}]},
  142. {'state': 'Ohio',
  143. 'shortname': 'OH',
  144. 'info': {
  145. 'governor': 'John Kasich'
  146. },
  147. 'counties': [{'name': 'Summit', 'population': 1234},
  148. {'name': 'Cuyahoga', 'population': 1337}]}]
  149. result = json_normalize(data, 'counties',
  150. ['state', 'shortname',
  151. ['info', 'governor']])
  152. ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
  153. 'Cuyahoga'],
  154. 'state': ['Florida'] * 3 + ['Ohio'] * 2,
  155. 'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
  156. 'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
  157. 'population': [12345, 40000, 60000, 1234, 1337]}
  158. expected = DataFrame(ex_data, columns=result.columns)
  159. tm.assert_frame_equal(result, expected)
  160. def test_meta_name_conflict(self):
  161. data = [{'foo': 'hello',
  162. 'bar': 'there',
  163. 'data': [{'foo': 'something', 'bar': 'else'},
  164. {'foo': 'something2', 'bar': 'else2'}]}]
  165. msg = (r"Conflicting metadata name (foo|bar),"
  166. " need distinguishing prefix")
  167. with pytest.raises(ValueError, match=msg):
  168. json_normalize(data, 'data', meta=['foo', 'bar'])
  169. result = json_normalize(data, 'data', meta=['foo', 'bar'],
  170. meta_prefix='meta')
  171. for val in ['metafoo', 'metabar', 'foo', 'bar']:
  172. assert val in result
  173. def test_meta_parameter_not_modified(self):
  174. # GH 18610
  175. data = [{'foo': 'hello',
  176. 'bar': 'there',
  177. 'data': [{'foo': 'something', 'bar': 'else'},
  178. {'foo': 'something2', 'bar': 'else2'}]}]
  179. COLUMNS = ['foo', 'bar']
  180. result = json_normalize(data, 'data', meta=COLUMNS,
  181. meta_prefix='meta')
  182. assert COLUMNS == ['foo', 'bar']
  183. for val in ['metafoo', 'metabar', 'foo', 'bar']:
  184. assert val in result
  185. def test_record_prefix(self, state_data):
  186. result = json_normalize(state_data[0], 'counties')
  187. expected = DataFrame(state_data[0]['counties'])
  188. tm.assert_frame_equal(result, expected)
  189. result = json_normalize(state_data, 'counties',
  190. meta='state',
  191. record_prefix='county_')
  192. expected = []
  193. for rec in state_data:
  194. expected.extend(rec['counties'])
  195. expected = DataFrame(expected)
  196. expected = expected.rename(columns=lambda x: 'county_' + x)
  197. expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
  198. tm.assert_frame_equal(result, expected)
  199. def test_non_ascii_key(self):
  200. if compat.PY3:
  201. testjson = (
  202. b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
  203. b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
  204. ).decode('utf8')
  205. else:
  206. testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
  207. '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
  208. testdata = {
  209. u'sub.A': [1, 3],
  210. u'sub.B': [2, 4],
  211. b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
  212. }
  213. expected = DataFrame(testdata)
  214. result = json_normalize(json.loads(testjson))
  215. tm.assert_frame_equal(result, expected)
  216. def test_missing_field(self, author_missing_data):
  217. # GH20030:
  218. result = json_normalize(author_missing_data)
  219. ex_data = [
  220. {'info': np.nan,
  221. 'author_name.first': np.nan,
  222. 'author_name.last_name': np.nan,
  223. 'info.created_at': np.nan,
  224. 'info.last_updated': np.nan},
  225. {'info': None,
  226. 'author_name.first': 'Jane',
  227. 'author_name.last_name': 'Doe',
  228. 'info.created_at': '11/08/1993',
  229. 'info.last_updated': '26/05/2012'}
  230. ]
  231. expected = DataFrame(ex_data)
  232. tm.assert_frame_equal(result, expected)
  233. class TestNestedToRecord(object):
  234. def test_flat_stays_flat(self):
  235. recs = [dict(flat1=1, flat2=2),
  236. dict(flat1=3, flat2=4),
  237. ]
  238. result = nested_to_record(recs)
  239. expected = recs
  240. assert result == expected
  241. def test_one_level_deep_flattens(self):
  242. data = dict(flat1=1,
  243. dict1=dict(c=1, d=2))
  244. result = nested_to_record(data)
  245. expected = {'dict1.c': 1,
  246. 'dict1.d': 2,
  247. 'flat1': 1}
  248. assert result == expected
  249. def test_nested_flattens(self):
  250. data = dict(flat1=1,
  251. dict1=dict(c=1, d=2),
  252. nested=dict(e=dict(c=1, d=2),
  253. d=2))
  254. result = nested_to_record(data)
  255. expected = {'dict1.c': 1,
  256. 'dict1.d': 2,
  257. 'flat1': 1,
  258. 'nested.d': 2,
  259. 'nested.e.c': 1,
  260. 'nested.e.d': 2}
  261. assert result == expected
  262. def test_json_normalize_errors(self):
  263. # GH14583: If meta keys are not always present
  264. # a new option to set errors='ignore' has been implemented
  265. i = {
  266. "Trades": [{
  267. "general": {
  268. "tradeid": 100,
  269. "trade_version": 1,
  270. "stocks": [{
  271. "symbol": "AAPL",
  272. "name": "Apple",
  273. "price": "0"
  274. }, {
  275. "symbol": "GOOG",
  276. "name": "Google",
  277. "price": "0"
  278. }
  279. ]
  280. }
  281. }, {
  282. "general": {
  283. "tradeid": 100,
  284. "stocks": [{
  285. "symbol": "AAPL",
  286. "name": "Apple",
  287. "price": "0"
  288. }, {
  289. "symbol": "GOOG",
  290. "name": "Google",
  291. "price": "0"
  292. }
  293. ]
  294. }
  295. }
  296. ]
  297. }
  298. j = json_normalize(data=i['Trades'],
  299. record_path=[['general', 'stocks']],
  300. meta=[['general', 'tradeid'],
  301. ['general', 'trade_version']],
  302. errors='ignore')
  303. expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
  304. 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
  305. 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
  306. 'price': {0: '0', 1: '0', 2: '0', 3: '0'},
  307. 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
  308. assert j.fillna('').to_dict() == expected
  309. msg = ("Try running with errors='ignore' as key 'trade_version'"
  310. " is not always present")
  311. with pytest.raises(KeyError, match=msg):
  312. json_normalize(
  313. data=i['Trades'],
  314. record_path=[['general', 'stocks']],
  315. meta=[['general', 'tradeid'],
  316. ['general', 'trade_version']],
  317. errors='raise')
  318. def test_donot_drop_nonevalues(self):
  319. # GH21356
  320. data = [
  321. {'info': None,
  322. 'author_name':
  323. {'first': 'Smith', 'last_name': 'Appleseed'}
  324. },
  325. {'info':
  326. {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
  327. 'author_name':
  328. {'first': 'Jane', 'last_name': 'Doe'}
  329. }
  330. ]
  331. result = nested_to_record(data)
  332. expected = [
  333. {'info': None,
  334. 'author_name.first': 'Smith',
  335. 'author_name.last_name': 'Appleseed'},
  336. {'author_name.first': 'Jane',
  337. 'author_name.last_name': 'Doe',
  338. 'info.created_at': '11/08/1993',
  339. 'info.last_updated': '26/05/2012'}]
  340. assert result == expected
  341. def test_nonetype_top_level_bottom_level(self):
  342. # GH21158: If inner level json has a key with a null value
  343. # make sure it doesnt do a new_d.pop twice and except
  344. data = {
  345. "id": None,
  346. "location": {
  347. "country": {
  348. "state": {
  349. "id": None,
  350. "town.info": {
  351. "id": None,
  352. "region": None,
  353. "x": 49.151580810546875,
  354. "y": -33.148521423339844,
  355. "z": 27.572303771972656}}}
  356. }
  357. }
  358. result = nested_to_record(data)
  359. expected = {
  360. 'id': None,
  361. 'location.country.state.id': None,
  362. 'location.country.state.town.info.id': None,
  363. 'location.country.state.town.info.region': None,
  364. 'location.country.state.town.info.x': 49.151580810546875,
  365. 'location.country.state.town.info.y': -33.148521423339844,
  366. 'location.country.state.town.info.z': 27.572303771972656}
  367. assert result == expected
  368. def test_nonetype_multiple_levels(self):
  369. # GH21158: If inner level json has a key with a null value
  370. # make sure it doesnt do a new_d.pop twice and except
  371. data = {
  372. "id": None,
  373. "location": {
  374. "id": None,
  375. "country": {
  376. "id": None,
  377. "state": {
  378. "id": None,
  379. "town.info": {
  380. "region": None,
  381. "x": 49.151580810546875,
  382. "y": -33.148521423339844,
  383. "z": 27.572303771972656}}}
  384. }
  385. }
  386. result = nested_to_record(data)
  387. expected = {
  388. 'id': None,
  389. 'location.id': None,
  390. 'location.country.id': None,
  391. 'location.country.state.id': None,
  392. 'location.country.state.town.info.region': None,
  393. 'location.country.state.town.info.x': 49.151580810546875,
  394. 'location.country.state.town.info.y': -33.148521423339844,
  395. 'location.country.state.town.info.z': 27.572303771972656}
  396. assert result == expected