test_packers.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954
  1. import datetime
  2. from distutils.version import LooseVersion
  3. import glob
  4. import os
  5. from warnings import catch_warnings
  6. import numpy as np
  7. import pytest
  8. from pandas._libs.tslib import iNaT
  9. from pandas.compat import PY3, u
  10. from pandas.errors import PerformanceWarning
  11. import pandas
  12. from pandas import (
  13. Categorical, DataFrame, Index, Interval, MultiIndex, NaT, Panel, Period,
  14. Series, Timestamp, bdate_range, compat, date_range, period_range)
  15. from pandas.tests.test_panel import assert_panel_equal
  16. import pandas.util.testing as tm
  17. from pandas.util.testing import (
  18. assert_categorical_equal, assert_frame_equal, assert_index_equal,
  19. assert_series_equal, ensure_clean)
  20. from pandas.io.packers import read_msgpack, to_msgpack
  21. nan = np.nan
  22. try:
  23. import blosc # NOQA
  24. except ImportError:
  25. _BLOSC_INSTALLED = False
  26. else:
  27. _BLOSC_INSTALLED = True
  28. try:
  29. import zlib # NOQA
  30. except ImportError:
  31. _ZLIB_INSTALLED = False
  32. else:
  33. _ZLIB_INSTALLED = True
@pytest.fixture(scope='module')
def current_packers_data():
    # Msgpack data produced by the *current* pandas version; serves as the
    # "expected" side when comparing against legacy fixture files.
    from pandas.tests.io.generate_legacy_storage_files import (
        create_msgpack_data)
    return create_msgpack_data()
@pytest.fixture(scope='module')
def all_packers_data():
    # The full set of storage-test data for the current pandas version
    # (superset of the msgpack-specific data above).
    from pandas.tests.io.generate_legacy_storage_files import (
        create_data)
    return create_data()
  46. def check_arbitrary(a, b):
  47. if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
  48. assert(len(a) == len(b))
  49. for a_, b_ in zip(a, b):
  50. check_arbitrary(a_, b_)
  51. elif isinstance(a, Panel):
  52. assert_panel_equal(a, b)
  53. elif isinstance(a, DataFrame):
  54. assert_frame_equal(a, b)
  55. elif isinstance(a, Series):
  56. assert_series_equal(a, b)
  57. elif isinstance(a, Index):
  58. assert_index_equal(a, b)
  59. elif isinstance(a, Categorical):
  60. # Temp,
  61. # Categorical.categories is changed from str to bytes in PY3
  62. # maybe the same as GH 13591
  63. if PY3 and b.categories.inferred_type == 'string':
  64. pass
  65. else:
  66. tm.assert_categorical_equal(a, b)
  67. elif a is NaT:
  68. assert b is NaT
  69. elif isinstance(a, Timestamp):
  70. assert a == b
  71. assert a.freq == b.freq
  72. else:
  73. assert(a == b)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestPackers(object):
    """Base class for the packers tests: provides a unique temp ``.msg``
    path per test plus a file round-trip helper. Subclasses supply the
    objects to round-trip."""

    def setup_method(self, method):
        # unique file name per test to avoid collisions between runs
        self.path = '__%s__.msg' % tm.rands(10)

    def teardown_method(self, method):
        pass

    def encode_decode(self, x, compress=None, **kwargs):
        # Round-trip ``x`` through to_msgpack/read_msgpack via a temp file
        # (removed on exit by ensure_clean) and return the unpacked result.
        # ``kwargs`` are forwarded to both the writer and the reader.
        with ensure_clean(self.path) as p:
            to_msgpack(p, x, compress=compress, **kwargs)
            return read_msgpack(p, **kwargs)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestAPI(TestPackers):
    """Tests of the public to_msgpack/read_msgpack API surface
    (bytes vs. path vs. buffer inputs, iterator mode, bad arguments)."""

    def test_string_io(self):

        df = DataFrame(np.random.randn(10, 2))

        # to_msgpack(None) returns the packed bytes instead of writing
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # no path argument at all behaves the same
        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # reading from a buffer object
        s = df.to_msgpack()
        result = read_msgpack(compat.BytesIO(s))
        tm.assert_frame_equal(result, df)

        # module-level function with None path also returns bytes
        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # bytes written manually to disk round-trip through a file path
        with ensure_clean(self.path) as p:

            s = df.to_msgpack()
            with open(p, 'wb') as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)

    def test_path_pathlib(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_iterator_with_string_io(self):
        # multiple frames packed together come back one-by-one in order
        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
        s = to_msgpack(None, *dfs)
        for i, result in enumerate(read_msgpack(s, iterator=True)):
            tm.assert_frame_equal(result, dfs[i])

    def test_invalid_arg(self):
        # GH10369
        class A(object):
            def __init__(self):
                self.read = 0

        msg = (r"Invalid file path or buffer object type: <(class|type)"
               r" '{}'>")
        with pytest.raises(ValueError, match=msg.format('NoneType')):
            read_msgpack(path_or_buf=None)
        with pytest.raises(ValueError, match=msg.format('dict')):
            read_msgpack(path_or_buf={})
        with pytest.raises(ValueError, match=msg.format(r'.*\.A')):
            read_msgpack(path_or_buf=A())
class TestNumpy(TestPackers):
    """Round-trip raw Python and numpy scalars, lists/tuples and ndarrays."""

    def test_numpy_scalar_float(self):
        x = np.float32(np.random.rand())
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_numpy_scalar_complex(self):
        x = np.complex64(np.random.rand() + 1j * np.random.rand())
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_scalar_float(self):
        x = np.random.rand()
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_bool(self):
        x = np.bool_(1)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

        x = np.bool_(0)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_complex(self):
        x = np.random.rand() + 1j * np.random.rand()
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_numpy_float(self):
        x = [np.float32(np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_numpy_float_complex(self):
        if not hasattr(np, 'complex128'):
            pytest.skip('numpy can not handle complex128')

        # mixed float32 / complex128 elements in one list
        x = [np.float32(np.random.rand()) for i in range(5)] + \
            [np.complex128(np.random.rand() + 1j * np.random.rand())
             for i in range(5)]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_float(self):
        x = [np.random.rand() for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_float_complex(self):
        x = [np.random.rand() for i in range(5)] + \
            [(np.random.rand() + 1j * np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_dict_float(self):
        x = {'foo': 1.0, 'bar': 2.0}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_complex(self):
        x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        # values must come back with the exact same class, not a lossy cast
        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="complex value")

    def test_dict_numpy_float(self):
        x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_numpy_complex(self):
        x = {'foo': np.complex128(1.0 + 1.0j),
             'bar': np.complex128(2.0 + 2.0j)}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")

    def test_numpy_array_float(self):

        # run multiple times
        for n in range(10):
            x = np.random.rand(10)
            for dtype in ['float32', 'float64']:
                x = x.astype(dtype)
                x_rec = self.encode_decode(x)
                tm.assert_almost_equal(x, x_rec)

    def test_numpy_array_complex(self):
        x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
        x_rec = self.encode_decode(x)
        # both the element values and the dtype must survive the round-trip
        assert (all(map(lambda x, y: x == y, x, x_rec)) and
                x.dtype == x_rec.dtype)

    def test_list_mixed(self):
        x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)
class TestBasic(TestPackers):
    """Round-trip scalar pandas types: Timestamp, NaT, datetimes,
    timedeltas, Period and Interval."""

    def test_timestamp(self):
        # naive, tz-aware and minute-resolution timestamps
        for i in [Timestamp('20130101'),
                  Timestamp('20130101', tz='US/Eastern'),
                  Timestamp('201301010501')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_nat(self):
        # NaT must come back as the NaT singleton, not an equal copy
        nat_rec = self.encode_decode(NaT)
        assert NaT is nat_rec

    def test_datetimes(self):
        for i in [datetime.datetime(2013, 1, 1),
                  datetime.datetime(2013, 1, 1, 5, 1),
                  datetime.date(2013, 1, 1),
                  np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_timedeltas(self):
        for i in [datetime.timedelta(days=1),
                  datetime.timedelta(days=1, seconds=10),
                  np.timedelta64(1000000)]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_periods(self):
        # 13463
        for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_intervals(self):
        # 19967
        for i in [Interval(0, 1), Interval(0, 1, 'left'),
                  Interval(10, 25., 'right')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec
  258. class TestIndex(TestPackers):
  259. def setup_method(self, method):
  260. super(TestIndex, self).setup_method(method)
  261. self.d = {
  262. 'string': tm.makeStringIndex(100),
  263. 'date': tm.makeDateIndex(100),
  264. 'int': tm.makeIntIndex(100),
  265. 'rng': tm.makeRangeIndex(100),
  266. 'float': tm.makeFloatIndex(100),
  267. 'empty': Index([]),
  268. 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
  269. 'period': Index(period_range('2012-1-1', freq='M', periods=3)),
  270. 'date2': Index(date_range('2013-01-1', periods=10)),
  271. 'bdate': Index(bdate_range('2013-01-02', periods=10)),
  272. 'cat': tm.makeCategoricalIndex(100),
  273. 'interval': tm.makeIntervalIndex(100),
  274. 'timedelta': tm.makeTimedeltaIndex(100, 'H')
  275. }
  276. self.mi = {
  277. 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
  278. ('foo', 'two'),
  279. ('qux', 'one'), ('qux', 'two')],
  280. names=['first', 'second']),
  281. }
  282. def test_basic_index(self):
  283. for s, i in self.d.items():
  284. i_rec = self.encode_decode(i)
  285. tm.assert_index_equal(i, i_rec)
  286. # datetime with no freq (GH5506)
  287. i = Index([Timestamp('20130101'), Timestamp('20130103')])
  288. i_rec = self.encode_decode(i)
  289. tm.assert_index_equal(i, i_rec)
  290. # datetime with timezone
  291. i = Index([Timestamp('20130101 9:00:00'), Timestamp(
  292. '20130103 11:00:00')]).tz_localize('US/Eastern')
  293. i_rec = self.encode_decode(i)
  294. tm.assert_index_equal(i, i_rec)
  295. def test_multi_index(self):
  296. for s, i in self.mi.items():
  297. i_rec = self.encode_decode(i)
  298. tm.assert_index_equal(i, i_rec)
  299. def test_unicode(self):
  300. i = tm.makeUnicodeIndex(100)
  301. i_rec = self.encode_decode(i)
  302. tm.assert_index_equal(i, i_rec)
  303. def categorical_index(self):
  304. # GH15487
  305. df = DataFrame(np.random.randn(10, 2))
  306. df = df.astype({0: 'category'}).set_index(0)
  307. result = self.encode_decode(df)
  308. tm.assert_frame_equal(result, df)
  309. class TestSeries(TestPackers):
  310. def setup_method(self, method):
  311. super(TestSeries, self).setup_method(method)
  312. self.d = {}
  313. s = tm.makeStringSeries()
  314. s.name = 'string'
  315. self.d['string'] = s
  316. s = tm.makeObjectSeries()
  317. s.name = 'object'
  318. self.d['object'] = s
  319. s = Series(iNaT, dtype='M8[ns]', index=range(5))
  320. self.d['date'] = s
  321. data = {
  322. 'A': [0., 1., 2., 3., np.nan],
  323. 'B': [0, 1, 0, 1, 0],
  324. 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
  325. 'D': date_range('1/1/2009', periods=5),
  326. 'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
  327. 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
  328. [Timestamp('20130603', tz='CET')] * 3,
  329. 'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
  330. 'H': Categorical([1, 2, 3, 4, 5]),
  331. 'I': Categorical([1, 2, 3, 4, 5], ordered=True),
  332. 'J': (np.bool_(1), 2, 3, 4, 5),
  333. }
  334. self.d['float'] = Series(data['A'])
  335. self.d['int'] = Series(data['B'])
  336. self.d['mixed'] = Series(data['E'])
  337. self.d['dt_tz_mixed'] = Series(data['F'])
  338. self.d['dt_tz'] = Series(data['G'])
  339. self.d['cat_ordered'] = Series(data['H'])
  340. self.d['cat_unordered'] = Series(data['I'])
  341. self.d['numpy_bool_mixed'] = Series(data['J'])
  342. def test_basic(self):
  343. # run multiple times here
  344. for n in range(10):
  345. for s, i in self.d.items():
  346. i_rec = self.encode_decode(i)
  347. assert_series_equal(i, i_rec)
  348. class TestCategorical(TestPackers):
  349. def setup_method(self, method):
  350. super(TestCategorical, self).setup_method(method)
  351. self.d = {}
  352. self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
  353. self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
  354. ordered=True)
  355. self.d['plain_int'] = Categorical([5, 6, 7, 8])
  356. self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)
  357. def test_basic(self):
  358. # run multiple times here
  359. for n in range(10):
  360. for s, i in self.d.items():
  361. i_rec = self.encode_decode(i)
  362. assert_categorical_equal(i, i_rec)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestNDFrame(TestPackers):
    """Round-trip DataFrames and Panels, multi-object packs, and a few
    regression cases (freq inference, duplicate column names)."""

    def setup_method(self, method):
        super(TestNDFrame, self).setup_method(method)

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'G': [Timestamp('20130603', tz='CET')] * 5,
            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
        }

        self.frame = {
            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
            'mixed': DataFrame(data)}

        self.panel = {
            'float': Panel(dict(ItemA=self.frame['float'],
                                ItemB=self.frame['float'] + 1))}

    def test_basic_frame(self):

        for s, i in self.frame.items():
            i_rec = self.encode_decode(i)
            assert_frame_equal(i, i_rec)

    def test_basic_panel(self):

        with catch_warnings(record=True):
            for s, i in self.panel.items():
                i_rec = self.encode_decode(i)
                assert_panel_equal(i, i_rec)

    def test_multi(self):

        # a dict of frames round-trips key-by-key
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

        packed_items = tuple([self.frame['float'], self.frame['float'].A,
                              self.frame['float'].B, None])
        l_rec = self.encode_decode(packed_items)
        check_arbitrary(packed_items, l_rec)

        # this is an oddity in that packed lists will be returned as tuples
        packed_items = [self.frame['float'], self.frame['float'].A,
                        self.frame['float'].B, None]
        l_rec = self.encode_decode(packed_items)
        assert isinstance(l_rec, tuple)
        check_arbitrary(packed_items, l_rec)

    def test_iterator(self):

        packed_items = [self.frame['float'], self.frame['float'].A,
                        self.frame['float'].B, None]

        with ensure_clean(self.path) as path:
            to_msgpack(path, *packed_items)
            for i, packed in enumerate(read_msgpack(path, iterator=True)):
                check_arbitrary(packed, packed_items[i])

    def tests_datetimeindex_freq_issue(self):

        # GH 5947
        # inferring freq on the datetimeindex
        df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013'))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

        df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013'))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

    def test_dataframe_duplicate_column_names(self):

        # GH 9618
        expected_1 = DataFrame(columns=['a', 'a'])
        expected_2 = DataFrame(columns=[1] * 100)
        expected_2.loc[0] = np.random.randn(100)
        expected_3 = DataFrame(columns=[1, 1])
        expected_3.loc[0] = ['abc', np.nan]

        result_1 = self.encode_decode(expected_1)
        result_2 = self.encode_decode(expected_2)
        result_3 = self.encode_decode(expected_3)

        assert_frame_equal(result_1, expected_1)
        assert_frame_equal(result_2, expected_2)
        assert_frame_equal(result_3, expected_3)
class TestSparse(TestPackers):
    """Sparse structures are intentionally unsupported: packing them must
    raise NotImplementedError rather than silently produce bad data."""

    def _check_roundtrip(self, obj, comparator, **kwargs):

        # currently these are not implemented
        # i_rec = self.encode_decode(obj)
        # comparator(obj, i_rec, **kwargs)
        msg = r"msgpack sparse (series|frame) is not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            self.encode_decode(obj)

    def test_sparse_series(self):

        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):

        s = tm.makeDataFrame()
        s.loc[3:5, 1:3] = np.nan
        s.loc[8:10, -2] = np.nan
        ss = s.to_sparse()

        self._check_roundtrip(ss, tm.assert_frame_equal,
                              check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_frame_equal,
                              check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_frame_equal,
                              check_frame_type=True)
class TestCompression(TestPackers):
    """See https://github.com/pandas-dev/pandas/pull/9783
    """

    def setup_method(self, method):
        # sqlalchemy is optional; record availability for the *_to_sql tests
        try:
            from sqlalchemy import create_engine
            self._create_sql_engine = create_engine
        except ImportError:
            self._SQLALCHEMY_INSTALLED = False
        else:
            self._SQLALCHEMY_INSTALLED = True

        super(TestCompression, self).setup_method(method)
        data = {
            'A': np.arange(1000, dtype=np.float64),
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=x) for x in range(1000)],
        }
        # NOTE(review): ['A', 'A'] / ['B', 'B'] produce single-column frames
        # (duplicate keys collapse in the dict comprehension) -- possibly
        # meant to be ['A', 'B']; kept as-is to preserve behavior.
        self.frame = {
            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
            'mixed': DataFrame(data),
        }

    def test_plain(self):
        # no compression at all: plain round-trip
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

    def _test_compression(self, compress):
        i_rec = self.encode_decode(self.frame, compress=compress)
        for k in self.frame.keys():
            value = i_rec[k]
            expected = self.frame[k]
            assert_frame_equal(value, expected)
            # make sure that we can write to the new frames
            for block in value._data.blocks:
                assert block.values.flags.writeable

    def test_compression_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_compression('zlib')

    def test_compression_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_compression('blosc')

    def _test_compression_warns_when_decompress_caches(
            self, monkeypatch, compress):
        not_garbage = []
        control = []  # copied data

        # look the compression module up by name so 'zlib'/'blosc' both work
        compress_module = globals()[compress]
        real_decompress = compress_module.decompress

        def decompress(ob):
            """mock decompress function that delegates to the real
            decompress but caches the result and a copy of the result.
            """
            res = real_decompress(ob)
            not_garbage.append(res)  # hold a reference to this bytes object
            control.append(bytearray(res))  # copy the data here to check later
            return res

        # types mapped to values to add in place.
        rhs = {
            np.dtype('float64'): 1.0,
            np.dtype('int32'): 1,
            np.dtype('object'): 'a',
            np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'),
            np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
        }

        with monkeypatch.context() as m, \
                tm.assert_produces_warning(PerformanceWarning) as ws:
            m.setattr(compress_module, 'decompress', decompress)

            i_rec = self.encode_decode(self.frame, compress=compress)
            for k in self.frame.keys():

                value = i_rec[k]
                expected = self.frame[k]
                assert_frame_equal(value, expected)
                # make sure that we can write to the new frames even though
                # we needed to copy the data
                for block in value._data.blocks:
                    assert block.values.flags.writeable
                    # mutate the data in some way
                    block.values[0] += rhs[block.dtype]

        for w in ws:
            # check the messages from our warnings
            assert str(w.message) == ('copying data after decompressing; '
                                      'this may mean that decompress is '
                                      'caching its result')

        for buf, control_buf in zip(not_garbage, control):
            # make sure none of our mutations above affected the
            # original buffers
            assert buf == control_buf

    def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_compression_warns_when_decompress_caches(
            monkeypatch, 'zlib')

    def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_compression_warns_when_decompress_caches(
            monkeypatch, 'blosc')

    def _test_small_strings_no_warn(self, compress):
        # small buffers are not decompressed through the caching path, so no
        # PerformanceWarning may be emitted
        empty = np.array([], dtype='uint8')
        with tm.assert_produces_warning(None):
            empty_unpacked = self.encode_decode(empty, compress=compress)

        tm.assert_numpy_array_equal(empty_unpacked, empty)
        assert empty_unpacked.flags.writeable

        char = np.array([ord(b'a')], dtype='uint8')
        with tm.assert_produces_warning(None):
            char_unpacked = self.encode_decode(char, compress=compress)

        tm.assert_numpy_array_equal(char_unpacked, char)
        assert char_unpacked.flags.writeable
        # if this test fails I am sorry because the interpreter is now in a
        # bad state where b'a' points to 98 == ord(b'b').
        char_unpacked[0] = ord(b'b')

        # we compare the ord of bytes b'a' with unicode u'a' because the should
        # always be the same (unless we were able to mutate the shared
        # character singleton in which case ord(b'a') == ord(b'b').
        assert ord(b'a') == ord(u'a')
        tm.assert_numpy_array_equal(
            char_unpacked,
            np.array([ord(b'b')], dtype='uint8'),
        )

    def test_small_strings_no_warn_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_small_strings_no_warn('zlib')

    def test_small_strings_no_warn_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_small_strings_no_warn('blosc')

    def test_readonly_axis_blosc(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        df1 = DataFrame({'A': list('abcd')})
        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
        assert 1 in self.encode_decode(df1['A'], compress='blosc')
        assert 1. in self.encode_decode(df2['A'], compress='blosc')

    def test_readonly_axis_zlib(self):
        # GH11880
        df1 = DataFrame({'A': list('abcd')})
        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
        assert 1 in self.encode_decode(df1['A'], compress='zlib')
        assert 1. in self.encode_decode(df2['A'], compress='zlib')

    def test_readonly_axis_blosc_to_sql(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip('no sqlalchemy')
        expected = DataFrame({'A': list('abcd')})
        df = self.encode_decode(expected, compress='blosc')
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql('test', eng, if_exists='append')
        result = pandas.read_sql_table('test', eng, index_col='index')
        result.index.names = [None]
        assert_frame_equal(expected, result)

    def test_readonly_axis_zlib_to_sql(self):
        # GH11880
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip('no sqlalchemy')
        expected = DataFrame({'A': list('abcd')})
        df = self.encode_decode(expected, compress='zlib')
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql('test', eng, if_exists='append')
        result = pandas.read_sql_table('test', eng, index_col='index')
        result.index.names = [None]
        assert_frame_equal(expected, result)
class TestEncoding(TestPackers):
    """Round-trips with explicit text ``encoding=`` arguments (GH10581)."""

    def setup_method(self, method):
        super(TestEncoding, self).setup_method(method)
        data = {
            'A': [compat.u('\u2019')] * 1000,
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=x) for x in range(1000)],
            'G': [400] * 1000
        }
        # NOTE(review): the 'float' key actually holds unicode strings
        # (column 'A'); the key names were copied from TestCompression and
        # are historic -- kept to preserve behavior.
        self.frame = {
            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
            'mixed': DataFrame(data),
        }
        self.utf_encodings = ['utf8', 'utf16', 'utf32']

    def test_utf(self):
        # GH10581
        for encoding in self.utf_encodings:
            for frame in compat.itervalues(self.frame):
                result = self.encode_decode(frame, encoding=encoding)
                assert_frame_equal(result, frame)

    def test_default_encoding(self):
        # the implicit default must match an explicit encoding='utf8'
        for frame in compat.itervalues(self.frame):
            result = frame.to_msgpack()
            expected = frame.to_msgpack(encoding='utf8')
            assert result == expected
            result = self.encode_decode(frame)
            assert_frame_equal(result, frame)
  670. files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
  671. "legacy_msgpack", "*", "*.msgpack"))
  672. @pytest.fixture(params=files)
  673. def legacy_packer(request, datapath):
  674. return datapath(request.param)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestMsgpack(object):
    """
    How to add msgpack tests:

    1. Install pandas version intended to output the msgpack.
    2. Execute "generate_legacy_storage_files.py" to create the msgpack.
    $ python generate_legacy_storage_files.py <output_dir> msgpack
    3. Move the created msgpack to "data/legacy_msgpack/<version>" directory.
    """

    # every legacy file must at minimum contain these kinds per type
    minimum_structure = {'series': ['float', 'int', 'mixed',
                                    'ts', 'mi', 'dup'],
                         'frame': ['float', 'int', 'mixed', 'mi'],
                         'panel': ['float'],
                         'index': ['int', 'date', 'period'],
                         'mi': ['reg2']}

    def check_min_structure(self, data, version):
        for typ, v in self.minimum_structure.items():
            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
            for kind in v:
                msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
                assert kind in data[typ], msg

    def compare(self, current_data, all_data, vf, version):
        # Unpack the legacy file ``vf`` and compare every object against
        # the data generated by the current pandas version.
        # GH12277 encoding default used to be latin-1, now utf-8
        if LooseVersion(version) < LooseVersion('0.18.0'):
            data = read_msgpack(vf, encoding='latin-1')
        else:
            data = read_msgpack(vf)
        self.check_min_structure(data, version)
        for typ, dv in data.items():
            assert typ in all_data, ('unpacked data contains '
                                     'extra key "{0}"'
                                     .format(typ))
            for dt, result in dv.items():
                assert dt in current_data[typ], ('data["{0}"] contains extra '
                                                 'key "{1}"'.format(typ, dt))
                try:
                    expected = current_data[typ][dt]
                except KeyError:
                    continue

                # use a specific comparator
                # if available
                comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
                comparator = getattr(self, comp_method, None)
                if comparator is not None:
                    comparator(result, expected, typ, version)
                else:
                    check_arbitrary(result, expected)

        return data

    def compare_series_dt_tz(self, result, expected, typ, version):
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
            tm.assert_series_equal(result, expected)
        else:
            tm.assert_series_equal(result, expected)

    def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
            tm.assert_frame_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)

    def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
                             legacy_packer, datapath):

        # the version is encoded in the fixture's parent directory name
        version = os.path.basename(os.path.dirname(legacy_packer))

        # GH12142 0.17 files packed in P2 can't be read in P3
        if (compat.PY3 and version.startswith('0.17.') and
                legacy_packer.split('.')[-4][-1] == '2'):
            msg = "Files packed in Py2 can't be read in Py3 ({})"
            pytest.skip(msg.format(version))
        try:
            with catch_warnings(record=True):
                self.compare(current_packers_data, all_packers_data,
                             legacy_packer, version)
        except ImportError:
            # blosc not installed
            pass

    def test_msgpack_period_freq(self):
        # https://github.com/pandas-dev/pandas/issues/24135
        s = Series(np.random.rand(5), index=date_range('20130101', periods=5))
        r = read_msgpack(s.to_msgpack())
        repr(r)