test_packers.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954
  1. import datetime
  2. from distutils.version import LooseVersion
  3. import glob
  4. import os
  5. from warnings import catch_warnings
  6. import numpy as np
  7. import pytest
  8. from pandas._libs.tslib import iNaT
  9. from pandas.compat import PY3, u
  10. from pandas.errors import PerformanceWarning
  11. import pandas
  12. from pandas import (
  13. Categorical, DataFrame, Index, Interval, MultiIndex, NaT, Panel, Period,
  14. Series, Timestamp, bdate_range, compat, date_range, period_range)
  15. from pandas.tests.test_panel import assert_panel_equal
  16. import pandas.util.testing as tm
  17. from pandas.util.testing import (
  18. assert_categorical_equal, assert_frame_equal, assert_index_equal,
  19. assert_series_equal, ensure_clean)
  20. from pandas.io.packers import read_msgpack, to_msgpack
  21. nan = np.nan
  22. try:
  23. import blosc # NOQA
  24. except ImportError:
  25. _BLOSC_INSTALLED = False
  26. else:
  27. _BLOSC_INSTALLED = True
  28. try:
  29. import zlib # NOQA
  30. except ImportError:
  31. _ZLIB_INSTALLED = False
  32. else:
  33. _ZLIB_INSTALLED = True
@pytest.fixture(scope='module')
def current_packers_data():
    # Msgpack data produced by the *current* pandas version; serves as the
    # "expected" side when comparing against legacy fixture files.
    from pandas.tests.io.generate_legacy_storage_files import (
        create_msgpack_data)
    return create_msgpack_data()
@pytest.fixture(scope='module')
def all_packers_data():
    # The full set of storage-test data for the current pandas version
    # (superset of the msgpack-specific data above).
    from pandas.tests.io.generate_legacy_storage_files import (
        create_data)
    return create_data()
  46. def check_arbitrary(a, b):
  47. if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
  48. assert(len(a) == len(b))
  49. for a_, b_ in zip(a, b):
  50. check_arbitrary(a_, b_)
  51. elif isinstance(a, Panel):
  52. assert_panel_equal(a, b)
  53. elif isinstance(a, DataFrame):
  54. assert_frame_equal(a, b)
  55. elif isinstance(a, Series):
  56. assert_series_equal(a, b)
  57. elif isinstance(a, Index):
  58. assert_index_equal(a, b)
  59. elif isinstance(a, Categorical):
  60. # Temp,
  61. # Categorical.categories is changed from str to bytes in PY3
  62. # maybe the same as GH 13591
  63. if PY3 and b.categories.inferred_type == 'string':
  64. pass
  65. else:
  66. tm.assert_categorical_equal(a, b)
  67. elif a is NaT:
  68. assert b is NaT
  69. elif isinstance(a, Timestamp):
  70. assert a == b
  71. assert a.freq == b.freq
  72. else:
  73. assert(a == b)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestPackers(object):
    """Base class for the packers tests: provides a unique temp ``.msg``
    path per test plus a file round-trip helper. Subclasses supply the
    objects to round-trip."""

    def setup_method(self, method):
        # unique file name per test to avoid collisions between runs
        self.path = '__%s__.msg' % tm.rands(10)

    def teardown_method(self, method):
        pass

    def encode_decode(self, x, compress=None, **kwargs):
        # Round-trip ``x`` through to_msgpack/read_msgpack via a temp file
        # (removed on exit by ensure_clean) and return the unpacked result.
        # ``kwargs`` are forwarded to both the writer and the reader.
        with ensure_clean(self.path) as p:
            to_msgpack(p, x, compress=compress, **kwargs)
            return read_msgpack(p, **kwargs)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestAPI(TestPackers):
    """Tests of the public to_msgpack/read_msgpack API surface
    (bytes vs. path vs. buffer inputs, iterator mode, bad arguments)."""

    def test_string_io(self):

        df = DataFrame(np.random.randn(10, 2))

        # to_msgpack(None) returns the packed bytes instead of writing
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # no path argument at all behaves the same
        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # reading from a buffer object
        s = df.to_msgpack()
        result = read_msgpack(compat.BytesIO(s))
        tm.assert_frame_equal(result, df)

        # module-level function with None path also returns bytes
        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        # bytes written manually to disk round-trip through a file path
        with ensure_clean(self.path) as p:

            s = df.to_msgpack()
            with open(p, 'wb') as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)

    def test_path_pathlib(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_iterator_with_string_io(self):
        # multiple frames packed together come back one-by-one in order
        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
        s = to_msgpack(None, *dfs)
        for i, result in enumerate(read_msgpack(s, iterator=True)):
            tm.assert_frame_equal(result, dfs[i])

    def test_invalid_arg(self):
        # GH10369
        class A(object):
            def __init__(self):
                self.read = 0

        msg = (r"Invalid file path or buffer object type: <(class|type)"
               r" '{}'>")
        with pytest.raises(ValueError, match=msg.format('NoneType')):
            read_msgpack(path_or_buf=None)
        with pytest.raises(ValueError, match=msg.format('dict')):
            read_msgpack(path_or_buf={})
        with pytest.raises(ValueError, match=msg.format(r'.*\.A')):
            read_msgpack(path_or_buf=A())
class TestNumpy(TestPackers):
    """Round-trip raw Python and numpy scalars, lists/tuples and ndarrays."""

    def test_numpy_scalar_float(self):
        x = np.float32(np.random.rand())
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_numpy_scalar_complex(self):
        x = np.complex64(np.random.rand() + 1j * np.random.rand())
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_scalar_float(self):
        x = np.random.rand()
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_bool(self):
        x = np.bool_(1)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

        x = np.bool_(0)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_complex(self):
        x = np.random.rand() + 1j * np.random.rand()
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_numpy_float(self):
        x = [np.float32(np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_numpy_float_complex(self):
        if not hasattr(np, 'complex128'):
            pytest.skip('numpy can not handle complex128')

        # mixed float32 / complex128 elements in one list
        x = [np.float32(np.random.rand()) for i in range(5)] + \
            [np.complex128(np.random.rand() + 1j * np.random.rand())
             for i in range(5)]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_float(self):
        x = [np.random.rand() for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_float_complex(self):
        x = [np.random.rand() for i in range(5)] + \
            [(np.random.rand() + 1j * np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_dict_float(self):
        x = {'foo': 1.0, 'bar': 2.0}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_complex(self):
        x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        # values must come back with the exact same class, not a lossy cast
        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="complex value")

    def test_dict_numpy_float(self):
        x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_numpy_complex(self):
        x = {'foo': np.complex128(1.0 + 1.0j),
             'bar': np.complex128(2.0 + 2.0j)}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")

    def test_numpy_array_float(self):

        # run multiple times
        for n in range(10):
            x = np.random.rand(10)
            for dtype in ['float32', 'float64']:
                x = x.astype(dtype)
                x_rec = self.encode_decode(x)
                tm.assert_almost_equal(x, x_rec)

    def test_numpy_array_complex(self):
        x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
        x_rec = self.encode_decode(x)
        # both the element values and the dtype must survive the round-trip
        assert (all(map(lambda x, y: x == y, x, x_rec)) and
                x.dtype == x_rec.dtype)

    def test_list_mixed(self):
        x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)
class TestBasic(TestPackers):
    """Round-trip scalar pandas types: Timestamp, NaT, datetimes,
    timedeltas, Period and Interval."""

    def test_timestamp(self):
        # naive, tz-aware and minute-resolution timestamps
        for i in [Timestamp('20130101'),
                  Timestamp('20130101', tz='US/Eastern'),
                  Timestamp('201301010501')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_nat(self):
        # NaT must come back as the NaT singleton, not an equal copy
        nat_rec = self.encode_decode(NaT)
        assert NaT is nat_rec

    def test_datetimes(self):
        for i in [datetime.datetime(2013, 1, 1),
                  datetime.datetime(2013, 1, 1, 5, 1),
                  datetime.date(2013, 1, 1),
                  np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_timedeltas(self):
        for i in [datetime.timedelta(days=1),
                  datetime.timedelta(days=1, seconds=10),
                  np.timedelta64(1000000)]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_periods(self):
        # 13463
        for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_intervals(self):
        # 19967
        for i in [Interval(0, 1), Interval(0, 1, 'left'),
                  Interval(10, 25., 'right')]:
            i_rec = self.encode_decode(i)
            assert i == i_rec
  258. class TestIndex(TestPackers):
  259. def setup_method(self, method):
  260. super(TestIndex, self).setup_method(method)
  261. self.d = {
  262. 'string': tm.makeStringIndex(100),
  263. 'date': tm.makeDateIndex(100),
  264. 'int': tm.makeIntIndex(100),
  265. 'rng': tm.makeRangeIndex(100),
  266. 'float': tm.makeFloatIndex(100),
  267. 'empty': Index([]),
  268. 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
  269. 'period': Index(period_range('2012-1-1', freq='M', periods=3)),
  270. 'date2': Index(date_range('2013-01-1', periods=10)),
  271. 'bdate': Index(bdate_range('2013-01-02', periods=10)),
  272. 'cat': tm.makeCategoricalIndex(100),
  273. 'interval': tm.makeIntervalIndex(100),
  274. 'timedelta': tm.makeTimedeltaIndex(100, 'H')
  275. }
  276. self.mi = {
  277. 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
  278. ('foo', 'two'),
  279. ('qux', 'one'), ('qux', 'two')],
  280. names=['first', 'second']),
  281. }
  282. def test_basic_index(self):
  283. for s, i in self.d.items():
  284. i_rec = self.encode_decode(i)
  285. tm.assert_index_equal(i, i_rec)
  286. # datetime with no freq (GH5506)
  287. i = Index([Timestamp('20130101'), Timestamp('20130103')])
  288. i_rec = self.encode_decode(i)
  289. tm.assert_index_equal(i, i_rec)
  290. # datetime with timezone
  291. i = Index([Timestamp('20130101 9:00:00'), Timestamp(
  292. '20130103 11:00:00')]).tz_localize('US/Eastern')
  293. i_rec = self.encode_decode(i)
  294. tm.assert_index_equal(i, i_rec)
  295. def test_multi_index(self):
  296. for s, i in self.mi.items():
  297. i_rec = self.encode_decode(i)
  298. tm.assert_index_equal(i, i_rec)
  299. def test_unicode(self):
  300. i = tm.makeUnicodeIndex(100)
  301. i_rec = self.encode_decode(i)
  302. tm.assert_index_equal(i, i_rec)
  303. def categorical_index(self):
  304. # GH15487
  305. df = DataFrame(np.random.randn(10, 2))
  306. df = df.astype({0: 'category'}).set_index(0)
  307. result = self.encode_decode(df)
  308. tm.assert_frame_equal(result, df)
  309. class TestSeries(TestPackers):
  310. def setup_method(self, method):
  311. super(TestSeries, self).setup_method(method)
  312. self.d = {}
  313. s = tm.makeStringSeries()
  314. s.name = 'string'
  315. self.d['string'] = s
  316. s = tm.makeObjectSeries()
  317. s.name = 'object'
  318. self.d['object'] = s
  319. s = Series(iNaT, dtype='M8[ns]', index=range(5))
  320. self.d['date'] = s
  321. data = {
  322. 'A': [0., 1., 2., 3., np.nan],
  323. 'B': [0, 1, 0, 1, 0],
  324. 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
  325. 'D': date_range('1/1/2009', periods=5),
  326. 'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
  327. 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 +
  328. [Timestamp('20130603', tz='CET')] * 3,
  329. 'G': [Timestamp('20130102', tz='US/Eastern')] * 5,
  330. 'H': Categorical([1, 2, 3, 4, 5]),
  331. 'I': Categorical([1, 2, 3, 4, 5], ordered=True),
  332. 'J': (np.bool_(1), 2, 3, 4, 5),
  333. }
  334. self.d['float'] = Series(data['A'])
  335. self.d['int'] = Series(data['B'])
  336. self.d['mixed'] = Series(data['E'])
  337. self.d['dt_tz_mixed'] = Series(data['F'])
  338. self.d['dt_tz'] = Series(data['G'])
  339. self.d['cat_ordered'] = Series(data['H'])
  340. self.d['cat_unordered'] = Series(data['I'])
  341. self.d['numpy_bool_mixed'] = Series(data['J'])
  342. def test_basic(self):
  343. # run multiple times here
  344. for n in range(10):
  345. for s, i in self.d.items():
  346. i_rec = self.encode_decode(i)
  347. assert_series_equal(i, i_rec)
  348. class TestCategorical(TestPackers):
  349. def setup_method(self, method):
  350. super(TestCategorical, self).setup_method(method)
  351. self.d = {}
  352. self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e'])
  353. self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'],
  354. ordered=True)
  355. self.d['plain_int'] = Categorical([5, 6, 7, 8])
  356. self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True)
  357. def test_basic(self):
  358. # run multiple times here
  359. for n in range(10):
  360. for s, i in self.d.items():
  361. i_rec = self.encode_decode(i)
  362. assert_categorical_equal(i, i_rec)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestNDFrame(TestPackers):
    """Round-trip DataFrames and Panels, multi-object packs, and a few
    regression cases (freq inference, duplicate column names)."""

    def setup_method(self, method):
        super(TestNDFrame, self).setup_method(method)

        data = {
            'A': [0., 1., 2., 3., np.nan],
            'B': [0, 1, 0, 1, 0],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': date_range('1/1/2009', periods=5),
            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
            'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
            'G': [Timestamp('20130603', tz='CET')] * 5,
            'H': Categorical(['a', 'b', 'c', 'd', 'e']),
            'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
        }

        self.frame = {
            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
            'mixed': DataFrame(data)}

        self.panel = {
            'float': Panel(dict(ItemA=self.frame['float'],
                                ItemB=self.frame['float'] + 1))}

    def test_basic_frame(self):

        for s, i in self.frame.items():
            i_rec = self.encode_decode(i)
            assert_frame_equal(i, i_rec)

    def test_basic_panel(self):

        with catch_warnings(record=True):
            for s, i in self.panel.items():
                i_rec = self.encode_decode(i)
                assert_panel_equal(i, i_rec)

    def test_multi(self):

        # a dict of frames round-trips key-by-key
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

        packed_items = tuple([self.frame['float'], self.frame['float'].A,
                              self.frame['float'].B, None])
        l_rec = self.encode_decode(packed_items)
        check_arbitrary(packed_items, l_rec)

        # this is an oddity in that packed lists will be returned as tuples
        packed_items = [self.frame['float'], self.frame['float'].A,
                        self.frame['float'].B, None]
        l_rec = self.encode_decode(packed_items)
        assert isinstance(l_rec, tuple)
        check_arbitrary(packed_items, l_rec)

    def test_iterator(self):

        packed_items = [self.frame['float'], self.frame['float'].A,
                        self.frame['float'].B, None]

        with ensure_clean(self.path) as path:
            to_msgpack(path, *packed_items)
            for i, packed in enumerate(read_msgpack(path, iterator=True)):
                check_arbitrary(packed, packed_items[i])

    def tests_datetimeindex_freq_issue(self):

        # GH 5947
        # inferring freq on the datetimeindex
        df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013'))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

        df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013'))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

    def test_dataframe_duplicate_column_names(self):

        # GH 9618
        expected_1 = DataFrame(columns=['a', 'a'])
        expected_2 = DataFrame(columns=[1] * 100)
        expected_2.loc[0] = np.random.randn(100)
        expected_3 = DataFrame(columns=[1, 1])
        expected_3.loc[0] = ['abc', np.nan]

        result_1 = self.encode_decode(expected_1)
        result_2 = self.encode_decode(expected_2)
        result_3 = self.encode_decode(expected_3)

        assert_frame_equal(result_1, expected_1)
        assert_frame_equal(result_2, expected_2)
        assert_frame_equal(result_3, expected_3)
class TestSparse(TestPackers):
    """Sparse structures are intentionally unsupported: packing them must
    raise NotImplementedError rather than silently produce bad data."""

    def _check_roundtrip(self, obj, comparator, **kwargs):

        # currently these are not implemented
        # i_rec = self.encode_decode(obj)
        # comparator(obj, i_rec, **kwargs)
        msg = r"msgpack sparse (series|frame) is not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            self.encode_decode(obj)

    def test_sparse_series(self):

        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal,
                              check_series_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_series_equal,
                              check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal,
                              check_series_type=True)

    def test_sparse_frame(self):

        s = tm.makeDataFrame()
        s.loc[3:5, 1:3] = np.nan
        s.loc[8:10, -2] = np.nan
        ss = s.to_sparse()

        self._check_roundtrip(ss, tm.assert_frame_equal,
                              check_frame_type=True)

        ss2 = s.to_sparse(kind='integer')
        self._check_roundtrip(ss2, tm.assert_frame_equal,
                              check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_frame_equal,
                              check_frame_type=True)
class TestCompression(TestPackers):
    """See https://github.com/pandas-dev/pandas/pull/9783
    """

    def setup_method(self, method):
        # sqlalchemy is optional; record availability for the *_to_sql tests
        try:
            from sqlalchemy import create_engine
            self._create_sql_engine = create_engine
        except ImportError:
            self._SQLALCHEMY_INSTALLED = False
        else:
            self._SQLALCHEMY_INSTALLED = True

        super(TestCompression, self).setup_method(method)
        data = {
            'A': np.arange(1000, dtype=np.float64),
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=x) for x in range(1000)],
        }
        # NOTE(review): ['A', 'A'] / ['B', 'B'] produce single-column frames
        # (duplicate keys collapse in the dict comprehension) -- possibly
        # meant to be ['A', 'B']; kept as-is to preserve behavior.
        self.frame = {
            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
            'mixed': DataFrame(data),
        }

    def test_plain(self):
        # no compression at all: plain round-trip
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

    def _test_compression(self, compress):
        i_rec = self.encode_decode(self.frame, compress=compress)
        for k in self.frame.keys():
            value = i_rec[k]
            expected = self.frame[k]
            assert_frame_equal(value, expected)
            # make sure that we can write to the new frames
            for block in value._data.blocks:
                assert block.values.flags.writeable

    def test_compression_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_compression('zlib')

    def test_compression_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_compression('blosc')

    def _test_compression_warns_when_decompress_caches(
            self, monkeypatch, compress):
        not_garbage = []
        control = []  # copied data

        # look the compression module up by name so 'zlib'/'blosc' both work
        compress_module = globals()[compress]
        real_decompress = compress_module.decompress

        def decompress(ob):
            """mock decompress function that delegates to the real
            decompress but caches the result and a copy of the result.
            """
            res = real_decompress(ob)
            not_garbage.append(res)  # hold a reference to this bytes object
            control.append(bytearray(res))  # copy the data here to check later
            return res

        # types mapped to values to add in place.
        rhs = {
            np.dtype('float64'): 1.0,
            np.dtype('int32'): 1,
            np.dtype('object'): 'a',
            np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'),
            np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'),
        }

        with monkeypatch.context() as m, \
                tm.assert_produces_warning(PerformanceWarning) as ws:
            m.setattr(compress_module, 'decompress', decompress)

            i_rec = self.encode_decode(self.frame, compress=compress)
            for k in self.frame.keys():

                value = i_rec[k]
                expected = self.frame[k]
                assert_frame_equal(value, expected)
                # make sure that we can write to the new frames even though
                # we needed to copy the data
                for block in value._data.blocks:
                    assert block.values.flags.writeable
                    # mutate the data in some way
                    block.values[0] += rhs[block.dtype]

        for w in ws:
            # check the messages from our warnings
            assert str(w.message) == ('copying data after decompressing; '
                                      'this may mean that decompress is '
                                      'caching its result')

        for buf, control_buf in zip(not_garbage, control):
            # make sure none of our mutations above affected the
            # original buffers
            assert buf == control_buf

    def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_compression_warns_when_decompress_caches(
            monkeypatch, 'zlib')

    def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_compression_warns_when_decompress_caches(
            monkeypatch, 'blosc')

    def _test_small_strings_no_warn(self, compress):
        # small buffers are not decompressed through the caching path, so no
        # PerformanceWarning may be emitted
        empty = np.array([], dtype='uint8')
        with tm.assert_produces_warning(None):
            empty_unpacked = self.encode_decode(empty, compress=compress)

        tm.assert_numpy_array_equal(empty_unpacked, empty)
        assert empty_unpacked.flags.writeable

        char = np.array([ord(b'a')], dtype='uint8')
        with tm.assert_produces_warning(None):
            char_unpacked = self.encode_decode(char, compress=compress)

        tm.assert_numpy_array_equal(char_unpacked, char)
        assert char_unpacked.flags.writeable
        # if this test fails I am sorry because the interpreter is now in a
        # bad state where b'a' points to 98 == ord(b'b').
        char_unpacked[0] = ord(b'b')

        # we compare the ord of bytes b'a' with unicode u'a' because the should
        # always be the same (unless we were able to mutate the shared
        # character singleton in which case ord(b'a') == ord(b'b').
        assert ord(b'a') == ord(u'a')
        tm.assert_numpy_array_equal(
            char_unpacked,
            np.array([ord(b'b')], dtype='uint8'),
        )

    def test_small_strings_no_warn_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        self._test_small_strings_no_warn('zlib')

    def test_small_strings_no_warn_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        self._test_small_strings_no_warn('blosc')

    def test_readonly_axis_blosc(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        df1 = DataFrame({'A': list('abcd')})
        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
        assert 1 in self.encode_decode(df1['A'], compress='blosc')
        assert 1. in self.encode_decode(df2['A'], compress='blosc')

    def test_readonly_axis_zlib(self):
        # GH11880
        df1 = DataFrame({'A': list('abcd')})
        df2 = DataFrame(df1, index=[1., 2., 3., 4.])
        assert 1 in self.encode_decode(df1['A'], compress='zlib')
        assert 1. in self.encode_decode(df2['A'], compress='zlib')

    def test_readonly_axis_blosc_to_sql(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip('no blosc')
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip('no sqlalchemy')
        expected = DataFrame({'A': list('abcd')})
        df = self.encode_decode(expected, compress='blosc')
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql('test', eng, if_exists='append')
        result = pandas.read_sql_table('test', eng, index_col='index')
        result.index.names = [None]
        assert_frame_equal(expected, result)

    def test_readonly_axis_zlib_to_sql(self):
        # GH11880
        if not _ZLIB_INSTALLED:
            pytest.skip('no zlib')
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip('no sqlalchemy')
        expected = DataFrame({'A': list('abcd')})
        df = self.encode_decode(expected, compress='zlib')
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql('test', eng, if_exists='append')
        result = pandas.read_sql_table('test', eng, index_col='index')
        result.index.names = [None]
        assert_frame_equal(expected, result)
class TestEncoding(TestPackers):
    """Round-trips with explicit text ``encoding=`` arguments (GH10581)."""

    def setup_method(self, method):
        super(TestEncoding, self).setup_method(method)
        data = {
            'A': [compat.u('\u2019')] * 1000,
            'B': np.arange(1000, dtype=np.int32),
            'C': list(100 * 'abcdefghij'),
            'D': date_range(datetime.datetime(2015, 4, 1), periods=1000),
            'E': [datetime.timedelta(days=x) for x in range(1000)],
            'G': [400] * 1000
        }
        # NOTE(review): the 'float' key actually holds unicode strings
        # (column 'A'); the key names were copied from TestCompression and
        # are historic -- kept to preserve behavior.
        self.frame = {
            'float': DataFrame({k: data[k] for k in ['A', 'A']}),
            'int': DataFrame({k: data[k] for k in ['B', 'B']}),
            'mixed': DataFrame(data),
        }
        self.utf_encodings = ['utf8', 'utf16', 'utf32']

    def test_utf(self):
        # GH10581
        for encoding in self.utf_encodings:
            for frame in compat.itervalues(self.frame):
                result = self.encode_decode(frame, encoding=encoding)
                assert_frame_equal(result, frame)

    def test_default_encoding(self):
        # the implicit default must match an explicit encoding='utf8'
        for frame in compat.itervalues(self.frame):
            result = frame.to_msgpack()
            expected = frame.to_msgpack(encoding='utf8')
            assert result == expected
            result = self.encode_decode(frame)
            assert_frame_equal(result, frame)
  670. files = glob.glob(os.path.join(os.path.dirname(__file__), "data",
  671. "legacy_msgpack", "*", "*.msgpack"))
  672. @pytest.fixture(params=files)
  673. def legacy_packer(request, datapath):
  674. return datapath(request.param)
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
class TestMsgpack(object):
    """
    How to add msgpack tests:

    1. Install pandas version intended to output the msgpack.
    2. Execute "generate_legacy_storage_files.py" to create the msgpack.
    $ python generate_legacy_storage_files.py <output_dir> msgpack
    3. Move the created msgpack to "data/legacy_msgpack/<version>" directory.
    """

    # every legacy file must at minimum contain these kinds per type
    minimum_structure = {'series': ['float', 'int', 'mixed',
                                    'ts', 'mi', 'dup'],
                         'frame': ['float', 'int', 'mixed', 'mi'],
                         'panel': ['float'],
                         'index': ['int', 'date', 'period'],
                         'mi': ['reg2']}

    def check_min_structure(self, data, version):
        for typ, v in self.minimum_structure.items():
            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
            for kind in v:
                msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
                assert kind in data[typ], msg

    def compare(self, current_data, all_data, vf, version):
        # Unpack the legacy file ``vf`` and compare every object against
        # the data generated by the current pandas version.
        # GH12277 encoding default used to be latin-1, now utf-8
        if LooseVersion(version) < LooseVersion('0.18.0'):
            data = read_msgpack(vf, encoding='latin-1')
        else:
            data = read_msgpack(vf)
        self.check_min_structure(data, version)
        for typ, dv in data.items():
            assert typ in all_data, ('unpacked data contains '
                                     'extra key "{0}"'
                                     .format(typ))
            for dt, result in dv.items():
                assert dt in current_data[typ], ('data["{0}"] contains extra '
                                                 'key "{1}"'.format(typ, dt))
                try:
                    expected = current_data[typ][dt]
                except KeyError:
                    continue

                # use a specific comparator
                # if available
                comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
                comparator = getattr(self, comp_method, None)
                if comparator is not None:
                    comparator(result, expected, typ, version)
                else:
                    check_arbitrary(result, expected)

        return data

    def compare_series_dt_tz(self, result, expected, typ, version):
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
            tm.assert_series_equal(result, expected)
        else:
            tm.assert_series_equal(result, expected)

    def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
        # 8260
        # dtype is object < 0.17.0
        if LooseVersion(version) < LooseVersion('0.17.0'):
            expected = expected.astype(object)
            tm.assert_frame_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)

    def test_msgpacks_legacy(self, current_packers_data, all_packers_data,
                             legacy_packer, datapath):

        # the version is encoded in the fixture's parent directory name
        version = os.path.basename(os.path.dirname(legacy_packer))

        # GH12142 0.17 files packed in P2 can't be read in P3
        if (compat.PY3 and version.startswith('0.17.') and
                legacy_packer.split('.')[-4][-1] == '2'):
            msg = "Files packed in Py2 can't be read in Py3 ({})"
            pytest.skip(msg.format(version))
        try:
            with catch_warnings(record=True):
                self.compare(current_packers_data, all_packers_data,
                             legacy_packer, version)
        except ImportError:
            # blosc not installed
            pass

    def test_msgpack_period_freq(self):
        # https://github.com/pandas-dev/pandas/issues/24135
        s = Series(np.random.rand(5), index=date_range('20130101', periods=5))
        r = read_msgpack(s.to_msgpack())
        repr(r)