test_block_internals.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from datetime import datetime, timedelta
  4. import itertools
  5. import numpy as np
  6. import pytest
  7. from pandas.compat import StringIO
  8. import pandas as pd
  9. from pandas import (
  10. Categorical, DataFrame, Series, Timestamp, compat, date_range,
  11. option_context)
  12. from pandas.core.arrays import IntervalArray, integer_array
  13. from pandas.core.internals.blocks import IntBlock
  14. import pandas.util.testing as tm
  15. from pandas.util.testing import (
  16. assert_almost_equal, assert_frame_equal, assert_series_equal)
  17. # Segregated collection of methods that require the BlockManager internal data
  18. # structure
  19. class TestDataFrameBlockInternals():
  20. def test_setitem_invalidates_datetime_index_freq(self):
  21. # GH#24096 altering a datetime64tz column inplace invalidates the
  22. # `freq` attribute on the underlying DatetimeIndex
  23. dti = date_range('20130101', periods=3, tz='US/Eastern')
  24. ts = dti[1]
  25. df = DataFrame({'B': dti})
  26. assert df['B']._values.freq == 'D'
  27. df.iloc[1, 0] = pd.NaT
  28. assert df['B']._values.freq is None
  29. # check that the DatetimeIndex was not altered in place
  30. assert dti.freq == 'D'
  31. assert dti[1] == ts
  def test_cast_internals(self, float_frame):
      """Frames built from ``_data`` and ``_series`` agree for a given dtype."""
      for target in (int, np.int32):
          from_manager = DataFrame(float_frame._data, dtype=target)
          from_series = DataFrame(float_frame._series, dtype=target)
          assert_frame_equal(from_manager, from_series)
  def test_consolidate(self, float_frame):
      """``_consolidate`` reduces the block count, as a copy or in place."""
      float_frame['E'] = 7.
      merged = float_frame._consolidate()
      n_blocks = len(merged._data.blocks)
      assert n_blocks == 1

      # Ensure copy, do I want this?
      merged_again = merged._consolidate()
      assert merged_again is not merged
      tm.assert_frame_equal(merged_again, merged)

      float_frame['F'] = 8.
      n_blocks = len(float_frame._data.blocks)
      assert n_blocks == 3

      # the in-place variant acts on the fixture's own block manager
      float_frame._consolidate(inplace=True)
      n_blocks = len(float_frame._data.blocks)
      assert n_blocks == 1
  def test_consolidate_inplace(self, float_frame):
      """Adding many single columns in a row must not raise."""
      frame = float_frame.copy()  # noqa

      # triggers in-place consolidation
      for code in range(ord('A'), ord('Z')):
          name = chr(code)
          float_frame[name] = name
  def test_values_consolidate(self, float_frame):
      """Reading ``.values`` on an unconsolidated frame consolidates it."""
      float_frame['E'] = 7.
      assert not float_frame._data.is_consolidated()

      # the returned array is irrelevant; only the side effect is checked
      float_frame.values  # noqa
      assert float_frame._data.is_consolidated()
  def test_modify_values(self, float_frame):
      """A row written through ``.values`` is seen by the next ``.values``."""
      float_frame.values[5] = 5
      row = float_frame.values[5]
      assert (row == 5).all()

      # unconsolidated
      float_frame['E'] = 7.
      float_frame.values[6] = 6
      row = float_frame.values[6]
      assert (row == 6).all()
  def test_boolean_set_uncons(self, float_frame):
      """Boolean-mask assignment on the frame matches the same on an array."""
      float_frame['E'] = 7.

      # apply the edit to a plain copy of the values first
      wanted = float_frame.values.copy()
      above_one = wanted > 1
      wanted[above_one] = 2

      float_frame[float_frame > 1] = 2
      assert_almost_equal(wanted, float_frame.values)
  def test_values_numeric_cols(self, float_frame):
      """Selecting A-D gives float64 values despite an added string column."""
      float_frame['foo'] = 'bar'

      numeric_part = float_frame[['A', 'B', 'C', 'D']]
      assert numeric_part.values.dtype == np.float64
  def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
      """Check the dtype of ``.values`` for several column selections."""
      # mixed lcd
      float_cases = [
          (['A', 'B', 'C', 'D'], np.float64),
          (['A', 'B', 'C'], np.float32),
          (['C'], np.float16),
      ]
      for columns, dtype in float_cases:
          assert mixed_float_frame[columns].values.dtype == dtype

      # GH 10364
      int_cases = [
          # B uint64 forces float because there are other signed int types
          (['A', 'B', 'C', 'D'], np.float64),
          (['A', 'D'], np.int64),
          # B uint64 forces float because there are other signed int types
          (['A', 'B', 'C'], np.float64),
          # as B and C are both unsigned, no forcing to float is needed
          (['B', 'C'], np.uint64),
          (['A', 'C'], np.int32),
          (['C', 'D'], np.int64),
          (['A'], np.int32),
          (['C'], np.uint8),
      ]
      for columns, dtype in int_cases:
          assert mixed_int_frame[columns].values.dtype == dtype
  106. def test_constructor_with_convert(self):
  107. # this is actually mostly a test of lib.maybe_convert_objects
  108. # #2845
  109. df = DataFrame({'A': [2 ** 63 - 1]})
  110. result = df['A']
  111. expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A')
  112. assert_series_equal(result, expected)
  113. df = DataFrame({'A': [2 ** 63]})
  114. result = df['A']
  115. expected = Series(np.asarray([2 ** 63], np.uint64), name='A')
  116. assert_series_equal(result, expected)
  117. df = DataFrame({'A': [datetime(2005, 1, 1), True]})
  118. result = df['A']
  119. expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_),
  120. name='A')
  121. assert_series_equal(result, expected)
  122. df = DataFrame({'A': [None, 1]})
  123. result = df['A']
  124. expected = Series(np.asarray([np.nan, 1], np.float_), name='A')
  125. assert_series_equal(result, expected)
  126. df = DataFrame({'A': [1.0, 2]})
  127. result = df['A']
  128. expected = Series(np.asarray([1.0, 2], np.float_), name='A')
  129. assert_series_equal(result, expected)
  130. df = DataFrame({'A': [1.0 + 2.0j, 3]})
  131. result = df['A']
  132. expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A')
  133. assert_series_equal(result, expected)
  134. df = DataFrame({'A': [1.0 + 2.0j, 3.0]})
  135. result = df['A']
  136. expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A')
  137. assert_series_equal(result, expected)
  138. df = DataFrame({'A': [1.0 + 2.0j, True]})
  139. result = df['A']
  140. expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A')
  141. assert_series_equal(result, expected)
  142. df = DataFrame({'A': [1.0, None]})
  143. result = df['A']
  144. expected = Series(np.asarray([1.0, np.nan], np.float_), name='A')
  145. assert_series_equal(result, expected)
  146. df = DataFrame({'A': [1.0 + 2.0j, None]})
  147. result = df['A']
  148. expected = Series(np.asarray(
  149. [1.0 + 2.0j, np.nan], np.complex_), name='A')
  150. assert_series_equal(result, expected)
  151. df = DataFrame({'A': [2.0, 1, True, None]})
  152. result = df['A']
  153. expected = Series(np.asarray(
  154. [2.0, 1, True, None], np.object_), name='A')
  155. assert_series_equal(result, expected)
  156. df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]})
  157. result = df['A']
  158. expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1),
  159. None], np.object_), name='A')
  160. assert_series_equal(result, expected)
  def test_construction_with_mixed(self, float_string_frame):
      """Check dtype counts for datetime-like data and a mixed-type frame."""
      # test construction edge cases with mixed types

      # f7u12, this does not work without extensive workaround
      data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
              [datetime(2000, 1, 2), datetime(2000, 1, 3),
               datetime(2000, 1, 1)]]
      df = DataFrame(data)

      # check dtypes
      result = df.get_dtype_counts().sort_values()
      expected = Series({'datetime64[ns]': 3})
      # this comparison was missing: both values were built, never checked
      assert_series_equal(result, expected)

      # mixed-type frames
      float_string_frame['datetime'] = datetime.now()
      float_string_frame['timedelta'] = timedelta(days=1, seconds=1)
      assert float_string_frame['datetime'].dtype == 'M8[ns]'
      assert float_string_frame['timedelta'].dtype == 'm8[ns]'

      result = float_string_frame.get_dtype_counts().sort_values()
      expected = Series({'float64': 4,
                         'object': 1,
                         'datetime64[ns]': 1,
                         'timedelta64[ns]': 1}).sort_values()
      assert_series_equal(result, expected)
  182. def test_construction_with_conversions(self):
  183. # convert from a numpy array of non-ns timedelta64
  184. arr = np.array([1, 2, 3], dtype='timedelta64[s]')
  185. df = DataFrame(index=range(3))
  186. df['A'] = arr
  187. expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3,
  188. freq='s')},
  189. index=range(3))
  190. assert_frame_equal(df, expected)
  191. expected = DataFrame({
  192. 'dt1': Timestamp('20130101'),
  193. 'dt2': date_range('20130101', periods=3),
  194. # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
  195. }, index=range(3))
  196. df = DataFrame(index=range(3))
  197. df['dt1'] = np.datetime64('2013-01-01')
  198. df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
  199. dtype='datetime64[D]')
  200. # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
  201. # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
  202. assert_frame_equal(df, expected)
  def test_constructor_compound_dtypes(self):
      """Compound dtypes raise NotImplementedError (GH 5191)."""

      def build(dtype):
          # nine identical (datetime, str, int) rows
          row = (datetime(2001, 1, 1), "aa", 20)
          rows = list(itertools.repeat(row, 9))
          return DataFrame(data=rows, columns=["A", "B", "C"], dtype=dtype)

      compound = [("A", "datetime64[h]"),
                  ("B", "str"),
                  ("C", "int32")]
      with pytest.raises(NotImplementedError):
          build(compound)

      # these work (though results may be unexpected)
      for simple in ('int64', 'float64'):
          build(simple)

      # 10822
      # invalid error message on dt inference
      if not compat.is_platform_windows():
          build('M8[ns]')
  223. def test_equals_different_blocks(self):
  224. # GH 9330
  225. df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
  226. "C": ["w", "z"]})
  227. df1 = df0.reset_index()[["A", "B", "C"]]
  228. # this assert verifies that the above operations have
  229. # induced a block rearrangement
  230. assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype)
  231. # do the real tests
  232. assert_frame_equal(df0, df1)
  233. assert df0.equals(df1)
  234. assert df1.equals(df0)
  def test_copy_blocks(self, float_frame):
      """With the default copy, editing a block frame leaves ``df`` alone."""
      # API/ENH 9607
      df = DataFrame(float_frame, copy=True)
      column = df.columns[0]

      # use the default copy=True, change a column

      # deprecated 0.21.0
      with tm.assert_produces_warning(FutureWarning,
                                      check_stacklevel=False):
          blocks = df.as_blocks()
      for block_frame in blocks.values():
          if column in block_frame:
              block_frame.loc[:, column] = block_frame[column] + 1

      # make sure we did not change the original DataFrame
      # (uses the last frame visited by the loop)
      assert not block_frame[column].equals(df[column])
  def test_no_copy_blocks(self, float_frame):
      """With ``copy=False``, editing a block frame also changes ``df``."""
      # API/ENH 9607
      df = DataFrame(float_frame, copy=True)
      column = df.columns[0]

      # use the copy=False, change a column

      # deprecated 0.21.0
      with tm.assert_produces_warning(FutureWarning,
                                      check_stacklevel=False):
          blocks = df.as_blocks(copy=False)
      for block_frame in blocks.values():
          if column in block_frame:
              block_frame.loc[:, column] = block_frame[column] + 1

      # make sure we did change the original DataFrame
      # (uses the last frame visited by the loop)
      assert block_frame[column].equals(df[column])
  def test_copy(self, float_frame, float_string_frame):
      """``copy`` gives a frame that is independent of the source."""
      duplicate = float_frame.copy()
      duplicate['E'] = duplicate['A']
      assert 'E' not in float_frame

      # copy objects
      mixed_duplicate = float_string_frame.copy()
      assert mixed_duplicate._data is not float_string_frame._data
  def test_pickle(self, float_string_frame, empty_frame, timezone_frame):
      """Round-trip mixed, empty and tz-aware frames through pickle."""
      restored = tm.round_trip_pickle(float_string_frame)
      assert_frame_equal(float_string_frame, restored)

      # buglet
      float_string_frame._data.ndim

      # empty: only check that the result can be rendered
      restored = tm.round_trip_pickle(empty_frame)
      repr(restored)

      # tz frame
      restored = tm.round_trip_pickle(timezone_frame)
      assert_frame_equal(timezone_frame, restored)
  281. def test_consolidate_datetime64(self):
  282. # numpy vstack bug
  283. data = """\
  284. starting,ending,measure
  285. 2012-06-21 00:00,2012-06-23 07:00,77
  286. 2012-06-23 07:00,2012-06-23 16:30,65
  287. 2012-06-23 16:30,2012-06-25 08:00,77
  288. 2012-06-25 08:00,2012-06-26 12:00,0
  289. 2012-06-26 12:00,2012-06-27 08:00,77
  290. """
  291. df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
  292. ser_starting = df.starting
  293. ser_starting.index = ser_starting.values
  294. ser_starting = ser_starting.tz_localize('US/Eastern')
  295. ser_starting = ser_starting.tz_convert('UTC')
  296. ser_starting.index.name = 'starting'
  297. ser_ending = df.ending
  298. ser_ending.index = ser_ending.values
  299. ser_ending = ser_ending.tz_localize('US/Eastern')
  300. ser_ending = ser_ending.tz_convert('UTC')
  301. ser_ending.index.name = 'ending'
  302. df.starting = ser_starting.index
  303. df.ending = ser_ending.index
  304. tm.assert_index_equal(pd.DatetimeIndex(
  305. df.starting), ser_starting.index)
  306. tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
  def test_is_mixed_type(self, float_frame, float_string_frame):
      """``_is_mixed_type`` is falsy for one fixture, truthy for the other."""
      homogeneous = float_frame._is_mixed_type
      assert not homogeneous

      heterogeneous = float_string_frame._is_mixed_type
      assert heterogeneous
  310. def test_get_numeric_data(self):
  311. # TODO(wesm): unused?
  312. intname = np.dtype(np.int_).name # noqa
  313. floatname = np.dtype(np.float_).name # noqa
  314. datetime64name = np.dtype('M8[ns]').name
  315. objectname = np.dtype(np.object_).name
  316. df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
  317. 'f': Timestamp('20010102')},
  318. index=np.arange(10))
  319. result = df.get_dtype_counts()
  320. expected = Series({'int64': 1, 'float64': 1,
  321. datetime64name: 1, objectname: 1})
  322. result = result.sort_index()
  323. expected = expected.sort_index()
  324. assert_series_equal(result, expected)
  325. df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
  326. 'd': np.array([1.] * 10, dtype='float32'),
  327. 'e': np.array([1] * 10, dtype='int32'),
  328. 'f': np.array([1] * 10, dtype='int16'),
  329. 'g': Timestamp('20010102')},
  330. index=np.arange(10))
  331. result = df._get_numeric_data()
  332. expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
  333. assert_frame_equal(result, expected)
  334. only_obj = df.loc[:, ['c', 'g']]
  335. result = only_obj._get_numeric_data()
  336. expected = df.loc[:, []]
  337. assert_frame_equal(result, expected)
  338. df = DataFrame.from_dict(
  339. {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
  340. result = df._get_numeric_data()
  341. expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
  342. assert_frame_equal(result, expected)
  343. df = result.copy()
  344. result = df._get_numeric_data()
  345. expected = df
  346. assert_frame_equal(result, expected)
  347. def test_get_numeric_data_extension_dtype(self):
  348. # GH 22290
  349. df = DataFrame({
  350. 'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'),
  351. 'B': Categorical(list('abcabc')),
  352. 'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'),
  353. 'D': IntervalArray.from_breaks(range(7))})
  354. result = df._get_numeric_data()
  355. expected = df.loc[:, ['A', 'C']]
  356. assert_frame_equal(result, expected)
  def test_convert_objects(self, float_string_frame):
      """Exercise ``_convert`` and ``astype`` on string-typed columns."""
      round_tripped = float_string_frame.T.T
      converted = round_tripped._convert(datetime=True)
      assert_frame_equal(converted, float_string_frame)
      assert converted['A'].dtype == np.float64

      # force numeric conversion
      float_string_frame['H'] = '1.'
      float_string_frame['I'] = '1'

      # add in some items that will be nan
      length = len(float_string_frame)
      float_string_frame['J'] = '1.'
      float_string_frame['K'] = '1'
      float_string_frame.loc[0:5, ['J', 'K']] = 'garbled'
      converted = float_string_frame._convert(datetime=True, numeric=True)

      expected_dtypes = [('H', 'float64'), ('I', 'int64'),
                         ('J', 'float64'), ('K', 'float64')]
      for name, dtype in expected_dtypes:
          assert converted[name].dtype == dtype
      for name in ('J', 'K'):
          assert len(converted[name].dropna()) == length - 5

      # via astype
      converted = float_string_frame.copy()
      converted['H'] = converted['H'].astype('float64')
      converted['I'] = converted['I'].astype('int64')
      assert converted['H'].dtype == 'float64'
      assert converted['I'].dtype == 'int64'

      # via astype, but errors
      converted = float_string_frame.copy()
      with pytest.raises(ValueError, match='invalid literal'):
          converted['H'].astype('int32')

      # mixed in a single column
      mixed = DataFrame(dict(s=Series([1, 'na', 3, 4])))
      result = mixed._convert(datetime=True, numeric=True)
      expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
      assert_frame_equal(result, expected)
  392. def test_convert_objects_no_conversion(self):
  393. mixed1 = DataFrame(
  394. {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
  395. mixed2 = mixed1._convert(datetime=True)
  396. assert_frame_equal(mixed1, mixed2)
  397. def test_infer_objects(self):
  398. # GH 11221
  399. df = DataFrame({'a': ['a', 1, 2, 3],
  400. 'b': ['b', 2.0, 3.0, 4.1],
  401. 'c': ['c', datetime(2016, 1, 1),
  402. datetime(2016, 1, 2),
  403. datetime(2016, 1, 3)],
  404. 'd': [1, 2, 3, 'd']},
  405. columns=['a', 'b', 'c', 'd'])
  406. df = df.iloc[1:].infer_objects()
  407. assert df['a'].dtype == 'int64'
  408. assert df['b'].dtype == 'float64'
  409. assert df['c'].dtype == 'M8[ns]'
  410. assert df['d'].dtype == 'object'
  411. expected = DataFrame({'a': [1, 2, 3],
  412. 'b': [2.0, 3.0, 4.1],
  413. 'c': [datetime(2016, 1, 1),
  414. datetime(2016, 1, 2),
  415. datetime(2016, 1, 3)],
  416. 'd': [2, 3, 'd']},
  417. columns=['a', 'b', 'c', 'd'])
  418. # reconstruct frame to verify inference is same
  419. tm.assert_frame_equal(df.reset_index(drop=True), expected)
  420. def test_stale_cached_series_bug_473(self):
  421. # this is chained, but ok
  422. with option_context('chained_assignment', None):
  423. Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'),
  424. columns=('e', 'f', 'g', 'h'))
  425. repr(Y)
  426. Y['e'] = Y['e'].astype('object')
  427. Y['g']['c'] = np.NaN
  428. repr(Y)
  429. result = Y.sum() # noqa
  430. exp = Y['g'].sum() # noqa
  431. assert pd.isna(Y['g']['c'])
  432. def test_get_X_columns(self):
  433. # numeric and object columns
  434. df = DataFrame({'a': [1, 2, 3],
  435. 'b': [True, False, True],
  436. 'c': ['foo', 'bar', 'baz'],
  437. 'd': [None, None, None],
  438. 'e': [3.14, 0.577, 2.773]})
  439. tm.assert_index_equal(df._get_numeric_data().columns,
  440. pd.Index(['a', 'b', 'e']))
  441. def test_strange_column_corruption_issue(self):
  442. # (wesm) Unclear how exactly this is related to internal matters
  443. df = DataFrame(index=[0, 1])
  444. df[0] = np.nan
  445. wasCol = {}
  446. # uncommenting these makes the results match
  447. # for col in xrange(100, 200):
  448. # wasCol[col] = 1
  449. # df[col] = np.nan
  450. for i, dt in enumerate(df.index):
  451. for col in range(100, 200):
  452. if col not in wasCol:
  453. wasCol[col] = 1
  454. df[col] = np.nan
  455. df[col][dt] = i
  456. myid = 100
  457. first = len(df.loc[pd.isna(df[myid]), [myid]])
  458. second = len(df.loc[pd.isna(df[myid]), [myid]])
  459. assert first == second == 0
  460. def test_constructor_no_pandas_array(self):
  461. # Ensure that PandasArray isn't allowed inside Series
  462. # See https://github.com/pandas-dev/pandas/issues/23995 for more.
  463. arr = pd.Series([1, 2, 3]).array
  464. result = pd.DataFrame({"A": arr})
  465. expected = pd.DataFrame({"A": [1, 2, 3]})
  466. tm.assert_frame_equal(result, expected)
  467. assert isinstance(result._data.blocks[0], IntBlock)