# -*- coding: utf-8 -*- from __future__ import print_function from datetime import datetime, timedelta import functools import itertools import numpy as np import numpy.ma as ma import pytest from pandas.compat import ( PY2, PY3, PY36, OrderedDict, is_platform_little_endian, lmap, long, lrange, lzip, range, zip) from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_integer_dtype import pandas as pd from pandas import ( Categorical, DataFrame, Index, MultiIndex, Series, Timedelta, Timestamp, _np_version_under1p13, compat, date_range, isna) from pandas.tests.frame.common import TestData import pandas.util.testing as tm MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64'] class TestDataFrameConstructors(TestData): def test_constructor(self): df = DataFrame() assert len(df.index) == 0 df = DataFrame(data={}) assert len(df.index) == 0 def test_constructor_mixed(self): index, data = tm.getMixedTypeDict() # TODO(wesm), incomplete test? indexed_frame = DataFrame(data, index=index) # noqa unindexed_frame = DataFrame(data) # noqa assert self.mixed_frame['foo'].dtype == np.object_ def test_constructor_cast_failure(self): foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64) assert foo['a'].dtype == object # GH 3010, constructing with odd arrays df = DataFrame(np.ones((4, 2))) # this is ok df['foo'] = np.ones((4, 2)).tolist() # this is not ok pytest.raises(ValueError, df.__setitem__, tuple(['test']), np.ones((4, 2))) # this is ok df['foo2'] = np.ones((4, 2)).tolist() def test_constructor_dtype_copy(self): orig_df = DataFrame({ 'col1': [1.], 'col2': [2.], 'col3': [3.]}) new_df = pd.DataFrame(orig_df, dtype=float, copy=True) new_df['col1'] = 200. assert orig_df['col1'][0] == 1. def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 assert df.values[0, 0] == 99 should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 def test_constructor_dtype_list_data(self): df = DataFrame([[1, '2'], [None, 'a']], dtype=object) assert df.loc[1, 0] is None assert df.loc[0, 1] == '2' def test_constructor_list_frames(self): # see gh-3243 result = DataFrame([DataFrame([])]) assert result.shape == (1, 0) result = DataFrame([DataFrame(dict(A=lrange(5)))]) assert isinstance(result.iloc[0, 0], DataFrame) def test_constructor_mixed_dtypes(self): def _make_mixed_dtypes_df(typ, ad=None): if typ == 'int': dtypes = MIXED_INT_DTYPES arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes] elif typ == 'float': dtypes = MIXED_FLOAT_DTYPES arrays = [np.array(np.random.randint( 10, size=10), dtype=d) for d in dtypes] zipper = lzip(dtypes, arrays) for d, a in zipper: assert(a.dtype == d) if ad is None: ad = dict() ad.update({d: a for d, a in zipper}) return DataFrame(ad) def _check_mixed_dtypes(df, dtypes=None): if dtypes is None: dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES for d in dtypes: if d in df: assert(df.dtypes[d] == d) # mixed floating and integer coexinst in the same frame df = _make_mixed_dtypes_df('float') _check_mixed_dtypes(df) # add lots of types df = _make_mixed_dtypes_df('float', dict(A=1, B='foo', C='bar')) _check_mixed_dtypes(df) # GH 622 df = _make_mixed_dtypes_df('int') _check_mixed_dtypes(df) def test_constructor_complex_dtypes(self): # GH10952 a = np.random.rand(10).astype(np.complex64) b = np.random.rand(10).astype(np.complex128) df = DataFrame({'a': a, 'b': b}) assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 df = DataFrame({'A': ['x', None]}, dtype=string_dtype) result = df.isna() expected = DataFrame({"A": [False, True]}) tm.assert_frame_equal(result, expected) assert df.iloc[1, 0] is None df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) assert np.isnan(df.iloc[1, 0]) def test_constructor_rec(self): rec = self.frame.to_records(index=False) if PY3: # unicode error under PY2 rec.dtype.names = list(rec.dtype.names)[::-1] index = self.frame.index df = DataFrame(rec) tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) df2 = DataFrame(rec, index=index) tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names)) tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] df3 = DataFrame(rec, index=rng, columns=['C', 'B']) expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B']) tm.assert_frame_equal(df3, expected) def test_constructor_bool(self): df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)}) assert df.values.dtype == np.bool_ def test_constructor_overflow_int64(self): # see gh-14881 values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64) result = DataFrame({'a': values}) assert result['a'].dtype == np.uint64 # see gh-2355 data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), (8921811264899370420, 45), (long(17019687244989530680), 270), (long(9930107427299601010), 273)] dtype = [('uid', 'u8'), ('score', 'u8')] data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) assert df_crawls['uid'].dtype == np.uint64 @pytest.mark.parametrize("values", [np.array([2**64], dtype=object), np.array([2**65]), [2**64 + 1], np.array([-2**63 - 4], dtype=object), np.array([-2**64 - 1]), [-2**65 - 2]]) def test_constructor_int_overflow(self, values): # see gh-18584 value = values[0] result = DataFrame(values) assert result[0].dtype == object assert result[0][0] == value def test_constructor_ordereddict(self): import random nitems = 100 nums = lrange(nitems) random.shuffle(nums) expected = ['A%d' % i for i in nums] df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) assert expected == list(df.columns) def test_constructor_dict(self): frame = DataFrame({'col1': self.ts1, 'col2': self.ts2}) # col2 is padded with NaN assert len(self.ts1) == 30 assert len(self.ts2) == 25 tm.assert_series_equal(self.ts1, frame['col1'], check_names=False) exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]), index=self.ts1.index, name='col2') tm.assert_series_equal(exp, frame['col2']) frame = DataFrame({'col1': self.ts1, 'col2': self.ts2}, columns=['col2', 'col3', 'col4']) assert len(frame) == len(self.ts2) assert 'col1' not in frame assert isna(frame['col3']).all() # Corner cases assert len(DataFrame({})) == 0 # mix dict and array, wrong size - no spec for which error should raise # first with pytest.raises(ValueError): DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']}) # Length-one dict micro-optimization frame = DataFrame({'A': {'1': 1, '2': 2}}) tm.assert_index_equal(frame.index, pd.Index(['1', '2'])) # empty dict plus index idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx) assert frame.index is idx # empty with index and columns idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx, columns=idx) assert frame.index is idx assert frame.columns is idx assert len(frame._series) == 3 # with dict of empty list and Series frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) tm.assert_index_equal(frame.index, Index([], dtype=np.int64)) # GH 14381 # Dict with None value frame_none = DataFrame(dict(a=None), index=[0]) frame_none_list = DataFrame(dict(a=[None]), index=[0]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert frame_none.get_value(0, 'a') is None with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert frame_none_list.get_value(0, 'a') is None tm.assert_frame_equal(frame_none, frame_none_list) # GH10856 # dict with scalar values should raise error, even if columns passed msg = 'If using all scalar values, you must pass an index' with pytest.raises(ValueError, match=msg): DataFrame({'a': 0.7}) with pytest.raises(ValueError, match=msg): DataFrame({'a': 0.7}, columns=['a']) @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D']) def test_constructor_invalid_items_unused(self, scalar): # No error if invalid (scalar) value is in fact not used: result = DataFrame({'a': scalar}, columns=['b']) expected = DataFrame(columns=['b']) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) def test_constructor_dict_nan_key(self, value): # GH 18455 cols = [1, value, 3] idx = ['a', value] values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} result = DataFrame(data).sort_values(1).sort_values('a', axis=1) expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), index=idx, columns=cols) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx).sort_values('a', axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("value", [np.nan, None, float('nan')]) def test_constructor_dict_nan_tuple_key(self, value): # GH 18455 cols = Index([(11, 21), (value, 22), (13, value)]) idx = Index([('a', value), (value, 2)]) values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} result = (DataFrame(data) .sort_values((11, 21)) .sort_values(('a', value), axis=1)) expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), index=idx, columns=cols) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx).sort_values(('a', value), axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') def test_constructor_dict_order_insertion(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6 d = {'b': self.ts2, 'a': self.ts1} frame = DataFrame(data=d) expected = DataFrame(data=d, columns=list('ba')) tm.assert_frame_equal(frame, expected) @pytest.mark.skipif(PY36, reason='order by value for Python<3.6') def test_constructor_dict_order_by_values(self): # GH19018 # initialization ordering: by value if python<3.6 d = {'b': self.ts2, 'a': self.ts1} frame = DataFrame(data=d) expected = DataFrame(data=d, columns=list('ab')) tm.assert_frame_equal(frame, expected) def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame tuples = [(2, 3), (3, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) assert pd.isna(df).values.ravel().all() tuples = [(3, 3), (2, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) assert pd.isna(df).values.ravel().all() def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." # passing an empty array with columns specified. with pytest.raises(ValueError, match=msg): DataFrame(np.empty(0), columns=list('abc')) msg = "Mixing dicts with non-Series may lead to ambiguous ordering." # mix dict and array, wrong size with pytest.raises(ValueError, match=msg): DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']}) # wrong size ndarray, GH 3105 msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)" with pytest.raises(ValueError, match=msg): DataFrame(np.arange(12).reshape((4, 3)), columns=['foo', 'bar', 'baz'], index=pd.date_range('2000-01-01', periods=3)) arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): DataFrame(index=[0], columns=range(0, 4), data=arr) arr = np.array([4, 5, 6]) msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): DataFrame(index=[0], columns=range(0, 4), data=arr) # higher dim raise exception with pytest.raises(ValueError, match='Must pass 2-d input'): DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) # wrong size axis labels msg = ("Shape of passed values " r"is \(2, 3\), indices " r"imply \(1, 3\)") with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1]) msg = ("Shape of passed values " r"is \(2, 3\), indices " r"imply \(2, 2\)") with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2]) msg = ("If using all scalar " "values, you must pass " "an index") with pytest.raises(ValueError, match=msg): DataFrame({'a': False, 'b': True}) def test_constructor_with_embedded_frames(self): # embedded data frames df1 = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) df2 = DataFrame([df1, df1 + 10]) df2.dtypes str(df2) result = df2.loc[0, 0] tm.assert_frame_equal(result, df1) result = df2.loc[1, 0] tm.assert_frame_equal(result, df1 + 10) def test_constructor_subclass_dict(self): # Test for passing dict subclass to constructor data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))} df = DataFrame(data) refdf = DataFrame({col: dict(compat.iteritems(val)) for col, val in compat.iteritems(data)}) tm.assert_frame_equal(refdf, df) data = tm.TestSubDict(compat.iteritems(data)) df = DataFrame(data) tm.assert_frame_equal(refdf, df) # try with defaultdict from collections import defaultdict data = {} self.frame['B'][:10] = np.nan for k, v in compat.iteritems(self.frame): dct = defaultdict(dict) dct.update(v.to_dict()) data[k] = dct frame = DataFrame(data) tm.assert_frame_equal(self.frame.sort_index(), frame) def test_constructor_dict_block(self): expected = np.array([[4., 3., 2., 1.]]) df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]}, columns=['d', 'c', 'b', 'a']) tm.assert_numpy_array_equal(df.values, expected) def test_constructor_dict_cast(self): # cast float tests test_data = { 'A': {'1': 1, '2': 2}, 'B': {'1': '1', '2': '2', '3': '3'}, } frame = DataFrame(test_data, dtype=float) assert len(frame) == 3 assert frame['B'].dtype == np.float64 assert frame['A'].dtype == np.float64 frame = DataFrame(test_data) assert len(frame) == 3 assert frame['B'].dtype == np.object_ assert frame['A'].dtype == np.float64 # can't cast to float test_data = { 'A': dict(zip(range(20), tm.makeStringIndex(20))), 'B': dict(zip(range(15), np.random.randn(15))) } frame = DataFrame(test_data, dtype=float) assert len(frame) == 20 assert frame['A'].dtype == np.object_ assert frame['B'].dtype == np.float64 def test_constructor_dict_dont_upcast(self): d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} df = DataFrame(d) assert isinstance(df['Col1']['Row2'], float) dm = DataFrame([[1, 2], ['a', 'b']], index=[1, 2], columns=[1, 2]) assert isinstance(dm[1][1], int) def test_constructor_dict_of_tuples(self): # GH #1491 data = {'a': (1, 2, 3), 'b': (4, 5, 6)} result = DataFrame(data) expected = DataFrame({k: list(v) for k, v in compat.iteritems(data)}) tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_dict_multiindex(self): def check(result, expected): return tm.assert_frame_equal(result, expected, check_dtype=True, check_index_type=True, check_column_type=True, check_names=True) d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} _d = sorted(d.items()) df = DataFrame(d) expected = DataFrame( [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d])).T expected.index = MultiIndex.from_tuples(expected.index) check(df, expected) d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111} _d.insert(0, ('z', d['z'])) expected = DataFrame( [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False)).T expected.index = Index(expected.index, tupleize_cols=False) df = DataFrame(d) df = df.reindex(columns=expected.columns, index=expected.index) check(df, expected) def test_constructor_dict_datetime64_index(self): # GH 10160 dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] def create_data(constructor): return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)} data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) data_Timestamp = create_data(Timestamp) expected = DataFrame([{0: 0, 1: None, 2: None, 3: None}, {0: None, 1: 2, 2: None, 3: None}, {0: None, 1: None, 2: 4, 3: None}, {0: None, 1: None, 2: None, 3: 6}], index=[Timestamp(dt) for dt in dates_as_str]) result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) result_Timestamp = DataFrame(data_Timestamp) tm.assert_frame_equal(result_datetime64, expected) tm.assert_frame_equal(result_datetime, expected) tm.assert_frame_equal(result_Timestamp, expected) def test_constructor_dict_timedelta64_index(self): # GH 10160 td_as_int = [1, 2, 3, 4] def create_data(constructor): return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)} data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D')) data_timedelta = create_data(lambda x: timedelta(days=x)) data_Timedelta = create_data(lambda x: Timedelta(x, 'D')) expected = DataFrame([{0: 0, 1: None, 2: None, 3: None}, {0: None, 1: 2, 2: None, 3: None}, {0: None, 1: None, 2: 4, 3: None}, {0: None, 1: None, 2: None, 3: 6}], index=[Timedelta(td, 'D') for td in td_as_int]) result_timedelta64 = DataFrame(data_timedelta64) result_timedelta = DataFrame(data_timedelta) result_Timedelta = DataFrame(data_Timedelta) tm.assert_frame_equal(result_timedelta64, expected) tm.assert_frame_equal(result_timedelta, expected) tm.assert_frame_equal(result_Timedelta, expected) def test_constructor_period(self): # PeriodIndex a = pd.PeriodIndex(['2012-01', 'NaT', '2012-04'], freq='M') b = pd.PeriodIndex(['2012-02-01', '2012-03-01', 'NaT'], freq='D') df = pd.DataFrame({'a': a, 'b': b}) assert df['a'].dtype == a.dtype assert df['b'].dtype == b.dtype # list of periods df = pd.DataFrame({'a': a.astype(object).tolist(), 'b': b.astype(object).tolist()}) assert df['a'].dtype == a.dtype assert df['b'].dtype == b.dtype def test_nested_dict_frame_constructor(self): rng = pd.period_range('1/1/2000', periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) data = {} for col in df.columns: for row in df.index: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): data.setdefault(col, {})[row] = df.get_value(row, col) result = DataFrame(data, columns=rng) tm.assert_frame_equal(result, df) data = {} for col in df.columns: for row in df.index: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): data.setdefault(row, {})[col] = df.get_value(row, col) result = DataFrame(data, index=rng).T tm.assert_frame_equal(result, df) def _check_basic_constructor(self, empty): # mat: 2d matrix with shape (3, 2) to input. empty - makes sized # objects mat = empty((2, 3), dtype=float) # 2-D input frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 # 1-D input frame = DataFrame(empty((3,)), columns=['A'], index=[1, 2, 3]) assert len(frame.index) == 3 assert len(frame.columns) == 1 # cast type frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # wrong size axis labels msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)' with pytest.raises(ValueError, match=msg): DataFrame(mat, columns=['A', 'B', 'C'], index=[1]) msg = r'Shape of passed values is \(2, 3\), indices imply \(2, 2\)' with pytest.raises(ValueError, match=msg): DataFrame(mat, columns=['A', 'B'], index=[1, 2]) # higher dim raise exception with pytest.raises(ValueError, match='Must pass 2-d input'): DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) # automatic labeling frame = DataFrame(mat) tm.assert_index_equal(frame.index, pd.Index(lrange(2))) tm.assert_index_equal(frame.columns, pd.Index(lrange(3))) frame = DataFrame(mat, index=[1, 2]) tm.assert_index_equal(frame.columns, pd.Index(lrange(3))) frame = DataFrame(mat, columns=['A', 'B', 'C']) tm.assert_index_equal(frame.index, pd.Index(lrange(2))) # 0-length axis frame = DataFrame(empty((0, 3))) assert len(frame.index) == 0 frame = DataFrame(empty((3, 0))) assert len(frame.columns) == 0 def test_constructor_ndarray(self): self._check_basic_constructor(np.ones) frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) assert len(frame) == 2 @pytest.mark.skipif(PY2 and _np_version_under1p13, reason="old numpy & py2") def test_constructor_maskedarray(self): self._check_basic_constructor(ma.masked_all) # Check non-masked values mat = ma.masked_all((2, 3), dtype=float) mat[0, 0] = 1.0 mat[1, 2] = 2.0 frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert 1.0 == frame['A'][1] assert 2.0 == frame['C'][2] # what is this even checking?? mat = ma.masked_all((2, 3), dtype=float) frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) @pytest.mark.skipif(PY2 and _np_version_under1p13, reason="old numpy & py2") def test_constructor_maskedarray_nonfloat(self): # masked int promoted to float mat = ma.masked_all((2, 3), dtype=int) # 2-D input frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2], dtype=np.float64) assert frame.values.dtype == np.float64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) assert 1 == frame['A'][1] assert 2 == frame['C'][2] # masked np.datetime64 stays (use NaT as null) mat = ma.masked_all((2, 3), dtype='M8[ns]') # 2-D input frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert isna(frame).values.all() # cast type frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) assert 1 == frame['A'].view('i8')[1] assert 2 == frame['C'].view('i8')[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) # 2-D input frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2], dtype=object) assert frame.values.dtype == object # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = True mat2[1, 2] = False frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) assert frame['A'][1] is True assert frame['C'][2] is False @pytest.mark.skipif(PY2 and _np_version_under1p13, reason="old numpy & py2") def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) expected = pd.DataFrame({ 'A': [np.nan, np.nan], 'B': [np.nan, np.nan]}, columns=['A', 'B'], index=[1, 2], dtype=float) tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) expected = pd.DataFrame({ 'A': [1.0, 1.0], 'B': [1.0, 1.0]}, columns=['A', 'B'], index=[1, 2], dtype=float) tm.assert_frame_equal(result, expected) @pytest.mark.skipif(PY2 and _np_version_under1p13, reason="old numpy & py2") def test_constructor_maskedrecarray_dtype(self): # Ensure constructor honors dtype data = np.ma.array( np.ma.zeros(5, dtype=[('date', '0 df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname: np.array([1.] * 10, dtype=floatname), intname: np.array([1] * 10, dtype=intname)}, index=np.arange(10)) result = df.get_dtype_counts() result = result.sort_index() tm.assert_series_equal(result, expected) # GH 2809 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) assert datetime_s.dtype == 'M8[ns]' df = DataFrame({'datetime_s': datetime_s}) result = df.get_dtype_counts() expected = Series({datetime64name: 1}) result = result.sort_index() expected = expected.sort_index() tm.assert_series_equal(result, expected) # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] dates = [ts.date() for ts in ind] df = DataFrame({'datetimes': datetimes, 'dates': dates}) result = df.get_dtype_counts() expected = Series({datetime64name: 1, objectname: 1}) result = result.sort_index() expected = expected.sort_index() tm.assert_series_equal(result, expected) # GH 7594 # don't coerce tz-aware import pytz tz = pytz.timezone('US/Eastern') dt = tz.localize(datetime(2012, 1, 1)) df = DataFrame({'End Date': dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal(df.dtypes, Series( {'End Date': 'datetime64[ns, US/Eastern]'})) df = DataFrame([{'End Date': dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal(df.dtypes, Series( {'End Date': 'datetime64[ns, US/Eastern]'})) # tz-aware (UTC and other tz's) # GH 8411 dr = date_range('20130101', periods=3) df = DataFrame({'value': dr}) assert df.iat[0, 0].tz is None dr = date_range('20130101', periods=3, tz='UTC') df = DataFrame({'value': dr}) assert str(df.iat[0, 0].tz) == 'UTC' dr = date_range('20130101', periods=3, tz='US/Eastern') df = DataFrame({'value': dr}) assert str(df.iat[0, 0].tz) == 'US/Eastern' # GH 7822 # preserver an index with a tz on dict construction i = date_range('1/1/2011', periods=5, freq='10s', tz='US/Eastern') expected = DataFrame( {'a': i.to_series(keep_tz=True).reset_index(drop=True)}) df = DataFrame() df['a'] = i tm.assert_frame_equal(df, expected) df = DataFrame({'a': i}) tm.assert_frame_equal(df, expected) # multiples i_no_tz = date_range('1/1/2011', periods=5, freq='10s') df = DataFrame({'a': i, 'b': i_no_tz}) expected = DataFrame({'a': i.to_series(keep_tz=True) .reset_index(drop=True), 'b': i_no_tz}) tm.assert_frame_equal(df, expected) def test_constructor_datetimes_with_nulls(self): # gh-15869 for arr in [np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None])]: result = DataFrame(arr).get_dtype_counts() expected = Series({'datetime64[ns]': 1}) tm.assert_series_equal(result, expected) def test_constructor_for_list_with_dtypes(self): # TODO(wesm): unused intname = np.dtype(np.int_).name # noqa floatname = np.dtype(np.float_).name # noqa datetime64name = np.dtype('M8[ns]').name objectname = np.dtype(np.object_).name # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.get_dtype_counts() expected = Series({'int64': 5}) df = DataFrame([np.array(np.arange(5), dtype='int32') for x in range(5)]) result = df.get_dtype_counts() expected = Series({'int32': 5}) # overflow issue? (we always expecte int64 upcasting here) df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]}) result = df.get_dtype_counts() expected = Series({'int64': 1}) tm.assert_series_equal(result, expected) # GH #2751 (construction with no index specified), make sure we cast to # platform values df = DataFrame([1, 2]) result = df.get_dtype_counts() expected = Series({'int64': 1}) tm.assert_series_equal(result, expected) df = DataFrame([1., 2.]) result = df.get_dtype_counts() expected = Series({'float64': 1}) tm.assert_series_equal(result, expected) df = DataFrame({'a': [1, 2]}) result = df.get_dtype_counts() expected = Series({'int64': 1}) tm.assert_series_equal(result, expected) df = DataFrame({'a': [1., 2.]}) result = df.get_dtype_counts() expected = Series({'float64': 1}) tm.assert_series_equal(result, expected) df = DataFrame({'a': 1}, index=lrange(3)) result = df.get_dtype_counts() expected = Series({'int64': 1}) tm.assert_series_equal(result, expected) df = DataFrame({'a': 1.}, index=lrange(3)) result = df.get_dtype_counts() expected = Series({'float64': 1}) tm.assert_series_equal(result, expected) # with object list df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3], 'c': list('abcd'), 'd': [datetime(2000, 1, 1) for i in range(4)], 'e': [1., 2, 4., 7]}) result = df.get_dtype_counts() expected = Series( {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1}) result = result.sort_index() expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_constructor_frame_copy(self): cop = DataFrame(self.frame, copy=True) cop['A'] = 5 assert (cop['A'] == 5).all() assert not (self.frame['A'] == 5).all() def test_constructor_ndarray_copy(self): df = DataFrame(self.frame.values) self.frame.values[5] = 5 assert (df.values[5] == 5).all() df = DataFrame(self.frame.values, copy=True) self.frame.values[6] = 6 assert not (df.values[6] == 6).all() def test_constructor_series_copy(self): series = self.frame._series df = DataFrame({'A': series['A']}) df['A'][:] = 5 assert not (series['A'] == 5).all() def test_constructor_with_nas(self): # GH 5016 # na's in indices def check(df): for i in range(len(df.columns)): df.iloc[:, i] indexer = np.arange(len(df.columns))[isna(df.columns)] # No NaN found -> error if len(indexer) == 0: def f(): df.loc[:, np.nan] pytest.raises(TypeError, f) # single nan should result in Series elif len(indexer) == 1: tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan]) # multiple nans should result in DataFrame else: tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan]) df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) check(df) df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]) check(df) df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]) check(df) df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]) check(df) # GH 21428 (non-unique columns) df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]) check(df) def test_constructor_lists_to_object_dtype(self): # from #1074 d = DataFrame({'a': [np.nan, False]}) assert d['a'].dtype == np.object_ assert not d['a'][1] def test_constructor_categorical(self): # GH8626 # dict creation df = DataFrame({'A': list('abc')}, dtype='category') expected = Series(list('abc'), dtype='category', name='A') tm.assert_series_equal(df['A'], expected) # to_frame s = Series(list('abc'), dtype='category') result = s.to_frame() expected = Series(list('abc'), dtype='category', name=0) tm.assert_series_equal(result[0], expected) result = s.to_frame(name='foo') expected = Series(list('abc'), dtype='category', name='foo') tm.assert_series_equal(result['foo'], expected) # list-like creation df = DataFrame(list('abc'), dtype='category') expected = Series(list('abc'), dtype='category', name=0) tm.assert_series_equal(df[0], expected) # ndim != 1 df = DataFrame([Categorical(list('abc'))]) expected = DataFrame({0: Series(list('abc'), dtype='category')}) tm.assert_frame_equal(df, expected) df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))]) expected = DataFrame({0: Series(list('abc'), dtype='category'), 1: Series(list('abd'), dtype='category')}, columns=[0, 1]) tm.assert_frame_equal(df, expected) # mixed df = DataFrame([Categorical(list('abc')), list('def')]) expected = DataFrame({0: Series(list('abc'), dtype='category'), 1: list('def')}, columns=[0, 1]) tm.assert_frame_equal(df, expected) # invalid (shape) pytest.raises(ValueError, lambda: DataFrame([Categorical(list('abc')), Categorical(list('abdefg'))])) # ndim > 1 pytest.raises(NotImplementedError, lambda: Categorical(np.array([list('abcd')]))) def test_constructor_categorical_series(self): items = [1, 2, 3, 1] exp = Series(items).astype('category') res = Series(items, dtype='category') tm.assert_series_equal(res, exp) items = ["a", "b", "c", "a"] exp = Series(items).astype('category') res = Series(items, dtype='category') tm.assert_series_equal(res, exp) # insert into frame with different index # GH 8076 index = date_range('20000101', periods=3) expected = Series(Categorical(values=[np.nan, np.nan, np.nan], categories=['a', 'b', 'c'])) expected.index = index expected = DataFrame({'x': expected}) df = DataFrame( {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index) tm.assert_frame_equal(df, expected) def test_from_records_to_records(self): # from numpy documentation arr = np.zeros((2,), dtype=('i4,f4,a10')) arr[:] = [(1, 2., 'Hello'), (2, 3., "World")] # TODO(wesm): unused frame = DataFrame.from_records(arr) # noqa index = pd.Index(np.arange(len(arr))[::-1]) indexed_frame = DataFrame.from_records(arr, index=index) tm.assert_index_equal(indexed_frame.index, index) # without names, it should go to last ditch arr2 = np.zeros((2, 3)) tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)' with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) indexed_frame = DataFrame.from_records(arr, index='f1') # what to do? records = indexed_frame.to_records() assert len(records.dtype.names) == 3 records = indexed_frame.to_records(index=False) assert len(records.dtype.names) == 2 assert 'index' not in records.dtype.names def test_from_records_nones(self): tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)] df = DataFrame.from_records(tuples, columns=['a', 'b', 'c', 'd']) assert np.isnan(df['c'][0]) def test_from_records_iterator(self): arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6), (7., 7., 8, 8)], dtype=[('x', np.float64), ('u', np.float32), ('y', np.int64), ('z', np.int32)]) df = DataFrame.from_records(iter(arr), nrows=2) xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64), 'u': np.array([1.0, 3.0], dtype=np.float32), 'y': np.array([2, 4], dtype=np.int64), 'z': np.array([2, 4], dtype=np.int32)}) tm.assert_frame_equal(df.reindex_like(xp), xp) # no dtypes specified here, so just compare with the default arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)] df = DataFrame.from_records(iter(arr), columns=['x', 'y'], nrows=2) tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']), check_dtype=False) def test_from_records_tuples_generator(self): def tuple_generator(length): for i in range(length): letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' yield (i, letters[i % len(letters)], i / length) columns_names = ['Integer', 'String', 'Float'] columns = [[i[j] for i in tuple_generator( 10)] for j in range(len(columns_names))] data = {'Integer': columns[0], 'String': columns[1], 'Float': columns[2]} expected = DataFrame(data, columns=columns_names) generator = tuple_generator(10) result = DataFrame.from_records(generator, columns=columns_names) tm.assert_frame_equal(result, expected) def test_from_records_lists_generator(self): def list_generator(length): for i in range(length): letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' yield [i, letters[i % len(letters)], i / length] columns_names = ['Integer', 'String', 'Float'] columns = [[i[j] for i in list_generator( 10)] for j in range(len(columns_names))] data = {'Integer': columns[0], 'String': columns[1], 'Float': columns[2]} expected = DataFrame(data, columns=columns_names) generator = list_generator(10) result = DataFrame.from_records(generator, columns=columns_names) tm.assert_frame_equal(result, expected) def test_from_records_columns_not_modified(self): tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)] columns = ['a', 'b', 'c'] original_columns = list(columns) df = DataFrame.from_records(tuples, columns=columns, index='a') # noqa assert columns == original_columns def test_from_records_decimal(self): from decimal import Decimal tuples = [(Decimal('1.5'),), (Decimal('2.5'),), (None,)] df = DataFrame.from_records(tuples, columns=['a']) assert df['a'].dtype == object df = DataFrame.from_records(tuples, columns=['a'], coerce_float=True) assert df['a'].dtype == np.float64 assert np.isnan(df['a'].values[-1]) def test_from_records_duplicates(self): result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'a']) expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'a']) tm.assert_frame_equal(result, expected) def test_from_records_set_index_name(self): def create_dict(order_id): return {'order_id': order_id, 'quantity': np.random.randint(1, 10), 'price': np.random.randint(1, 10)} documents = [create_dict(i) for i in range(10)] # demo missing data documents.append({'order_id': 10, 'quantity': 5}) result = DataFrame.from_records(documents, index='order_id') assert result.index.name == 'order_id' # MultiIndex result = DataFrame.from_records(documents, index=['order_id', 'quantity']) assert result.index.names == ('order_id', 'quantity') def test_from_records_misc_brokenness(self): # #2179 data = {1: ['foo'], 2: ['bar']} result = DataFrame.from_records(data, columns=['a', 'b']) exp = DataFrame(data, columns=['a', 'b']) tm.assert_frame_equal(result, exp) # overlap in index/index_names data = {'a': [1, 2, 3], 'b': [4, 5, 6]} result = DataFrame.from_records(data, index=['a', 'b', 'c']) exp = DataFrame(data, index=['a', 'b', 'c']) tm.assert_frame_equal(result, exp) # GH 2623 rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) results = df2_obj.get_dtype_counts() expected = Series({'datetime64[ns]': 1, 'object': 1}) rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) results = df2_obj.get_dtype_counts().sort_index() expected = Series({'datetime64[ns]': 1, 'int64': 1}) tm.assert_series_equal(results, expected) def test_from_records_empty(self): # 3562 result = DataFrame.from_records([], columns=['a', 'b', 'c']) expected = DataFrame(columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) result = DataFrame.from_records([], columns=['a', 'b', 'b']) expected = DataFrame(columns=['a', 'b', 'b']) tm.assert_frame_equal(result, expected) def test_from_records_empty_with_nonempty_fields_gh3682(self): a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)]) df = DataFrame.from_records(a, index='id') tm.assert_index_equal(df.index, Index([1], name='id')) assert df.index.name == 'id' tm.assert_index_equal(df.columns, Index(['value'])) b = np.array([], dtype=[('id', np.int64), ('value', np.int64)]) df = DataFrame.from_records(b, index='id') tm.assert_index_equal(df.index, Index([], name='id')) assert df.index.name == 'id' def test_from_records_with_datetimes(self): # this may fail on certain platforms because of a numpy issue # related GH6140 if not is_platform_little_endian(): pytest.skip("known failure of test on non-little endian") # construction with a null in a recarray # GH 6140 expected = DataFrame({'EXPIRY': [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] dtypes = [('EXPIRY', '