123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316 |
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- from datetime import datetime, timedelta
- import functools
- import itertools
- import numpy as np
- import numpy.ma as ma
- import pytest
- from pandas.compat import (
- PY2, PY3, PY36, OrderedDict, is_platform_little_endian, lmap, long, lrange,
- lzip, range, zip)
- from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
- from pandas.core.dtypes.common import is_integer_dtype
- import pandas as pd
- from pandas import (
- Categorical, DataFrame, Index, MultiIndex, Series, Timedelta, Timestamp,
- _np_version_under1p13, compat, date_range, isna)
- from pandas.tests.frame.common import TestData
- import pandas.util.testing as tm
- MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64']
- MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16',
- 'int32', 'int64']
- class TestDataFrameConstructors(TestData):
- def test_constructor(self):
- df = DataFrame()
- assert len(df.index) == 0
- df = DataFrame(data={})
- assert len(df.index) == 0
- def test_constructor_mixed(self):
- index, data = tm.getMixedTypeDict()
- # TODO(wesm), incomplete test?
- indexed_frame = DataFrame(data, index=index) # noqa
- unindexed_frame = DataFrame(data) # noqa
- assert self.mixed_frame['foo'].dtype == np.object_
- def test_constructor_cast_failure(self):
- foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64)
- assert foo['a'].dtype == object
- # GH 3010, constructing with odd arrays
- df = DataFrame(np.ones((4, 2)))
- # this is ok
- df['foo'] = np.ones((4, 2)).tolist()
- # this is not ok
- pytest.raises(ValueError, df.__setitem__, tuple(['test']),
- np.ones((4, 2)))
- # this is ok
- df['foo2'] = np.ones((4, 2)).tolist()
- def test_constructor_dtype_copy(self):
- orig_df = DataFrame({
- 'col1': [1.],
- 'col2': [2.],
- 'col3': [3.]})
- new_df = pd.DataFrame(orig_df, dtype=float, copy=True)
- new_df['col1'] = 200.
- assert orig_df['col1'][0] == 1.
- def test_constructor_dtype_nocast_view(self):
- df = DataFrame([[1, 2]])
- should_be_view = DataFrame(df, dtype=df[0].dtype)
- should_be_view[0][0] = 99
- assert df.values[0, 0] == 99
- should_be_view = DataFrame(df.values, dtype=df[0].dtype)
- should_be_view[0][0] = 97
- assert df.values[0, 0] == 97
- def test_constructor_dtype_list_data(self):
- df = DataFrame([[1, '2'],
- [None, 'a']], dtype=object)
- assert df.loc[1, 0] is None
- assert df.loc[0, 1] == '2'
- def test_constructor_list_frames(self):
- # see gh-3243
- result = DataFrame([DataFrame([])])
- assert result.shape == (1, 0)
- result = DataFrame([DataFrame(dict(A=lrange(5)))])
- assert isinstance(result.iloc[0, 0], DataFrame)
- def test_constructor_mixed_dtypes(self):
- def _make_mixed_dtypes_df(typ, ad=None):
- if typ == 'int':
- dtypes = MIXED_INT_DTYPES
- arrays = [np.array(np.random.rand(10), dtype=d)
- for d in dtypes]
- elif typ == 'float':
- dtypes = MIXED_FLOAT_DTYPES
- arrays = [np.array(np.random.randint(
- 10, size=10), dtype=d) for d in dtypes]
- zipper = lzip(dtypes, arrays)
- for d, a in zipper:
- assert(a.dtype == d)
- if ad is None:
- ad = dict()
- ad.update({d: a for d, a in zipper})
- return DataFrame(ad)
- def _check_mixed_dtypes(df, dtypes=None):
- if dtypes is None:
- dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES
- for d in dtypes:
- if d in df:
- assert(df.dtypes[d] == d)
- # mixed floating and integer coexinst in the same frame
- df = _make_mixed_dtypes_df('float')
- _check_mixed_dtypes(df)
- # add lots of types
- df = _make_mixed_dtypes_df('float', dict(A=1, B='foo', C='bar'))
- _check_mixed_dtypes(df)
- # GH 622
- df = _make_mixed_dtypes_df('int')
- _check_mixed_dtypes(df)
- def test_constructor_complex_dtypes(self):
- # GH10952
- a = np.random.rand(10).astype(np.complex64)
- b = np.random.rand(10).astype(np.complex128)
- df = DataFrame({'a': a, 'b': b})
- assert a.dtype == df.a.dtype
- assert b.dtype == df.b.dtype
- def test_constructor_dtype_str_na_values(self, string_dtype):
- # https://github.com/pandas-dev/pandas/issues/21083
- df = DataFrame({'A': ['x', None]}, dtype=string_dtype)
- result = df.isna()
- expected = DataFrame({"A": [False, True]})
- tm.assert_frame_equal(result, expected)
- assert df.iloc[1, 0] is None
- df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
- assert np.isnan(df.iloc[1, 0])
- def test_constructor_rec(self):
- rec = self.frame.to_records(index=False)
- if PY3:
- # unicode error under PY2
- rec.dtype.names = list(rec.dtype.names)[::-1]
- index = self.frame.index
- df = DataFrame(rec)
- tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names))
- df2 = DataFrame(rec, index=index)
- tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names))
- tm.assert_index_equal(df2.index, index)
- rng = np.arange(len(rec))[::-1]
- df3 = DataFrame(rec, index=rng, columns=['C', 'B'])
- expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B'])
- tm.assert_frame_equal(df3, expected)
- def test_constructor_bool(self):
- df = DataFrame({0: np.ones(10, dtype=bool),
- 1: np.zeros(10, dtype=bool)})
- assert df.values.dtype == np.bool_
- def test_constructor_overflow_int64(self):
- # see gh-14881
- values = np.array([2 ** 64 - i for i in range(1, 10)],
- dtype=np.uint64)
- result = DataFrame({'a': values})
- assert result['a'].dtype == np.uint64
- # see gh-2355
- data_scores = [(6311132704823138710, 273), (2685045978526272070, 23),
- (8921811264899370420, 45),
- (long(17019687244989530680), 270),
- (long(9930107427299601010), 273)]
- dtype = [('uid', 'u8'), ('score', 'u8')]
- data = np.zeros((len(data_scores),), dtype=dtype)
- data[:] = data_scores
- df_crawls = DataFrame(data)
- assert df_crawls['uid'].dtype == np.uint64
- @pytest.mark.parametrize("values", [np.array([2**64], dtype=object),
- np.array([2**65]), [2**64 + 1],
- np.array([-2**63 - 4], dtype=object),
- np.array([-2**64 - 1]), [-2**65 - 2]])
- def test_constructor_int_overflow(self, values):
- # see gh-18584
- value = values[0]
- result = DataFrame(values)
- assert result[0].dtype == object
- assert result[0][0] == value
- def test_constructor_ordereddict(self):
- import random
- nitems = 100
- nums = lrange(nitems)
- random.shuffle(nums)
- expected = ['A%d' % i for i in nums]
- df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems)))
- assert expected == list(df.columns)
- def test_constructor_dict(self):
- frame = DataFrame({'col1': self.ts1,
- 'col2': self.ts2})
- # col2 is padded with NaN
- assert len(self.ts1) == 30
- assert len(self.ts2) == 25
- tm.assert_series_equal(self.ts1, frame['col1'], check_names=False)
- exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]),
- index=self.ts1.index, name='col2')
- tm.assert_series_equal(exp, frame['col2'])
- frame = DataFrame({'col1': self.ts1,
- 'col2': self.ts2},
- columns=['col2', 'col3', 'col4'])
- assert len(frame) == len(self.ts2)
- assert 'col1' not in frame
- assert isna(frame['col3']).all()
- # Corner cases
- assert len(DataFrame({})) == 0
- # mix dict and array, wrong size - no spec for which error should raise
- # first
- with pytest.raises(ValueError):
- DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']})
- # Length-one dict micro-optimization
- frame = DataFrame({'A': {'1': 1, '2': 2}})
- tm.assert_index_equal(frame.index, pd.Index(['1', '2']))
- # empty dict plus index
- idx = Index([0, 1, 2])
- frame = DataFrame({}, index=idx)
- assert frame.index is idx
- # empty with index and columns
- idx = Index([0, 1, 2])
- frame = DataFrame({}, index=idx, columns=idx)
- assert frame.index is idx
- assert frame.columns is idx
- assert len(frame._series) == 3
- # with dict of empty list and Series
- frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B'])
- tm.assert_index_equal(frame.index, Index([], dtype=np.int64))
- # GH 14381
- # Dict with None value
- frame_none = DataFrame(dict(a=None), index=[0])
- frame_none_list = DataFrame(dict(a=[None]), index=[0])
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- assert frame_none.get_value(0, 'a') is None
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- assert frame_none_list.get_value(0, 'a') is None
- tm.assert_frame_equal(frame_none, frame_none_list)
- # GH10856
- # dict with scalar values should raise error, even if columns passed
- msg = 'If using all scalar values, you must pass an index'
- with pytest.raises(ValueError, match=msg):
- DataFrame({'a': 0.7})
- with pytest.raises(ValueError, match=msg):
- DataFrame({'a': 0.7}, columns=['a'])
- @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D'])
- def test_constructor_invalid_items_unused(self, scalar):
- # No error if invalid (scalar) value is in fact not used:
- result = DataFrame({'a': scalar}, columns=['b'])
- expected = DataFrame(columns=['b'])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
- def test_constructor_dict_nan_key(self, value):
- # GH 18455
- cols = [1, value, 3]
- idx = ['a', value]
- values = [[0, 3], [1, 4], [2, 5]]
- data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
- result = DataFrame(data).sort_values(1).sort_values('a', axis=1)
- expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
- index=idx, columns=cols)
- tm.assert_frame_equal(result, expected)
- result = DataFrame(data, index=idx).sort_values('a', axis=1)
- tm.assert_frame_equal(result, expected)
- result = DataFrame(data, index=idx, columns=cols)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("value", [np.nan, None, float('nan')])
- def test_constructor_dict_nan_tuple_key(self, value):
- # GH 18455
- cols = Index([(11, 21), (value, 22), (13, value)])
- idx = Index([('a', value), (value, 2)])
- values = [[0, 3], [1, 4], [2, 5]]
- data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
- result = (DataFrame(data)
- .sort_values((11, 21))
- .sort_values(('a', value), axis=1))
- expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
- index=idx, columns=cols)
- tm.assert_frame_equal(result, expected)
- result = DataFrame(data, index=idx).sort_values(('a', value), axis=1)
- tm.assert_frame_equal(result, expected)
- result = DataFrame(data, index=idx, columns=cols)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6')
- def test_constructor_dict_order_insertion(self):
- # GH19018
- # initialization ordering: by insertion order if python>= 3.6
- d = {'b': self.ts2, 'a': self.ts1}
- frame = DataFrame(data=d)
- expected = DataFrame(data=d, columns=list('ba'))
- tm.assert_frame_equal(frame, expected)
- @pytest.mark.skipif(PY36, reason='order by value for Python<3.6')
- def test_constructor_dict_order_by_values(self):
- # GH19018
- # initialization ordering: by value if python<3.6
- d = {'b': self.ts2, 'a': self.ts1}
- frame = DataFrame(data=d)
- expected = DataFrame(data=d, columns=list('ab'))
- tm.assert_frame_equal(frame, expected)
- def test_constructor_multi_index(self):
- # GH 4078
- # construction error with mi and all-nan frame
- tuples = [(2, 3), (3, 3), (3, 3)]
- mi = MultiIndex.from_tuples(tuples)
- df = DataFrame(index=mi, columns=mi)
- assert pd.isna(df).values.ravel().all()
- tuples = [(3, 3), (2, 3), (3, 3)]
- mi = MultiIndex.from_tuples(tuples)
- df = DataFrame(index=mi, columns=mi)
- assert pd.isna(df).values.ravel().all()
- def test_constructor_error_msgs(self):
- msg = "Empty data passed with indices specified."
- # passing an empty array with columns specified.
- with pytest.raises(ValueError, match=msg):
- DataFrame(np.empty(0), columns=list('abc'))
- msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
- # mix dict and array, wrong size
- with pytest.raises(ValueError, match=msg):
- DataFrame({'A': {'a': 'a', 'b': 'b'},
- 'B': ['a', 'b', 'c']})
- # wrong size ndarray, GH 3105
- msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)"
- with pytest.raises(ValueError, match=msg):
- DataFrame(np.arange(12).reshape((4, 3)),
- columns=['foo', 'bar', 'baz'],
- index=pd.date_range('2000-01-01', periods=3))
- arr = np.array([[4, 5, 6]])
- msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)"
- with pytest.raises(ValueError, match=msg):
- DataFrame(index=[0], columns=range(0, 4), data=arr)
- arr = np.array([4, 5, 6])
- msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)"
- with pytest.raises(ValueError, match=msg):
- DataFrame(index=[0], columns=range(0, 4), data=arr)
- # higher dim raise exception
- with pytest.raises(ValueError, match='Must pass 2-d input'):
- DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1])
- # wrong size axis labels
- msg = ("Shape of passed values "
- r"is \(2, 3\), indices "
- r"imply \(1, 3\)")
- with pytest.raises(ValueError, match=msg):
- DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1])
- msg = ("Shape of passed values "
- r"is \(2, 3\), indices "
- r"imply \(2, 2\)")
- with pytest.raises(ValueError, match=msg):
- DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2])
- msg = ("If using all scalar "
- "values, you must pass "
- "an index")
- with pytest.raises(ValueError, match=msg):
- DataFrame({'a': False, 'b': True})
- def test_constructor_with_embedded_frames(self):
- # embedded data frames
- df1 = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
- df2 = DataFrame([df1, df1 + 10])
- df2.dtypes
- str(df2)
- result = df2.loc[0, 0]
- tm.assert_frame_equal(result, df1)
- result = df2.loc[1, 0]
- tm.assert_frame_equal(result, df1 + 10)
- def test_constructor_subclass_dict(self):
- # Test for passing dict subclass to constructor
- data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)),
- 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))}
- df = DataFrame(data)
- refdf = DataFrame({col: dict(compat.iteritems(val))
- for col, val in compat.iteritems(data)})
- tm.assert_frame_equal(refdf, df)
- data = tm.TestSubDict(compat.iteritems(data))
- df = DataFrame(data)
- tm.assert_frame_equal(refdf, df)
- # try with defaultdict
- from collections import defaultdict
- data = {}
- self.frame['B'][:10] = np.nan
- for k, v in compat.iteritems(self.frame):
- dct = defaultdict(dict)
- dct.update(v.to_dict())
- data[k] = dct
- frame = DataFrame(data)
- tm.assert_frame_equal(self.frame.sort_index(), frame)
- def test_constructor_dict_block(self):
- expected = np.array([[4., 3., 2., 1.]])
- df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]},
- columns=['d', 'c', 'b', 'a'])
- tm.assert_numpy_array_equal(df.values, expected)
- def test_constructor_dict_cast(self):
- # cast float tests
- test_data = {
- 'A': {'1': 1, '2': 2},
- 'B': {'1': '1', '2': '2', '3': '3'},
- }
- frame = DataFrame(test_data, dtype=float)
- assert len(frame) == 3
- assert frame['B'].dtype == np.float64
- assert frame['A'].dtype == np.float64
- frame = DataFrame(test_data)
- assert len(frame) == 3
- assert frame['B'].dtype == np.object_
- assert frame['A'].dtype == np.float64
- # can't cast to float
- test_data = {
- 'A': dict(zip(range(20), tm.makeStringIndex(20))),
- 'B': dict(zip(range(15), np.random.randn(15)))
- }
- frame = DataFrame(test_data, dtype=float)
- assert len(frame) == 20
- assert frame['A'].dtype == np.object_
- assert frame['B'].dtype == np.float64
- def test_constructor_dict_dont_upcast(self):
- d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}}
- df = DataFrame(d)
- assert isinstance(df['Col1']['Row2'], float)
- dm = DataFrame([[1, 2], ['a', 'b']], index=[1, 2], columns=[1, 2])
- assert isinstance(dm[1][1], int)
- def test_constructor_dict_of_tuples(self):
- # GH #1491
- data = {'a': (1, 2, 3), 'b': (4, 5, 6)}
- result = DataFrame(data)
- expected = DataFrame({k: list(v) for k, v in compat.iteritems(data)})
- tm.assert_frame_equal(result, expected, check_dtype=False)
- def test_constructor_dict_multiindex(self):
- def check(result, expected):
- return tm.assert_frame_equal(result, expected, check_dtype=True,
- check_index_type=True,
- check_column_type=True,
- check_names=True)
- d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2},
- ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4},
- ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}}
- _d = sorted(d.items())
- df = DataFrame(d)
- expected = DataFrame(
- [x[1] for x in _d],
- index=MultiIndex.from_tuples([x[0] for x in _d])).T
- expected.index = MultiIndex.from_tuples(expected.index)
- check(df, expected)
- d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111}
- _d.insert(0, ('z', d['z']))
- expected = DataFrame(
- [x[1] for x in _d],
- index=Index([x[0] for x in _d], tupleize_cols=False)).T
- expected.index = Index(expected.index, tupleize_cols=False)
- df = DataFrame(d)
- df = df.reindex(columns=expected.columns, index=expected.index)
- check(df, expected)
- def test_constructor_dict_datetime64_index(self):
- # GH 10160
- dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
- def create_data(constructor):
- return {i: {constructor(s): 2 * i}
- for i, s in enumerate(dates_as_str)}
- data_datetime64 = create_data(np.datetime64)
- data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d'))
- data_Timestamp = create_data(Timestamp)
- expected = DataFrame([{0: 0, 1: None, 2: None, 3: None},
- {0: None, 1: 2, 2: None, 3: None},
- {0: None, 1: None, 2: 4, 3: None},
- {0: None, 1: None, 2: None, 3: 6}],
- index=[Timestamp(dt) for dt in dates_as_str])
- result_datetime64 = DataFrame(data_datetime64)
- result_datetime = DataFrame(data_datetime)
- result_Timestamp = DataFrame(data_Timestamp)
- tm.assert_frame_equal(result_datetime64, expected)
- tm.assert_frame_equal(result_datetime, expected)
- tm.assert_frame_equal(result_Timestamp, expected)
- def test_constructor_dict_timedelta64_index(self):
- # GH 10160
- td_as_int = [1, 2, 3, 4]
- def create_data(constructor):
- return {i: {constructor(s): 2 * i}
- for i, s in enumerate(td_as_int)}
- data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D'))
- data_timedelta = create_data(lambda x: timedelta(days=x))
- data_Timedelta = create_data(lambda x: Timedelta(x, 'D'))
- expected = DataFrame([{0: 0, 1: None, 2: None, 3: None},
- {0: None, 1: 2, 2: None, 3: None},
- {0: None, 1: None, 2: 4, 3: None},
- {0: None, 1: None, 2: None, 3: 6}],
- index=[Timedelta(td, 'D') for td in td_as_int])
- result_timedelta64 = DataFrame(data_timedelta64)
- result_timedelta = DataFrame(data_timedelta)
- result_Timedelta = DataFrame(data_Timedelta)
- tm.assert_frame_equal(result_timedelta64, expected)
- tm.assert_frame_equal(result_timedelta, expected)
- tm.assert_frame_equal(result_Timedelta, expected)
- def test_constructor_period(self):
- # PeriodIndex
- a = pd.PeriodIndex(['2012-01', 'NaT', '2012-04'], freq='M')
- b = pd.PeriodIndex(['2012-02-01', '2012-03-01', 'NaT'], freq='D')
- df = pd.DataFrame({'a': a, 'b': b})
- assert df['a'].dtype == a.dtype
- assert df['b'].dtype == b.dtype
- # list of periods
- df = pd.DataFrame({'a': a.astype(object).tolist(),
- 'b': b.astype(object).tolist()})
- assert df['a'].dtype == a.dtype
- assert df['b'].dtype == b.dtype
- def test_nested_dict_frame_constructor(self):
- rng = pd.period_range('1/1/2000', periods=5)
- df = DataFrame(np.random.randn(10, 5), columns=rng)
- data = {}
- for col in df.columns:
- for row in df.index:
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- data.setdefault(col, {})[row] = df.get_value(row, col)
- result = DataFrame(data, columns=rng)
- tm.assert_frame_equal(result, df)
- data = {}
- for col in df.columns:
- for row in df.index:
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- data.setdefault(row, {})[col] = df.get_value(row, col)
- result = DataFrame(data, index=rng).T
- tm.assert_frame_equal(result, df)
- def _check_basic_constructor(self, empty):
- # mat: 2d matrix with shape (3, 2) to input. empty - makes sized
- # objects
- mat = empty((2, 3), dtype=float)
- # 2-D input
- frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
- assert len(frame.index) == 2
- assert len(frame.columns) == 3
- # 1-D input
- frame = DataFrame(empty((3,)), columns=['A'], index=[1, 2, 3])
- assert len(frame.index) == 3
- assert len(frame.columns) == 1
- # cast type
- frame = DataFrame(mat, columns=['A', 'B', 'C'],
- index=[1, 2], dtype=np.int64)
- assert frame.values.dtype == np.int64
- # wrong size axis labels
- msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)'
- with pytest.raises(ValueError, match=msg):
- DataFrame(mat, columns=['A', 'B', 'C'], index=[1])
- msg = r'Shape of passed values is \(2, 3\), indices imply \(2, 2\)'
- with pytest.raises(ValueError, match=msg):
- DataFrame(mat, columns=['A', 'B'], index=[1, 2])
- # higher dim raise exception
- with pytest.raises(ValueError, match='Must pass 2-d input'):
- DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'],
- index=[1])
- # automatic labeling
- frame = DataFrame(mat)
- tm.assert_index_equal(frame.index, pd.Index(lrange(2)))
- tm.assert_index_equal(frame.columns, pd.Index(lrange(3)))
- frame = DataFrame(mat, index=[1, 2])
- tm.assert_index_equal(frame.columns, pd.Index(lrange(3)))
- frame = DataFrame(mat, columns=['A', 'B', 'C'])
- tm.assert_index_equal(frame.index, pd.Index(lrange(2)))
- # 0-length axis
- frame = DataFrame(empty((0, 3)))
- assert len(frame.index) == 0
- frame = DataFrame(empty((3, 0)))
- assert len(frame.columns) == 0
- def test_constructor_ndarray(self):
- self._check_basic_constructor(np.ones)
- frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A'])
- assert len(frame) == 2
- @pytest.mark.skipif(PY2 and _np_version_under1p13,
- reason="old numpy & py2")
- def test_constructor_maskedarray(self):
- self._check_basic_constructor(ma.masked_all)
- # Check non-masked values
- mat = ma.masked_all((2, 3), dtype=float)
- mat[0, 0] = 1.0
- mat[1, 2] = 2.0
- frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
- assert 1.0 == frame['A'][1]
- assert 2.0 == frame['C'][2]
- # what is this even checking??
- mat = ma.masked_all((2, 3), dtype=float)
- frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
- assert np.all(~np.asarray(frame == frame))
- @pytest.mark.skipif(PY2 and _np_version_under1p13,
- reason="old numpy & py2")
- def test_constructor_maskedarray_nonfloat(self):
- # masked int promoted to float
- mat = ma.masked_all((2, 3), dtype=int)
- # 2-D input
- frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
- assert len(frame.index) == 2
- assert len(frame.columns) == 3
- assert np.all(~np.asarray(frame == frame))
- # cast type
- frame = DataFrame(mat, columns=['A', 'B', 'C'],
- index=[1, 2], dtype=np.float64)
- assert frame.values.dtype == np.float64
- # Check non-masked values
- mat2 = ma.copy(mat)
- mat2[0, 0] = 1
- mat2[1, 2] = 2
- frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2])
- assert 1 == frame['A'][1]
- assert 2 == frame['C'][2]
- # masked np.datetime64 stays (use NaT as null)
- mat = ma.masked_all((2, 3), dtype='M8[ns]')
- # 2-D input
- frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
- assert len(frame.index) == 2
- assert len(frame.columns) == 3
- assert isna(frame).values.all()
- # cast type
- frame = DataFrame(mat, columns=['A', 'B', 'C'],
- index=[1, 2], dtype=np.int64)
- assert frame.values.dtype == np.int64
- # Check non-masked values
- mat2 = ma.copy(mat)
- mat2[0, 0] = 1
- mat2[1, 2] = 2
- frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2])
- assert 1 == frame['A'].view('i8')[1]
- assert 2 == frame['C'].view('i8')[2]
- # masked bool promoted to object
- mat = ma.masked_all((2, 3), dtype=bool)
- # 2-D input
- frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2])
- assert len(frame.index) == 2
- assert len(frame.columns) == 3
- assert np.all(~np.asarray(frame == frame))
- # cast type
- frame = DataFrame(mat, columns=['A', 'B', 'C'],
- index=[1, 2], dtype=object)
- assert frame.values.dtype == object
- # Check non-masked values
- mat2 = ma.copy(mat)
- mat2[0, 0] = True
- mat2[1, 2] = False
- frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2])
- assert frame['A'][1] is True
- assert frame['C'][2] is False
- @pytest.mark.skipif(PY2 and _np_version_under1p13,
- reason="old numpy & py2")
- def test_constructor_maskedarray_hardened(self):
- # Check numpy masked arrays with hard masks -- from GH24574
- mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask()
- result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2])
- expected = pd.DataFrame({
- 'A': [np.nan, np.nan],
- 'B': [np.nan, np.nan]},
- columns=['A', 'B'],
- index=[1, 2],
- dtype=float)
- tm.assert_frame_equal(result, expected)
- # Check case where mask is hard but no data are masked
- mat_hard = ma.ones((2, 2), dtype=float).harden_mask()
- result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2])
- expected = pd.DataFrame({
- 'A': [1.0, 1.0],
- 'B': [1.0, 1.0]},
- columns=['A', 'B'],
- index=[1, 2],
- dtype=float)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.skipif(PY2 and _np_version_under1p13,
- reason="old numpy & py2")
- def test_constructor_maskedrecarray_dtype(self):
- # Ensure constructor honors dtype
- data = np.ma.array(
- np.ma.zeros(5, dtype=[('date', '<f8'), ('price', '<f8')]),
- mask=[False] * 5)
- data = data.view(ma.mrecords.mrecarray)
- result = pd.DataFrame(data, dtype=int)
- expected = pd.DataFrame(np.zeros((5, 2), dtype=int),
- columns=['date', 'price'])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.skipif(PY2 and _np_version_under1p13,
- reason="old numpy & py2")
- def test_constructor_mrecarray(self):
- # Ensure mrecarray produces frame identical to dict of masked arrays
- # from GH3479
- assert_fr_equal = functools.partial(tm.assert_frame_equal,
- check_index_type=True,
- check_column_type=True,
- check_frame_type=True)
- arrays = [
- ('float', np.array([1.5, 2.0])),
- ('int', np.array([1, 2])),
- ('str', np.array(['abc', 'def'])),
- ]
- for name, arr in arrays[:]:
- arrays.append(('masked1_' + name,
- np.ma.masked_array(arr, mask=[False, True])))
- arrays.append(('masked_all', np.ma.masked_all((2,))))
- arrays.append(('masked_none',
- np.ma.masked_array([1.0, 2.5], mask=False)))
- # call assert_frame_equal for all selections of 3 arrays
- for comb in itertools.combinations(arrays, 3):
- names, data = zip(*comb)
- mrecs = ma.mrecords.fromarrays(data, names=names)
- # fill the comb
- comb = {k: (v.filled() if hasattr(v, 'filled') else v)
- for k, v in comb}
- expected = DataFrame(comb, columns=names)
- result = DataFrame(mrecs)
- assert_fr_equal(result, expected)
- # specify columns
- expected = DataFrame(comb, columns=names[::-1])
- result = DataFrame(mrecs, columns=names[::-1])
- assert_fr_equal(result, expected)
- # specify index
- expected = DataFrame(comb, columns=names, index=[1, 2])
- result = DataFrame(mrecs, index=[1, 2])
- assert_fr_equal(result, expected)
- def test_constructor_corner_shape(self):
- df = DataFrame(index=[])
- assert df.values.shape == (0, 0)
- @pytest.mark.parametrize("data, index, columns, dtype, expected", [
- (None, lrange(10), ['a', 'b'], object, np.object_),
- (None, None, ['a', 'b'], 'int64', np.dtype('int64')),
- (None, lrange(10), ['a', 'b'], int, np.dtype('float64')),
- ({}, None, ['foo', 'bar'], None, np.object_),
- ({'b': 1}, lrange(10), list('abc'), int, np.dtype('float64'))
- ])
- def test_constructor_dtype(self, data, index, columns, dtype, expected):
- df = DataFrame(data, index, columns, dtype)
- assert df.values.dtype == expected
- def test_constructor_scalar_inference(self):
- data = {'int': 1, 'bool': True,
- 'float': 3., 'complex': 4j, 'object': 'foo'}
- df = DataFrame(data, index=np.arange(10))
- assert df['int'].dtype == np.int64
- assert df['bool'].dtype == np.bool_
- assert df['float'].dtype == np.float64
- assert df['complex'].dtype == np.complex128
- assert df['object'].dtype == np.object_
- def test_constructor_arrays_and_scalars(self):
- df = DataFrame({'a': np.random.randn(10), 'b': True})
- exp = DataFrame({'a': df['a'].values, 'b': [True] * 10})
- tm.assert_frame_equal(df, exp)
- with pytest.raises(ValueError, match='must pass an index'):
- DataFrame({'a': False, 'b': True})
- def test_constructor_DataFrame(self):
- df = DataFrame(self.frame)
- tm.assert_frame_equal(df, self.frame)
- df_casted = DataFrame(self.frame, dtype=np.int64)
- assert df_casted.values.dtype == np.int64
- def test_constructor_more(self):
- # used to be in test_matrix.py
- arr = np.random.randn(10)
- dm = DataFrame(arr, columns=['A'], index=np.arange(10))
- assert dm.values.ndim == 2
- arr = np.random.randn(0)
- dm = DataFrame(arr)
- assert dm.values.ndim == 2
- assert dm.values.ndim == 2
- # no data specified
- dm = DataFrame(columns=['A', 'B'], index=np.arange(10))
- assert dm.values.shape == (10, 2)
- dm = DataFrame(columns=['A', 'B'])
- assert dm.values.shape == (0, 2)
- dm = DataFrame(index=np.arange(10))
- assert dm.values.shape == (10, 0)
- # can't cast
- mat = np.array(['foo', 'bar'], dtype=object).reshape(2, 1)
- with pytest.raises(ValueError, match='cast'):
- DataFrame(mat, index=[0, 1], columns=[0], dtype=float)
- dm = DataFrame(DataFrame(self.frame._series))
- tm.assert_frame_equal(dm, self.frame)
- # int cast
- dm = DataFrame({'A': np.ones(10, dtype=int),
- 'B': np.ones(10, dtype=np.float64)},
- index=np.arange(10))
- assert len(dm.columns) == 2
- assert dm.values.dtype == np.float64
- def test_constructor_empty_list(self):
- df = DataFrame([], index=[])
- expected = DataFrame(index=[])
- tm.assert_frame_equal(df, expected)
- # GH 9939
- df = DataFrame([], columns=['A', 'B'])
- expected = DataFrame({}, columns=['A', 'B'])
- tm.assert_frame_equal(df, expected)
- # Empty generator: list(empty_gen()) == []
- def empty_gen():
- return
- yield
- df = DataFrame(empty_gen(), columns=['A', 'B'])
- tm.assert_frame_equal(df, expected)
- def test_constructor_list_of_lists(self):
- # GH #484
- df = DataFrame(data=[[1, 'a'], [2, 'b']], columns=["num", "str"])
- assert is_integer_dtype(df['num'])
- assert df['str'].dtype == np.object_
- # GH 4851
- # list of 0-dim ndarrays
- expected = DataFrame({0: np.arange(10)})
- data = [np.array(x) for x in range(10)]
- result = DataFrame(data)
- tm.assert_frame_equal(result, expected)
- def test_constructor_sequence_like(self):
- # GH 3783
- # collections.Squence like
- class DummyContainer(compat.Sequence):
- def __init__(self, lst):
- self._lst = lst
- def __getitem__(self, n):
- return self._lst.__getitem__(n)
- def __len__(self, n):
- return self._lst.__len__()
- lst_containers = [DummyContainer([1, 'a']), DummyContainer([2, 'b'])]
- columns = ["num", "str"]
- result = DataFrame(lst_containers, columns=columns)
- expected = DataFrame([[1, 'a'], [2, 'b']], columns=columns)
- tm.assert_frame_equal(result, expected, check_dtype=False)
- # GH 4297
- # support Array
- import array
- result = DataFrame({'A': array.array('i', range(10))})
- expected = DataFrame({'A': list(range(10))})
- tm.assert_frame_equal(result, expected, check_dtype=False)
- expected = DataFrame([list(range(10)), list(range(10))])
- result = DataFrame([array.array('i', range(10)),
- array.array('i', range(10))])
- tm.assert_frame_equal(result, expected, check_dtype=False)
- def test_constructor_iterable(self):
- # GH 21987
- class Iter():
- def __iter__(self):
- for i in range(10):
- yield [1, 2, 3]
- expected = DataFrame([[1, 2, 3]] * 10)
- result = DataFrame(Iter())
- tm.assert_frame_equal(result, expected)
- def test_constructor_iterator(self):
- expected = DataFrame([list(range(10)), list(range(10))])
- result = DataFrame([range(10), range(10)])
- tm.assert_frame_equal(result, expected)
- def test_constructor_generator(self):
- # related #2305
- gen1 = (i for i in range(10))
- gen2 = (i for i in range(10))
- expected = DataFrame([list(range(10)), list(range(10))])
- result = DataFrame([gen1, gen2])
- tm.assert_frame_equal(result, expected)
- gen = ([i, 'a'] for i in range(10))
- result = DataFrame(gen)
- expected = DataFrame({0: range(10), 1: 'a'})
- tm.assert_frame_equal(result, expected, check_dtype=False)
- def test_constructor_list_of_dicts(self):
- data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
- OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]),
- OrderedDict([['a', 1.5], ['d', 6]]),
- OrderedDict(),
- OrderedDict([['a', 1.5], ['b', 3], ['c', 4]]),
- OrderedDict([['b', 3], ['c', 4], ['d', 6]])]
- result = DataFrame(data)
- expected = DataFrame.from_dict(dict(zip(range(len(data)), data)),
- orient='index')
- tm.assert_frame_equal(result, expected.reindex(result.index))
- result = DataFrame([{}])
- expected = DataFrame(index=[0])
- tm.assert_frame_equal(result, expected)
- def test_constructor_ordered_dict_preserve_order(self):
- # see gh-13304
- expected = DataFrame([[2, 1]], columns=['b', 'a'])
- data = OrderedDict()
- data['b'] = [2]
- data['a'] = [1]
- result = DataFrame(data)
- tm.assert_frame_equal(result, expected)
- data = OrderedDict()
- data['b'] = 2
- data['a'] = 1
- result = DataFrame([data])
- tm.assert_frame_equal(result, expected)
- def test_constructor_ordered_dict_conflicting_orders(self):
- # the first dict element sets the ordering for the DataFrame,
- # even if there are conflicting orders from subsequent ones
- row_one = OrderedDict()
- row_one['b'] = 2
- row_one['a'] = 1
- row_two = OrderedDict()
- row_two['a'] = 1
- row_two['b'] = 2
- row_three = {'b': 2, 'a': 1}
- expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a'])
- result = DataFrame([row_one, row_two])
- tm.assert_frame_equal(result, expected)
- expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a'])
- result = DataFrame([row_one, row_two, row_three])
- tm.assert_frame_equal(result, expected)
- def test_constructor_list_of_series(self):
- data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
- OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
- sdict = OrderedDict(zip(['x', 'y'], data))
- idx = Index(['a', 'b', 'c'])
- # all named
- data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'),
- Series([1.5, 3, 6], idx, name='y')]
- result = DataFrame(data2)
- expected = DataFrame.from_dict(sdict, orient='index')
- tm.assert_frame_equal(result, expected)
- # some unnamed
- data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'),
- Series([1.5, 3, 6], idx)]
- result = DataFrame(data2)
- sdict = OrderedDict(zip(['x', 'Unnamed 0'], data))
- expected = DataFrame.from_dict(sdict, orient='index')
- tm.assert_frame_equal(result.sort_index(), expected)
- # none named
- data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
- OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]),
- OrderedDict([['a', 1.5], ['d', 6]]),
- OrderedDict(),
- OrderedDict([['a', 1.5], ['b', 3], ['c', 4]]),
- OrderedDict([['b', 3], ['c', 4], ['d', 6]])]
- data = [Series(d) for d in data]
- result = DataFrame(data)
- sdict = OrderedDict(zip(range(len(data)), data))
- expected = DataFrame.from_dict(sdict, orient='index')
- tm.assert_frame_equal(result, expected.reindex(result.index))
- result2 = DataFrame(data, index=np.arange(6))
- tm.assert_frame_equal(result, result2)
- result = DataFrame([Series({})])
- expected = DataFrame(index=[0])
- tm.assert_frame_equal(result, expected)
- data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
- OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
- sdict = OrderedDict(zip(range(len(data)), data))
- idx = Index(['a', 'b', 'c'])
- data2 = [Series([1.5, 3, 4], idx, dtype='O'),
- Series([1.5, 3, 6], idx)]
- result = DataFrame(data2)
- expected = DataFrame.from_dict(sdict, orient='index')
- tm.assert_frame_equal(result, expected)
- def test_constructor_list_of_series_aligned_index(self):
- series = [pd.Series(i, index=['b', 'a', 'c'], name=str(i))
- for i in range(3)]
- result = pd.DataFrame(series)
- expected = pd.DataFrame({'b': [0, 1, 2],
- 'a': [0, 1, 2],
- 'c': [0, 1, 2]},
- columns=['b', 'a', 'c'],
- index=['0', '1', '2'])
- tm.assert_frame_equal(result, expected)
- def test_constructor_list_of_derived_dicts(self):
- class CustomDict(dict):
- pass
- d = {'a': 1.5, 'b': 3}
- data_custom = [CustomDict(d)]
- data = [d]
- result_custom = DataFrame(data_custom)
- result = DataFrame(data)
- tm.assert_frame_equal(result, result_custom)
- def test_constructor_ragged(self):
- data = {'A': np.random.randn(10),
- 'B': np.random.randn(8)}
- with pytest.raises(ValueError, match='arrays must all be same length'):
- DataFrame(data)
- def test_constructor_scalar(self):
- idx = Index(lrange(3))
- df = DataFrame({"a": 0}, index=idx)
- expected = DataFrame({"a": [0, 0, 0]}, index=idx)
- tm.assert_frame_equal(df, expected, check_dtype=False)
- def test_constructor_Series_copy_bug(self):
- df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A'])
- df.copy()
- def test_constructor_mixed_dict_and_Series(self):
- data = {}
- data['A'] = {'foo': 1, 'bar': 2, 'baz': 3}
- data['B'] = Series([4, 3, 2, 1], index=['bar', 'qux', 'baz', 'foo'])
- result = DataFrame(data)
- assert result.index.is_monotonic
- # ordering ambiguous, raise exception
- with pytest.raises(ValueError, match='ambiguous ordering'):
- DataFrame({'A': ['a', 'b'], 'B': {'a': 'a', 'b': 'b'}})
- # this is OK though
- result = DataFrame({'A': ['a', 'b'],
- 'B': Series(['a', 'b'], index=['a', 'b'])})
- expected = DataFrame({'A': ['a', 'b'], 'B': ['a', 'b']},
- index=['a', 'b'])
- tm.assert_frame_equal(result, expected)
- def test_constructor_mixed_type_rows(self):
- # Issue 25075
- data = [[1, 2], (3, 4)]
- result = DataFrame(data)
- expected = DataFrame([[1, 2], [3, 4]])
- tm.assert_frame_equal(result, expected)
- def test_constructor_tuples(self):
- result = DataFrame({'A': [(1, 2), (3, 4)]})
- expected = DataFrame({'A': Series([(1, 2), (3, 4)])})
- tm.assert_frame_equal(result, expected)
- def test_constructor_namedtuples(self):
- # GH11181
- from collections import namedtuple
- named_tuple = namedtuple("Pandas", list('ab'))
- tuples = [named_tuple(1, 3), named_tuple(2, 4)]
- expected = DataFrame({'a': [1, 2], 'b': [3, 4]})
- result = DataFrame(tuples)
- tm.assert_frame_equal(result, expected)
- # with columns
- expected = DataFrame({'y': [1, 2], 'z': [3, 4]})
- result = DataFrame(tuples, columns=['y', 'z'])
- tm.assert_frame_equal(result, expected)
- def test_constructor_orient(self):
- data_dict = self.mixed_frame.T._series
- recons = DataFrame.from_dict(data_dict, orient='index')
- expected = self.mixed_frame.sort_index()
- tm.assert_frame_equal(recons, expected)
- # dict of sequence
- a = {'hi': [32, 3, 3],
- 'there': [3, 5, 3]}
- rs = DataFrame.from_dict(a, orient='index')
- xp = DataFrame.from_dict(a).T.reindex(list(a.keys()))
- tm.assert_frame_equal(rs, xp)
- def test_from_dict_columns_parameter(self):
- # GH 18529
- # Test new columns parameter for from_dict that was added to make
- # from_items(..., orient='index', columns=[...]) easier to replicate
- result = DataFrame.from_dict(OrderedDict([('A', [1, 2]),
- ('B', [4, 5])]),
- orient='index', columns=['one', 'two'])
- expected = DataFrame([[1, 2], [4, 5]], index=['A', 'B'],
- columns=['one', 'two'])
- tm.assert_frame_equal(result, expected)
- msg = "cannot use columns parameter with orient='columns'"
- with pytest.raises(ValueError, match=msg):
- DataFrame.from_dict(dict([('A', [1, 2]), ('B', [4, 5])]),
- orient='columns', columns=['one', 'two'])
- with pytest.raises(ValueError, match=msg):
- DataFrame.from_dict(dict([('A', [1, 2]), ('B', [4, 5])]),
- columns=['one', 'two'])
- def test_constructor_Series_named(self):
- a = Series([1, 2, 3], index=['a', 'b', 'c'], name='x')
- df = DataFrame(a)
- assert df.columns[0] == 'x'
- tm.assert_index_equal(df.index, a.index)
- # ndarray like
- arr = np.random.randn(10)
- s = Series(arr, name='x')
- df = DataFrame(s)
- expected = DataFrame(dict(x=s))
- tm.assert_frame_equal(df, expected)
- s = Series(arr, index=range(3, 13))
- df = DataFrame(s)
- expected = DataFrame({0: s})
- tm.assert_frame_equal(df, expected)
- pytest.raises(ValueError, DataFrame, s, columns=[1, 2])
- # #2234
- a = Series([], name='x')
- df = DataFrame(a)
- assert df.columns[0] == 'x'
- # series with name and w/o
- s1 = Series(arr, name='x')
- df = DataFrame([s1, arr]).T
- expected = DataFrame({'x': s1, 'Unnamed 0': arr},
- columns=['x', 'Unnamed 0'])
- tm.assert_frame_equal(df, expected)
- # this is a bit non-intuitive here; the series collapse down to arrays
- df = DataFrame([arr, s1]).T
- expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
- tm.assert_frame_equal(df, expected)
- def test_constructor_Series_named_and_columns(self):
- # GH 9232 validation
- s0 = Series(range(5), name=0)
- s1 = Series(range(5), name=1)
- # matching name and column gives standard frame
- tm.assert_frame_equal(pd.DataFrame(s0, columns=[0]),
- s0.to_frame())
- tm.assert_frame_equal(pd.DataFrame(s1, columns=[1]),
- s1.to_frame())
- # non-matching produces empty frame
- assert pd.DataFrame(s0, columns=[1]).empty
- assert pd.DataFrame(s1, columns=[0]).empty
- def test_constructor_Series_differently_indexed(self):
- # name
- s1 = Series([1, 2, 3], index=['a', 'b', 'c'], name='x')
- # no name
- s2 = Series([1, 2, 3], index=['a', 'b', 'c'])
- other_index = Index(['a', 'b'])
- df1 = DataFrame(s1, index=other_index)
- exp1 = DataFrame(s1.reindex(other_index))
- assert df1.columns[0] == 'x'
- tm.assert_frame_equal(df1, exp1)
- df2 = DataFrame(s2, index=other_index)
- exp2 = DataFrame(s2.reindex(other_index))
- assert df2.columns[0] == 0
- tm.assert_index_equal(df2.index, other_index)
- tm.assert_frame_equal(df2, exp2)
- def test_constructor_manager_resize(self):
- index = list(self.frame.index[:5])
- columns = list(self.frame.columns[:3])
- result = DataFrame(self.frame._data, index=index,
- columns=columns)
- tm.assert_index_equal(result.index, Index(index))
- tm.assert_index_equal(result.columns, Index(columns))
- def test_constructor_from_items(self):
- items = [(c, self.frame[c]) for c in self.frame.columns]
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- recons = DataFrame.from_items(items)
- tm.assert_frame_equal(recons, self.frame)
- # pass some columns
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- recons = DataFrame.from_items(items, columns=['C', 'B', 'A'])
- tm.assert_frame_equal(recons, self.frame.loc[:, ['C', 'B', 'A']])
- # orient='index'
- row_items = [(idx, self.mixed_frame.xs(idx))
- for idx in self.mixed_frame.index]
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- recons = DataFrame.from_items(row_items,
- columns=self.mixed_frame.columns,
- orient='index')
- tm.assert_frame_equal(recons, self.mixed_frame)
- assert recons['A'].dtype == np.float64
- msg = "Must pass columns with orient='index'"
- with pytest.raises(TypeError, match=msg):
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- DataFrame.from_items(row_items, orient='index')
- # orient='index', but thar be tuples
- arr = construct_1d_object_array_from_listlike(
- [('bar', 'baz')] * len(self.mixed_frame))
- self.mixed_frame['foo'] = arr
- row_items = [(idx, list(self.mixed_frame.xs(idx)))
- for idx in self.mixed_frame.index]
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- recons = DataFrame.from_items(row_items,
- columns=self.mixed_frame.columns,
- orient='index')
- tm.assert_frame_equal(recons, self.mixed_frame)
- assert isinstance(recons['foo'][0], tuple)
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
- orient='index',
- columns=['one', 'two', 'three'])
- xp = DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'],
- columns=['one', 'two', 'three'])
- tm.assert_frame_equal(rs, xp)
- def test_constructor_from_items_scalars(self):
- # GH 17312
- msg = (r'The value in each \(key, value\) '
- 'pair must be an array, Series, or dict')
- with pytest.raises(ValueError, match=msg):
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- DataFrame.from_items([('A', 1), ('B', 4)])
- msg = (r'The value in each \(key, value\) '
- 'pair must be an array, Series, or dict')
- with pytest.raises(ValueError, match=msg):
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- DataFrame.from_items([('A', 1), ('B', 2)], columns=['col1'],
- orient='index')
- def test_from_items_deprecation(self):
- # GH 17320
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])])
- with tm.assert_produces_warning(FutureWarning,
- check_stacklevel=False):
- DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
- columns=['col1', 'col2', 'col3'],
- orient='index')
- def test_constructor_mix_series_nonseries(self):
- df = DataFrame({'A': self.frame['A'],
- 'B': list(self.frame['B'])}, columns=['A', 'B'])
- tm.assert_frame_equal(df, self.frame.loc[:, ['A', 'B']])
- msg = 'does not match index length'
- with pytest.raises(ValueError, match=msg):
- DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]})
- def test_constructor_miscast_na_int_dtype(self):
- df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)
- expected = DataFrame([[np.nan, 1], [1, 0]])
- tm.assert_frame_equal(df, expected)
- def test_constructor_column_duplicates(self):
- # it works! #2079
- df = DataFrame([[8, 5]], columns=['a', 'a'])
- edf = DataFrame([[8, 5]])
- edf.columns = ['a', 'a']
- tm.assert_frame_equal(df, edf)
- idf = DataFrame.from_records([(8, 5)],
- columns=['a', 'a'])
- tm.assert_frame_equal(idf, edf)
- pytest.raises(ValueError, DataFrame.from_dict,
- OrderedDict([('b', 8), ('a', 5), ('a', 6)]))
- def test_constructor_empty_with_string_dtype(self):
- # GH 9428
- expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
- df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
- tm.assert_frame_equal(df, expected)
- df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
- tm.assert_frame_equal(df, expected)
- df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_)
- tm.assert_frame_equal(df, expected)
- df = DataFrame(index=[0, 1], columns=[0, 1], dtype='U5')
- tm.assert_frame_equal(df, expected)
- def test_constructor_single_value(self):
- # expecting single value upcasting here
- df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c'])
- tm.assert_frame_equal(df,
- DataFrame(np.zeros(df.shape).astype('float64'),
- df.index, df.columns))
- df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c'])
- tm.assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'),
- df.index, df.columns))
- df = DataFrame('a', index=[1, 2], columns=['a', 'c'])
- tm.assert_frame_equal(df, DataFrame(np.array([['a', 'a'], ['a', 'a']],
- dtype=object),
- index=[1, 2], columns=['a', 'c']))
- pytest.raises(ValueError, DataFrame, 'a', [1, 2])
- pytest.raises(ValueError, DataFrame, 'a', columns=['a', 'c'])
- msg = 'incompatible data and dtype'
- with pytest.raises(TypeError, match=msg):
- DataFrame('a', [1, 2], ['a', 'c'], float)
- def test_constructor_with_datetimes(self):
- intname = np.dtype(np.int_).name
- floatname = np.dtype(np.float_).name
- datetime64name = np.dtype('M8[ns]').name
- objectname = np.dtype(np.object_).name
- # single item
- df = DataFrame({'A': 1, 'B': 'foo', 'C': 'bar',
- 'D': Timestamp("20010101"),
- 'E': datetime(2001, 1, 2, 0, 0)},
- index=np.arange(10))
- result = df.get_dtype_counts()
- expected = Series({'int64': 1, datetime64name: 2, objectname: 2})
- result.sort_index()
- expected.sort_index()
- tm.assert_series_equal(result, expected)
- # check with ndarray construction ndim==0 (e.g. we are passing a ndim 0
- # ndarray with a dtype specified)
- df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
- floatname: np.array(1., dtype=floatname),
- intname: np.array(1, dtype=intname)},
- index=np.arange(10))
- result = df.get_dtype_counts()
- expected = {objectname: 1}
- if intname == 'int64':
- expected['int64'] = 2
- else:
- expected['int64'] = 1
- expected[intname] = 1
- if floatname == 'float64':
- expected['float64'] = 2
- else:
- expected['float64'] = 1
- expected[floatname] = 1
- result = result.sort_index()
- expected = Series(expected).sort_index()
- tm.assert_series_equal(result, expected)
- # check with ndarray construction ndim>0
- df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
- floatname: np.array([1.] * 10, dtype=floatname),
- intname: np.array([1] * 10, dtype=intname)},
- index=np.arange(10))
- result = df.get_dtype_counts()
- result = result.sort_index()
- tm.assert_series_equal(result, expected)
- # GH 2809
- ind = date_range(start="2000-01-01", freq="D", periods=10)
- datetimes = [ts.to_pydatetime() for ts in ind]
- datetime_s = Series(datetimes)
- assert datetime_s.dtype == 'M8[ns]'
- df = DataFrame({'datetime_s': datetime_s})
- result = df.get_dtype_counts()
- expected = Series({datetime64name: 1})
- result = result.sort_index()
- expected = expected.sort_index()
- tm.assert_series_equal(result, expected)
- # GH 2810
- ind = date_range(start="2000-01-01", freq="D", periods=10)
- datetimes = [ts.to_pydatetime() for ts in ind]
- dates = [ts.date() for ts in ind]
- df = DataFrame({'datetimes': datetimes, 'dates': dates})
- result = df.get_dtype_counts()
- expected = Series({datetime64name: 1, objectname: 1})
- result = result.sort_index()
- expected = expected.sort_index()
- tm.assert_series_equal(result, expected)
- # GH 7594
- # don't coerce tz-aware
- import pytz
- tz = pytz.timezone('US/Eastern')
- dt = tz.localize(datetime(2012, 1, 1))
- df = DataFrame({'End Date': dt}, index=[0])
- assert df.iat[0, 0] == dt
- tm.assert_series_equal(df.dtypes, Series(
- {'End Date': 'datetime64[ns, US/Eastern]'}))
- df = DataFrame([{'End Date': dt}])
- assert df.iat[0, 0] == dt
- tm.assert_series_equal(df.dtypes, Series(
- {'End Date': 'datetime64[ns, US/Eastern]'}))
- # tz-aware (UTC and other tz's)
- # GH 8411
- dr = date_range('20130101', periods=3)
- df = DataFrame({'value': dr})
- assert df.iat[0, 0].tz is None
- dr = date_range('20130101', periods=3, tz='UTC')
- df = DataFrame({'value': dr})
- assert str(df.iat[0, 0].tz) == 'UTC'
- dr = date_range('20130101', periods=3, tz='US/Eastern')
- df = DataFrame({'value': dr})
- assert str(df.iat[0, 0].tz) == 'US/Eastern'
- # GH 7822
- # preserver an index with a tz on dict construction
- i = date_range('1/1/2011', periods=5, freq='10s', tz='US/Eastern')
- expected = DataFrame(
- {'a': i.to_series(keep_tz=True).reset_index(drop=True)})
- df = DataFrame()
- df['a'] = i
- tm.assert_frame_equal(df, expected)
- df = DataFrame({'a': i})
- tm.assert_frame_equal(df, expected)
- # multiples
- i_no_tz = date_range('1/1/2011', periods=5, freq='10s')
- df = DataFrame({'a': i, 'b': i_no_tz})
- expected = DataFrame({'a': i.to_series(keep_tz=True)
- .reset_index(drop=True), 'b': i_no_tz})
- tm.assert_frame_equal(df, expected)
- def test_constructor_datetimes_with_nulls(self):
- # gh-15869
- for arr in [np.array([None, None, None, None,
- datetime.now(), None]),
- np.array([None, None, datetime.now(), None])]:
- result = DataFrame(arr).get_dtype_counts()
- expected = Series({'datetime64[ns]': 1})
- tm.assert_series_equal(result, expected)
- def test_constructor_for_list_with_dtypes(self):
- # TODO(wesm): unused
- intname = np.dtype(np.int_).name # noqa
- floatname = np.dtype(np.float_).name # noqa
- datetime64name = np.dtype('M8[ns]').name
- objectname = np.dtype(np.object_).name
- # test list of lists/ndarrays
- df = DataFrame([np.arange(5) for x in range(5)])
- result = df.get_dtype_counts()
- expected = Series({'int64': 5})
- df = DataFrame([np.array(np.arange(5), dtype='int32')
- for x in range(5)])
- result = df.get_dtype_counts()
- expected = Series({'int32': 5})
- # overflow issue? (we always expecte int64 upcasting here)
- df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]})
- result = df.get_dtype_counts()
- expected = Series({'int64': 1})
- tm.assert_series_equal(result, expected)
- # GH #2751 (construction with no index specified), make sure we cast to
- # platform values
- df = DataFrame([1, 2])
- result = df.get_dtype_counts()
- expected = Series({'int64': 1})
- tm.assert_series_equal(result, expected)
- df = DataFrame([1., 2.])
- result = df.get_dtype_counts()
- expected = Series({'float64': 1})
- tm.assert_series_equal(result, expected)
- df = DataFrame({'a': [1, 2]})
- result = df.get_dtype_counts()
- expected = Series({'int64': 1})
- tm.assert_series_equal(result, expected)
- df = DataFrame({'a': [1., 2.]})
- result = df.get_dtype_counts()
- expected = Series({'float64': 1})
- tm.assert_series_equal(result, expected)
- df = DataFrame({'a': 1}, index=lrange(3))
- result = df.get_dtype_counts()
- expected = Series({'int64': 1})
- tm.assert_series_equal(result, expected)
- df = DataFrame({'a': 1.}, index=lrange(3))
- result = df.get_dtype_counts()
- expected = Series({'float64': 1})
- tm.assert_series_equal(result, expected)
- # with object list
- df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3],
- 'c': list('abcd'),
- 'd': [datetime(2000, 1, 1) for i in range(4)],
- 'e': [1., 2, 4., 7]})
- result = df.get_dtype_counts()
- expected = Series(
- {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1})
- result = result.sort_index()
- expected = expected.sort_index()
- tm.assert_series_equal(result, expected)
- def test_constructor_frame_copy(self):
- cop = DataFrame(self.frame, copy=True)
- cop['A'] = 5
- assert (cop['A'] == 5).all()
- assert not (self.frame['A'] == 5).all()
- def test_constructor_ndarray_copy(self):
- df = DataFrame(self.frame.values)
- self.frame.values[5] = 5
- assert (df.values[5] == 5).all()
- df = DataFrame(self.frame.values, copy=True)
- self.frame.values[6] = 6
- assert not (df.values[6] == 6).all()
- def test_constructor_series_copy(self):
- series = self.frame._series
- df = DataFrame({'A': series['A']})
- df['A'][:] = 5
- assert not (series['A'] == 5).all()
- def test_constructor_with_nas(self):
- # GH 5016
- # na's in indices
- def check(df):
- for i in range(len(df.columns)):
- df.iloc[:, i]
- indexer = np.arange(len(df.columns))[isna(df.columns)]
- # No NaN found -> error
- if len(indexer) == 0:
- def f():
- df.loc[:, np.nan]
- pytest.raises(TypeError, f)
- # single nan should result in Series
- elif len(indexer) == 1:
- tm.assert_series_equal(df.iloc[:, indexer[0]],
- df.loc[:, np.nan])
- # multiple nans should result in DataFrame
- else:
- tm.assert_frame_equal(df.iloc[:, indexer],
- df.loc[:, np.nan])
- df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan])
- check(df)
- df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan])
- check(df)
- df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]],
- columns=[np.nan, 1.1, 2.2, np.nan])
- check(df)
- df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]],
- columns=[np.nan, 1.1, 2.2, np.nan])
- check(df)
- # GH 21428 (non-unique columns)
- df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]],
- columns=[np.nan, 1, 2, 2])
- check(df)
- def test_constructor_lists_to_object_dtype(self):
- # from #1074
- d = DataFrame({'a': [np.nan, False]})
- assert d['a'].dtype == np.object_
- assert not d['a'][1]
- def test_constructor_categorical(self):
- # GH8626
- # dict creation
- df = DataFrame({'A': list('abc')}, dtype='category')
- expected = Series(list('abc'), dtype='category', name='A')
- tm.assert_series_equal(df['A'], expected)
- # to_frame
- s = Series(list('abc'), dtype='category')
- result = s.to_frame()
- expected = Series(list('abc'), dtype='category', name=0)
- tm.assert_series_equal(result[0], expected)
- result = s.to_frame(name='foo')
- expected = Series(list('abc'), dtype='category', name='foo')
- tm.assert_series_equal(result['foo'], expected)
- # list-like creation
- df = DataFrame(list('abc'), dtype='category')
- expected = Series(list('abc'), dtype='category', name=0)
- tm.assert_series_equal(df[0], expected)
- # ndim != 1
- df = DataFrame([Categorical(list('abc'))])
- expected = DataFrame({0: Series(list('abc'), dtype='category')})
- tm.assert_frame_equal(df, expected)
- df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))])
- expected = DataFrame({0: Series(list('abc'), dtype='category'),
- 1: Series(list('abd'), dtype='category')},
- columns=[0, 1])
- tm.assert_frame_equal(df, expected)
- # mixed
- df = DataFrame([Categorical(list('abc')), list('def')])
- expected = DataFrame({0: Series(list('abc'), dtype='category'),
- 1: list('def')}, columns=[0, 1])
- tm.assert_frame_equal(df, expected)
- # invalid (shape)
- pytest.raises(ValueError,
- lambda: DataFrame([Categorical(list('abc')),
- Categorical(list('abdefg'))]))
- # ndim > 1
- pytest.raises(NotImplementedError,
- lambda: Categorical(np.array([list('abcd')])))
- def test_constructor_categorical_series(self):
- items = [1, 2, 3, 1]
- exp = Series(items).astype('category')
- res = Series(items, dtype='category')
- tm.assert_series_equal(res, exp)
- items = ["a", "b", "c", "a"]
- exp = Series(items).astype('category')
- res = Series(items, dtype='category')
- tm.assert_series_equal(res, exp)
- # insert into frame with different index
- # GH 8076
- index = date_range('20000101', periods=3)
- expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
- categories=['a', 'b', 'c']))
- expected.index = index
- expected = DataFrame({'x': expected})
- df = DataFrame(
- {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index)
- tm.assert_frame_equal(df, expected)
- def test_from_records_to_records(self):
- # from numpy documentation
- arr = np.zeros((2,), dtype=('i4,f4,a10'))
- arr[:] = [(1, 2., 'Hello'), (2, 3., "World")]
- # TODO(wesm): unused
- frame = DataFrame.from_records(arr) # noqa
- index = pd.Index(np.arange(len(arr))[::-1])
- indexed_frame = DataFrame.from_records(arr, index=index)
- tm.assert_index_equal(indexed_frame.index, index)
- # without names, it should go to last ditch
- arr2 = np.zeros((2, 3))
- tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))
- # wrong length
- msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)'
- with pytest.raises(ValueError, match=msg):
- DataFrame.from_records(arr, index=index[:-1])
- indexed_frame = DataFrame.from_records(arr, index='f1')
- # what to do?
- records = indexed_frame.to_records()
- assert len(records.dtype.names) == 3
- records = indexed_frame.to_records(index=False)
- assert len(records.dtype.names) == 2
- assert 'index' not in records.dtype.names
- def test_from_records_nones(self):
- tuples = [(1, 2, None, 3),
- (1, 2, None, 3),
- (None, 2, 5, 3)]
- df = DataFrame.from_records(tuples, columns=['a', 'b', 'c', 'd'])
- assert np.isnan(df['c'][0])
- def test_from_records_iterator(self):
- arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6),
- (7., 7., 8, 8)],
- dtype=[('x', np.float64), ('u', np.float32),
- ('y', np.int64), ('z', np.int32)])
- df = DataFrame.from_records(iter(arr), nrows=2)
- xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64),
- 'u': np.array([1.0, 3.0], dtype=np.float32),
- 'y': np.array([2, 4], dtype=np.int64),
- 'z': np.array([2, 4], dtype=np.int32)})
- tm.assert_frame_equal(df.reindex_like(xp), xp)
- # no dtypes specified here, so just compare with the default
- arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)]
- df = DataFrame.from_records(iter(arr), columns=['x', 'y'],
- nrows=2)
- tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']),
- check_dtype=False)
- def test_from_records_tuples_generator(self):
- def tuple_generator(length):
- for i in range(length):
- letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
- yield (i, letters[i % len(letters)], i / length)
- columns_names = ['Integer', 'String', 'Float']
- columns = [[i[j] for i in tuple_generator(
- 10)] for j in range(len(columns_names))]
- data = {'Integer': columns[0],
- 'String': columns[1], 'Float': columns[2]}
- expected = DataFrame(data, columns=columns_names)
- generator = tuple_generator(10)
- result = DataFrame.from_records(generator, columns=columns_names)
- tm.assert_frame_equal(result, expected)
- def test_from_records_lists_generator(self):
- def list_generator(length):
- for i in range(length):
- letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
- yield [i, letters[i % len(letters)], i / length]
- columns_names = ['Integer', 'String', 'Float']
- columns = [[i[j] for i in list_generator(
- 10)] for j in range(len(columns_names))]
- data = {'Integer': columns[0],
- 'String': columns[1], 'Float': columns[2]}
- expected = DataFrame(data, columns=columns_names)
- generator = list_generator(10)
- result = DataFrame.from_records(generator, columns=columns_names)
- tm.assert_frame_equal(result, expected)
- def test_from_records_columns_not_modified(self):
- tuples = [(1, 2, 3),
- (1, 2, 3),
- (2, 5, 3)]
- columns = ['a', 'b', 'c']
- original_columns = list(columns)
- df = DataFrame.from_records(tuples, columns=columns, index='a') # noqa
- assert columns == original_columns
- def test_from_records_decimal(self):
- from decimal import Decimal
- tuples = [(Decimal('1.5'),), (Decimal('2.5'),), (None,)]
- df = DataFrame.from_records(tuples, columns=['a'])
- assert df['a'].dtype == object
- df = DataFrame.from_records(tuples, columns=['a'], coerce_float=True)
- assert df['a'].dtype == np.float64
- assert np.isnan(df['a'].values[-1])
- def test_from_records_duplicates(self):
- result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)],
- columns=['a', 'b', 'a'])
- expected = DataFrame([(1, 2, 3), (4, 5, 6)],
- columns=['a', 'b', 'a'])
- tm.assert_frame_equal(result, expected)
- def test_from_records_set_index_name(self):
- def create_dict(order_id):
- return {'order_id': order_id, 'quantity': np.random.randint(1, 10),
- 'price': np.random.randint(1, 10)}
- documents = [create_dict(i) for i in range(10)]
- # demo missing data
- documents.append({'order_id': 10, 'quantity': 5})
- result = DataFrame.from_records(documents, index='order_id')
- assert result.index.name == 'order_id'
- # MultiIndex
- result = DataFrame.from_records(documents,
- index=['order_id', 'quantity'])
- assert result.index.names == ('order_id', 'quantity')
- def test_from_records_misc_brokenness(self):
- # #2179
- data = {1: ['foo'], 2: ['bar']}
- result = DataFrame.from_records(data, columns=['a', 'b'])
- exp = DataFrame(data, columns=['a', 'b'])
- tm.assert_frame_equal(result, exp)
- # overlap in index/index_names
- data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
- result = DataFrame.from_records(data, index=['a', 'b', 'c'])
- exp = DataFrame(data, index=['a', 'b', 'c'])
- tm.assert_frame_equal(result, exp)
- # GH 2623
- rows = []
- rows.append([datetime(2010, 1, 1), 1])
- rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj
- df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
- results = df2_obj.get_dtype_counts()
- expected = Series({'datetime64[ns]': 1, 'object': 1})
- rows = []
- rows.append([datetime(2010, 1, 1), 1])
- rows.append([datetime(2010, 1, 2), 1])
- df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
- results = df2_obj.get_dtype_counts().sort_index()
- expected = Series({'datetime64[ns]': 1, 'int64': 1})
- tm.assert_series_equal(results, expected)
- def test_from_records_empty(self):
- # 3562
- result = DataFrame.from_records([], columns=['a', 'b', 'c'])
- expected = DataFrame(columns=['a', 'b', 'c'])
- tm.assert_frame_equal(result, expected)
- result = DataFrame.from_records([], columns=['a', 'b', 'b'])
- expected = DataFrame(columns=['a', 'b', 'b'])
- tm.assert_frame_equal(result, expected)
- def test_from_records_empty_with_nonempty_fields_gh3682(self):
- a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)])
- df = DataFrame.from_records(a, index='id')
- tm.assert_index_equal(df.index, Index([1], name='id'))
- assert df.index.name == 'id'
- tm.assert_index_equal(df.columns, Index(['value']))
- b = np.array([], dtype=[('id', np.int64), ('value', np.int64)])
- df = DataFrame.from_records(b, index='id')
- tm.assert_index_equal(df.index, Index([], name='id'))
- assert df.index.name == 'id'
- def test_from_records_with_datetimes(self):
- # this may fail on certain platforms because of a numpy issue
- # related GH6140
- if not is_platform_little_endian():
- pytest.skip("known failure of test on non-little endian")
- # construction with a null in a recarray
- # GH 6140
- expected = DataFrame({'EXPIRY': [datetime(2005, 3, 1, 0, 0), None]})
- arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
- dtypes = [('EXPIRY', '<M8[ns]')]
- try:
- recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
- except (ValueError):
- pytest.skip("known failure of numpy rec array creation")
- result = DataFrame.from_records(recarray)
- tm.assert_frame_equal(result, expected)
- # coercion should work too
- arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])]
- dtypes = [('EXPIRY', '<M8[m]')]
- recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
- result = DataFrame.from_records(recarray)
- tm.assert_frame_equal(result, expected)
- def test_from_records_sequencelike(self):
- df = DataFrame({'A': np.array(np.random.randn(6), dtype=np.float64),
- 'A1': np.array(np.random.randn(6), dtype=np.float64),
- 'B': np.array(np.arange(6), dtype=np.int64),
- 'C': ['foo'] * 6,
- 'D': np.array([True, False] * 3, dtype=bool),
- 'E': np.array(np.random.randn(6), dtype=np.float32),
- 'E1': np.array(np.random.randn(6), dtype=np.float32),
- 'F': np.array(np.arange(6), dtype=np.int32)})
- # this is actually tricky to create the recordlike arrays and
- # have the dtypes be intact
- blocks = df._to_dict_of_blocks()
- tuples = []
- columns = []
- dtypes = []
- for dtype, b in compat.iteritems(blocks):
- columns.extend(b.columns)
- dtypes.extend([(c, np.dtype(dtype).descr[0][1])
- for c in b.columns])
- for i in range(len(df.index)):
- tup = []
- for _, b in compat.iteritems(blocks):
- tup.extend(b.iloc[i].values)
- tuples.append(tuple(tup))
- recarray = np.array(tuples, dtype=dtypes).view(np.recarray)
- recarray2 = df.to_records()
- lists = [list(x) for x in tuples]
- # tuples (lose the dtype info)
- result = (DataFrame.from_records(tuples, columns=columns)
- .reindex(columns=df.columns))
- # created recarray and with to_records recarray (have dtype info)
- result2 = (DataFrame.from_records(recarray, columns=columns)
- .reindex(columns=df.columns))
- result3 = (DataFrame.from_records(recarray2, columns=columns)
- .reindex(columns=df.columns))
- # list of tupels (no dtype info)
- result4 = (DataFrame.from_records(lists, columns=columns)
- .reindex(columns=df.columns))
- tm.assert_frame_equal(result, df, check_dtype=False)
- tm.assert_frame_equal(result2, df)
- tm.assert_frame_equal(result3, df)
- tm.assert_frame_equal(result4, df, check_dtype=False)
- # tuples is in the order of the columns
- result = DataFrame.from_records(tuples)
- tm.assert_index_equal(result.columns, pd.Index(lrange(8)))
- # test exclude parameter & we are casting the results here (as we don't
- # have dtype info to recover)
- columns_to_test = [columns.index('C'), columns.index('E1')]
- exclude = list(set(range(8)) - set(columns_to_test))
- result = DataFrame.from_records(tuples, exclude=exclude)
- result.columns = [columns[i] for i in sorted(columns_to_test)]
- tm.assert_series_equal(result['C'], df['C'])
- tm.assert_series_equal(result['E1'], df['E1'].astype('float64'))
- # empty case
- result = DataFrame.from_records([], columns=['foo', 'bar', 'baz'])
- assert len(result) == 0
- tm.assert_index_equal(result.columns,
- pd.Index(['foo', 'bar', 'baz']))
- result = DataFrame.from_records([])
- assert len(result) == 0
- assert len(result.columns) == 0
- def test_from_records_dictlike(self):
- # test the dict methods
- df = DataFrame({'A': np.array(np.random.randn(6), dtype=np.float64),
- 'A1': np.array(np.random.randn(6), dtype=np.float64),
- 'B': np.array(np.arange(6), dtype=np.int64),
- 'C': ['foo'] * 6,
- 'D': np.array([True, False] * 3, dtype=bool),
- 'E': np.array(np.random.randn(6), dtype=np.float32),
- 'E1': np.array(np.random.randn(6), dtype=np.float32),
- 'F': np.array(np.arange(6), dtype=np.int32)})
- # columns is in a different order here than the actual items iterated
- # from the dict
- blocks = df._to_dict_of_blocks()
- columns = []
- for dtype, b in compat.iteritems(blocks):
- columns.extend(b.columns)
- asdict = {x: y for x, y in compat.iteritems(df)}
- asdict2 = {x: y.values for x, y in compat.iteritems(df)}
- # dict of series & dict of ndarrays (have dtype info)
- results = []
- results.append(DataFrame.from_records(
- asdict).reindex(columns=df.columns))
- results.append(DataFrame.from_records(asdict, columns=columns)
- .reindex(columns=df.columns))
- results.append(DataFrame.from_records(asdict2, columns=columns)
- .reindex(columns=df.columns))
- for r in results:
- tm.assert_frame_equal(r, df)
- def test_from_records_with_index_data(self):
- df = DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
- data = np.random.randn(10)
- df1 = DataFrame.from_records(df, index=data)
- tm.assert_index_equal(df1.index, Index(data))
- def test_from_records_bad_index_column(self):
- df = DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
- # should pass
- df1 = DataFrame.from_records(df, index=['C'])
- tm.assert_index_equal(df1.index, Index(df.C))
- df1 = DataFrame.from_records(df, index='C')
- tm.assert_index_equal(df1.index, Index(df.C))
- # should fail
- pytest.raises(ValueError, DataFrame.from_records, df, index=[2])
- pytest.raises(KeyError, DataFrame.from_records, df, index=2)
- def test_from_records_non_tuple(self):
- class Record(object):
- def __init__(self, *args):
- self.args = args
- def __getitem__(self, i):
- return self.args[i]
- def __iter__(self):
- return iter(self.args)
- recs = [Record(1, 2, 3), Record(4, 5, 6), Record(7, 8, 9)]
- tups = lmap(tuple, recs)
- result = DataFrame.from_records(recs)
- expected = DataFrame.from_records(tups)
- tm.assert_frame_equal(result, expected)
- def test_from_records_len0_with_columns(self):
- # #2633
- result = DataFrame.from_records([], index='foo',
- columns=['foo', 'bar'])
- expected = Index(['bar'])
- assert len(result) == 0
- assert result.index.name == 'foo'
- tm.assert_index_equal(result.columns, expected)
- def test_to_frame_with_falsey_names(self):
- # GH 16114
- result = Series(name=0).to_frame().dtypes
- expected = Series({0: np.float64})
- tm.assert_series_equal(result, expected)
- result = DataFrame(Series(name=0)).dtypes
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
- def test_constructor_range_dtype(self, dtype):
- # GH 16804
- expected = DataFrame({'A': [0, 1, 2, 3, 4]}, dtype=dtype or 'int64')
- result = DataFrame({'A': range(5)}, dtype=dtype)
- tm.assert_frame_equal(result, expected)
- def test_frame_from_list_subclass(self):
- # GH21226
- class List(list):
- pass
- expected = DataFrame([[1, 2, 3], [4, 5, 6]])
- result = DataFrame(List([List([1, 2, 3]), List([4, 5, 6])]))
- tm.assert_frame_equal(result, expected)
- class TestDataFrameConstructorWithDatetimeTZ(TestData):
- def test_from_dict(self):
- # 8260
- # support datetime64 with tz
- idx = Index(date_range('20130101', periods=3, tz='US/Eastern'),
- name='foo')
- dr = date_range('20130110', periods=3)
- # construction
- df = DataFrame({'A': idx, 'B': dr})
- assert df['A'].dtype, 'M8[ns, US/Eastern'
- assert df['A'].name == 'A'
- tm.assert_series_equal(df['A'], Series(idx, name='A'))
- tm.assert_series_equal(df['B'], Series(dr, name='B'))
- def test_from_index(self):
- # from index
- idx2 = date_range('20130101', periods=3, tz='US/Eastern', name='foo')
- df2 = DataFrame(idx2)
- tm.assert_series_equal(df2['foo'], Series(idx2, name='foo'))
- df2 = DataFrame(Series(idx2))
- tm.assert_series_equal(df2['foo'], Series(idx2, name='foo'))
- idx2 = date_range('20130101', periods=3, tz='US/Eastern')
- df2 = DataFrame(idx2)
- tm.assert_series_equal(df2[0], Series(idx2, name=0))
- df2 = DataFrame(Series(idx2))
- tm.assert_series_equal(df2[0], Series(idx2, name=0))
- def test_frame_dict_constructor_datetime64_1680(self):
- dr = date_range('1/1/2012', periods=10)
- s = Series(dr, index=dr)
- # it works!
- DataFrame({'a': 'foo', 'b': s}, index=dr)
- DataFrame({'a': 'foo', 'b': s.values}, index=dr)
- def test_frame_datetime64_mixed_index_ctor_1681(self):
- dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
- ts = Series(dr)
- # it works!
- d = DataFrame({'A': 'foo', 'B': ts}, index=dr)
- assert d['B'].isna().all()
- def test_frame_timeseries_to_records(self):
- index = date_range('1/1/2000', periods=10)
- df = DataFrame(np.random.randn(10, 3), index=index,
- columns=['a', 'b', 'c'])
- result = df.to_records()
- result['index'].dtype == 'M8[ns]'
- result = df.to_records(index=False)
- def test_frame_timeseries_column(self):
- # GH19157
- dr = date_range(start='20130101T10:00:00', periods=3, freq='T',
- tz='US/Eastern')
- result = DataFrame(dr, columns=['timestamps'])
- expected = DataFrame({'timestamps': [
- Timestamp('20130101T10:00:00', tz='US/Eastern'),
- Timestamp('20130101T10:01:00', tz='US/Eastern'),
- Timestamp('20130101T10:02:00', tz='US/Eastern')]})
- tm.assert_frame_equal(result, expected)
- def test_nested_dict_construction(self):
- # GH22227
- columns = ['Nevada', 'Ohio']
- pop = {'Nevada': {2001: 2.4, 2002: 2.9},
- 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
- result = pd.DataFrame(pop, index=[2001, 2002, 2003], columns=columns)
- expected = pd.DataFrame(
- [(2.4, 1.7), (2.9, 3.6), (np.nan, np.nan)],
- columns=columns,
- index=pd.Index([2001, 2002, 2003])
- )
- tm.assert_frame_equal(result, expected)
|