12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499 |
- # coding=utf-8
- # pylint: disable-msg=E1101,W0612
- from distutils.version import LooseVersion
- from itertools import product
- import operator
- import numpy as np
- from numpy import nan
- import pytest
- from pandas.compat import PY35, lrange, range
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import (
- Categorical, CategoricalIndex, DataFrame, Series, compat, date_range, isna,
- notna)
- from pandas.api.types import is_scalar
- from pandas.core.index import MultiIndex
- from pandas.core.indexes.datetimes import Timestamp
- import pandas.util.testing as tm
- from pandas.util.testing import (
- assert_almost_equal, assert_frame_equal, assert_index_equal,
- assert_series_equal)
- class TestSeriesAnalytics(object):
def test_describe(self):
    """Check ``Series.describe`` for integer, boolean and string data."""
    object_labels = ['count', 'unique', 'top', 'freq']
    numeric_labels = ['count', 'mean', 'std', 'min',
                      '25%', '50%', '75%', 'max']

    # integers get the numeric summary
    ints = Series([0, 1, 2, 3, 4], name='int_data')
    described = ints.describe()
    wanted = Series([5, 2, ints.std(), 0, 1, 2, 3, 4],
                    name='int_data', index=numeric_labels)
    assert_series_equal(described, wanted)

    # booleans get the count/unique/top/freq summary
    flags = Series([True, True, False, False, False], name='bool_data')
    described = flags.describe()
    wanted = Series([5, 2, False, 3], name='bool_data',
                    index=object_labels)
    assert_series_equal(described, wanted)

    # strings get the same four-entry summary
    letters = Series(['a', 'a', 'b', 'c', 'd'], name='str_data')
    described = letters.describe()
    wanted = Series([5, 4, 'a', 2], name='str_data',
                    index=object_labels)
    assert_series_equal(described, wanted)
def test_describe_with_tz(self, tz_naive_fixture):
    """Describe a five-day datetime Series built with the fixture's tz."""
    # GH 21332
    tz = tz_naive_fixture
    name = str(tz_naive_fixture)
    first_day = Timestamp(2018, 1, 1)
    last_day = Timestamp(2018, 1, 5)
    ser = Series(date_range(first_day, last_day, tz=tz), name=name)

    described = ser.describe()

    labels = ['count', 'unique', 'top', 'freq', 'first', 'last']
    values = [
        5,
        5,
        ser.value_counts().index[0],
        1,
        first_day.tz_localize(tz),
        last_day.tz_localize(tz),
    ]
    wanted = Series(values, name=name, index=labels)
    assert_series_equal(described, wanted)
def test_argsort(self, datetime_series):
    """Check ``argsort`` dtype and its handling of a trailing NaT."""
    self._check_accum_op('argsort', datetime_series, check_dtype=False)
    order = datetime_series.argsort()
    assert issubclass(order.dtype.type, np.integer)

    # GH 2967 (introduced bug in 0.11-dev I think)
    stamps = Series([Timestamp('201301%02d' % (day + 1))
                     for day in range(5)])
    assert stamps.dtype == 'datetime64[ns]'
    shifted = stamps.shift(-1)
    assert shifted.dtype == 'datetime64[ns]'
    assert isna(shifted[4])

    tm.assert_series_equal(stamps.argsort(),
                           Series(lrange(5), dtype='int64'))

    # the missing last entry is expected to be reported as -1
    tm.assert_series_equal(shifted.argsort(),
                           Series(lrange(4) + [-1], dtype='int64'))
def test_argsort_stable(self):
    """Compare ``Series.argsort`` with ``np.argsort`` for both sort kinds."""
    ser = Series(np.random.randint(0, 100, size=10000))
    from_merge = ser.argsort(kind='mergesort')
    from_default = ser.argsort()
    np_merge = np.argsort(ser.values, kind='mergesort')
    np_quick = np.argsort(ser.values, kind='quicksort')

    assert_series_equal(from_merge, Series(np_merge), check_dtype=False)
    assert_series_equal(from_default, Series(np_quick), check_dtype=False)

    # assert_numpy_array_equal must reject Series arguments
    msg = (r"ndarray Expected type <(class|type) 'numpy\.ndarray'>,"
           r" found <class 'pandas\.core\.series\.Series'> instead")
    with pytest.raises(AssertionError, match=msg):
        tm.assert_numpy_array_equal(from_default, from_merge)
def test_cumsum(self, datetime_series):
    """Run the shared accumulation check for ``cumsum``."""
    self._check_accum_op(name='cumsum', datetime_series_=datetime_series)
def test_cumprod(self, datetime_series):
    """Run the shared accumulation check for ``cumprod``."""
    self._check_accum_op(name='cumprod', datetime_series_=datetime_series)
def test_cummin(self, datetime_series):
    """Check ``Series.cummin`` against ``np.minimum.accumulate``.

    Covers the fixture as given and a copy in which every other value
    has been replaced by NaN.
    """
    tm.assert_numpy_array_equal(
        datetime_series.cummin().values,
        np.minimum.accumulate(np.array(datetime_series)))

    ts = datetime_series.copy()
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
    ts[::2] = np.nan
    result = ts.cummin()[1::2]
    expected = np.minimum.accumulate(ts.dropna())
    tm.assert_series_equal(result, expected)
def test_cummax(self, datetime_series):
    """Check ``Series.cummax`` against ``np.maximum.accumulate``.

    Covers the fixture as given and a copy in which every other value
    has been replaced by NaN.
    """
    tm.assert_numpy_array_equal(
        datetime_series.cummax().values,
        np.maximum.accumulate(np.array(datetime_series)))

    ts = datetime_series.copy()
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
    ts[::2] = np.nan
    result = ts.cummax()[1::2]
    expected = np.maximum.accumulate(ts.dropna())
    tm.assert_series_equal(result, expected)
def test_cummin_datetime64(self):
    """``cummin`` on datetime64 data for ``skipna=True`` and ``False``."""
    ser = Series(pd.to_datetime(
        ['NaT', '2000-1-2', 'NaT', '2000-1-1', 'NaT', '2000-1-3']))

    # skipna=True: expected output keeps NaT at the input's NaT slots
    wanted = Series(pd.to_datetime(
        ['NaT', '2000-1-2', 'NaT', '2000-1-1', 'NaT', '2000-1-1']))
    assert_series_equal(wanted, ser.cummin(skipna=True))

    # skipna=False: expected output carries the running minimum forward
    wanted = Series(pd.to_datetime(
        ['NaT', '2000-1-2', '2000-1-2',
         '2000-1-1', '2000-1-1', '2000-1-1']))
    assert_series_equal(wanted, ser.cummin(skipna=False))
def test_cummax_datetime64(self):
    """``cummax`` on datetime64 data for ``skipna=True`` and ``False``."""
    ser = Series(pd.to_datetime(
        ['NaT', '2000-1-2', 'NaT', '2000-1-1', 'NaT', '2000-1-3']))

    # skipna=True: expected output keeps NaT at the input's NaT slots
    wanted = Series(pd.to_datetime(
        ['NaT', '2000-1-2', 'NaT', '2000-1-2', 'NaT', '2000-1-3']))
    assert_series_equal(wanted, ser.cummax(skipna=True))

    # skipna=False: expected output carries the running maximum forward
    wanted = Series(pd.to_datetime(
        ['NaT', '2000-1-2', '2000-1-2',
         '2000-1-2', '2000-1-2', '2000-1-3']))
    assert_series_equal(wanted, ser.cummax(skipna=False))
def test_cummin_timedelta64(self):
    """``cummin`` on timedelta64 data for ``skipna=True`` and ``False``."""
    ser = Series(pd.to_timedelta(
        ['NaT', '2 min', 'NaT', '1 min', 'NaT', '3 min']))

    # skipna=True: expected output keeps NaT at the input's NaT slots
    wanted = Series(pd.to_timedelta(
        ['NaT', '2 min', 'NaT', '1 min', 'NaT', '1 min']))
    assert_series_equal(wanted, ser.cummin(skipna=True))

    # skipna=False: expected output carries the running minimum forward
    wanted = Series(pd.to_timedelta(
        ['NaT', '2 min', '2 min', '1 min', '1 min', '1 min']))
    assert_series_equal(wanted, ser.cummin(skipna=False))
def test_cummax_timedelta64(self):
    """``cummax`` on timedelta64 data for ``skipna=True`` and ``False``."""
    ser = Series(pd.to_timedelta(
        ['NaT', '2 min', 'NaT', '1 min', 'NaT', '3 min']))

    # skipna=True: expected output keeps NaT at the input's NaT slots
    wanted = Series(pd.to_timedelta(
        ['NaT', '2 min', 'NaT', '2 min', 'NaT', '3 min']))
    assert_series_equal(wanted, ser.cummax(skipna=True))

    # skipna=False: expected output carries the running maximum forward
    wanted = Series(pd.to_timedelta(
        ['NaT', '2 min', '2 min', '2 min', '2 min', '3 min']))
    assert_series_equal(wanted, ser.cummax(skipna=False))
def test_npdiff(self):
    """Always skipped; the statements after the skip never run."""
    pytest.skip("skipping due to Series no longer being an ndarray")

    # no longer works as the return type of np.diff is now nd.array
    ser = Series(np.arange(5))
    diffed = np.diff(ser)
    tm.assert_series_equal(Series([np.nan, 0, 0, 0, np.nan]), diffed)
- def _check_accum_op(self, name, datetime_series_, check_dtype=True):
- func = getattr(np, name)
- tm.assert_numpy_array_equal(func(datetime_series_).values,
- func(np.array(datetime_series_)),
- check_dtype=check_dtype)
- # with missing values
- ts = datetime_series_.copy()
- ts[::2] = np.NaN
- result = func(ts)[1::2]
- expected = func(np.array(ts.dropna()))
- tm.assert_numpy_array_equal(result.values, expected,
- check_dtype=False)
def test_compress(self):
    """``Series.compress`` must warn and keep only the masked-in rows."""
    mask = [True, False, True, False, False]
    ser = Series([1, -1, 5, 8, 7], index=list('abcde'), name='foo')
    wanted = Series(ser.values.compress(mask),
                    index=list('ac'), name='foo')

    with tm.assert_produces_warning(FutureWarning):
        got = ser.compress(mask)

    assert_series_equal(got, wanted)
def test_numpy_compress(self):
    """``np.compress`` on a Series: result, warning, rejected kwargs."""
    mask = [True, False, True, False, False]
    ser = Series([1, -1, 5, 8, 7], index=list('abcde'), name='foo')
    wanted = Series(ser.values.compress(mask),
                    index=list('ac'), name='foo')

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        assert_series_equal(np.compress(mask, ser), wanted)

    # NOTE(review): nesting reconstructed -- both rejected-keyword checks
    # are assumed to sit inside the second warning context.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        with pytest.raises(ValueError,
                           match="the 'axis' parameter is not supported"):
            np.compress(mask, ser, axis=1)

        with pytest.raises(ValueError,
                           match="the 'out' parameter is not supported"):
            np.compress(mask, ser, out=ser)
def test_round(self, datetime_series):
    """``Series.round(2)`` matches ``np.round`` and keeps index and name."""
    datetime_series.index.name = "index_name"
    rounded = datetime_series.round(2)
    wanted = Series(np.round(datetime_series.values, 2),
                    index=datetime_series.index,
                    name='ts')
    tm.assert_series_equal(rounded, wanted)
    assert rounded.name == datetime_series.name
def test_numpy_round(self):
    """``np.round`` on a Series returns a Series; ``out=`` is rejected."""
    # See gh-12600
    ser = Series([1.53, 1.36, 0.06])
    tm.assert_series_equal(np.round(ser, decimals=0),
                           Series([2., 1., 0.]))

    with pytest.raises(ValueError,
                       match="the 'out' parameter is not supported"):
        np.round(ser, decimals=0, out=ser)
def test_built_in_round(self):
    """The builtin ``round`` works on a Series, with and without digits."""
    if not compat.PY3:
        pytest.skip('build in round cannot be overridden prior to Python 3')

    ser = Series([1.123, 2.123, 3.123], index=lrange(3))

    # no digits argument
    whole = Series([1., 2., 3.], index=lrange(3))
    assert_series_equal(round(ser), whole)

    # two decimal places
    two_places = Series([1.12, 2.12, 3.12], index=lrange(3))
    assert_series_equal(round(ser, 2), two_places)
def test_prod_numpy16_bug(self):
    """``Series.prod`` must not hand back a Series."""
    ones = Series([1., 1., 1.], index=lrange(3))
    product_value = ones.prod()
    assert not isinstance(product_value, Series)
@td.skip_if_no_scipy
def test_corr(self, datetime_series):
    """Default ``Series.corr`` across overlap cases and against scipy."""
    from scipy import stats

    head = datetime_series[:15]
    tail = datetime_series[5:]

    # full overlap
    assert_almost_equal(datetime_series.corr(datetime_series), 1)

    # partial overlap
    assert_almost_equal(head.corr(tail), 1)

    # too few shared observations for min_periods=12
    assert isna(head.corr(tail, min_periods=12))
    padded_head = head.reindex(datetime_series.index)
    padded_tail = tail.reindex(datetime_series.index)
    assert isna(padded_head.corr(padded_tail, min_periods=12))

    # No overlap
    assert np.isnan(datetime_series[::2].corr(datetime_series[1::2]))

    # all NA
    blank = datetime_series[:10].copy()
    blank[:] = nan
    assert isna(blank.corr(blank))

    # two generated series, compared with scipy's pearsonr
    A = tm.makeTimeSeries()
    B = tm.makeTimeSeries()
    ours = A.corr(B)
    theirs, _ = stats.pearsonr(A, B)
    assert_almost_equal(ours, theirs)
@td.skip_if_no_scipy
def test_corr_rank(self):
    """Kendall and Spearman ``corr`` against scipy and fixed R results."""
    import scipy
    from scipy import stats

    # kendall and spearman
    A = tm.makeTimeSeries()
    B = tm.makeTimeSeries()
    A[-5:] = A[:5]

    assert_almost_equal(A.corr(B, method='kendall'),
                        stats.kendalltau(A, B)[0])
    assert_almost_equal(A.corr(B, method='spearman'),
                        stats.spearmanr(A, B)[0])

    # these methods got rewritten in 0.8
    too_old = LooseVersion(scipy.__version__) < LooseVersion('0.9')
    if too_old:
        pytest.skip("skipping corr rank because of scipy version "
                    "{0}".format(scipy.__version__))

    # results from R
    A = Series([
        -0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310,
        -0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606,
    ])
    B = Series([
        -1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292,
        1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375,
    ])
    reference = [('kendall', 0.4319297), ('spearman', 0.5853767)]
    for method, wanted in reference:
        assert_almost_equal(A.corr(B, method=method), wanted)
def test_corr_invalid_method(self):
    """An unknown ``method`` string must raise ``ValueError``."""
    # GH PR #22298
    left = Series(np.random.randn(10))
    right = Series(np.random.randn(10))
    msg = "method must be either 'pearson', 'spearman', or 'kendall'"
    with pytest.raises(ValueError, match=msg):
        left.corr(right, method="____")
def test_corr_callable_method(self, datetime_series):
    """``corr`` with a user-supplied callable as ``method``."""

    def exact_match(a, b):
        # simple correlation example:
        # returns 1 if exact equality, 0 otherwise
        if (a == b).all():
            return 1.
        return 0.

    # simple example
    rising = Series([1, 2, 3, 4, 5])
    falling = Series([5, 4, 3, 2, 1])
    assert_almost_equal(rising.corr(falling, method=exact_match), 0)

    # full overlap
    assert_almost_equal(
        datetime_series.corr(datetime_series, method=exact_match), 1.)

    # partial overlap
    assert_almost_equal(
        datetime_series[:15].corr(datetime_series[5:],
                                  method=exact_match),
        1.)

    # No overlap
    evens = datetime_series[::2]
    odds = datetime_series[1::2]
    assert np.isnan(evens.corr(odds, method=exact_match))

    # dataframe example
    frame = DataFrame([rising, falling])
    wanted = DataFrame([{0: 1., 1: 0}, {0: 0, 1: 1.}])
    assert_almost_equal(frame.transpose().corr(method=exact_match), wanted)
def test_cov(self, datetime_series):
    """``Series.cov`` across overlap, all-NA and ``min_periods`` cases."""
    head = datetime_series[:15]
    tail = datetime_series[5:]

    # full overlap
    assert_almost_equal(datetime_series.cov(datetime_series),
                        datetime_series.std() ** 2)

    # partial overlap
    assert_almost_equal(head.cov(tail),
                        datetime_series[5:15].std() ** 2)

    # No overlap
    assert np.isnan(datetime_series[::2].cov(datetime_series[1::2]))

    # all NA
    blank = datetime_series[:10].copy()
    blank[:] = nan
    assert isna(blank.cov(blank))

    # min_periods
    assert isna(head.cov(tail, min_periods=12))
    padded_head = head.reindex(datetime_series.index)
    padded_tail = tail.reindex(datetime_series.index)
    assert isna(padded_head.cov(padded_tail, min_periods=12))
def test_count(self, datetime_series):
    """Check ``Series.count`` with NaN values and with ``level=``.

    Note that the ``datetime_series`` fixture is modified in place.
    """
    assert datetime_series.count() == len(datetime_series)

    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
    datetime_series[::2] = np.nan
    assert datetime_series.count() == np.isfinite(datetime_series).sum()

    # counting per value of the second MultiIndex level, which
    # itself contains a NaN label
    mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
    ts = Series(np.arange(len(mi)), index=mi)
    left = ts.count(level=1)
    right = Series([2, 3, 1], index=[1, 2, nan])
    assert_series_equal(left, right)

    # blank one value in each level-1 group: every count drops by one
    ts.iloc[[0, 3, 5]] = nan
    assert_series_equal(ts.count(level=1), right - 1)
def test_dot(self):
    """Check ``Series.dot`` with DataFrame, ndarray and Series operands.

    Also checks the errors raised for a length mismatch and for
    operands whose labels do not line up.
    """
    a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
    b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'],
                  columns=['p', 'q', 'r', 's']).T

    result = a.dot(b)
    expected = Series(np.dot(a.values, b.values), index=['1', '2', '3'])
    assert_series_equal(result, expected)

    # Check index alignment: b2 has b's rows in reverse order, so the
    # product only matches if dot() aligns on labels.  The reordered
    # frame b2 (not b) must be the operand here.
    b2 = b.reindex(index=reversed(b.index))
    result = a.dot(b2)
    assert_series_equal(result, expected)

    # Check ndarray argument
    result = a.dot(b.values)
    assert np.all(result == expected.values)
    assert_almost_equal(a.dot(b['2'].values), expected['2'])

    # Check series argument
    assert_almost_equal(a.dot(b['1']), expected['1'])
    assert_almost_equal(a.dot(b2['1']), expected['1'])

    msg = r"Dot product shape mismatch, \(4L?,\) vs \(3L?,\)"
    # exception raised is of type Exception
    with pytest.raises(Exception, match=msg):
        a.dot(a.values[:3])

    msg = "matrices are not aligned"
    with pytest.raises(ValueError, match=msg):
        a.dot(b.T)
@pytest.mark.skipif(not PY35,
                    reason='matmul supported for Python>=3.5')
def test_matmul(self):
    """``operator.matmul`` with a Series on either side of the operator."""
    # matmul test is for GH #10259
    labels = ['1', '2', '3']
    a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
    b = DataFrame(np.random.randn(3, 4), index=labels,
                  columns=['p', 'q', 'r', 's']).T
    wide = b.T

    # Series @ DataFrame
    got = operator.matmul(a, b)
    assert_series_equal(got, Series(np.dot(a.values, b.values),
                                    index=labels))

    # DataFrame @ Series
    got = operator.matmul(wide, a)
    assert_series_equal(got, Series(np.dot(wide.values, a.T.values),
                                    index=labels))

    # Series @ Series
    got = operator.matmul(a, a)
    assert_almost_equal(got, np.dot(a.values, a.values))

    # GH 21530
    # vector (1D np.array) @ Series (__rmatmul__)
    got = operator.matmul(a.values, a)
    assert_almost_equal(got, np.dot(a.values, a.values))

    # GH 21530
    # vector (1D list) @ Series (__rmatmul__)
    got = operator.matmul(a.values.tolist(), a)
    assert_almost_equal(got, np.dot(a.values, a.values))

    # GH 21530
    # matrix (2D np.array) @ Series (__rmatmul__)
    got = operator.matmul(wide.values, a)
    assert_almost_equal(got, np.dot(wide.values, a.values))

    # GH 21530
    # matrix (2D nested lists) @ Series (__rmatmul__)
    got = operator.matmul(wide.values.tolist(), a)
    assert_almost_equal(got, np.dot(wide.values, a.values))

    # mixed dtype DataFrame @ Series
    a['p'] = int(a.p)
    got = operator.matmul(wide, a)
    assert_series_equal(got, Series(np.dot(wide.values, a.T.values),
                                    index=labels))

    # different dtypes DataFrame @ Series
    a = a.astype(int)
    got = operator.matmul(wide, a)
    assert_series_equal(got, Series(np.dot(wide.values, a.T.values),
                                    index=labels))

    # exception raised is of type Exception
    with pytest.raises(
            Exception,
            match=r"Dot product shape mismatch, \(4,\) vs \(3,\)"):
        a.dot(a.values[:3])

    with pytest.raises(ValueError, match="matrices are not aligned"):
        a.dot(wide)
def test_clip(self, datetime_series):
    """Clip at the median and at fixed bounds; compare with ``np.clip``."""
    median = datetime_series.median()

    # clip_lower / clip_upper are expected to emit FutureWarning
    with tm.assert_produces_warning(FutureWarning):
        assert datetime_series.clip_lower(median).min() == median
    with tm.assert_produces_warning(FutureWarning):
        assert datetime_series.clip_upper(median).max() == median

    assert datetime_series.clip(lower=median).min() == median
    assert datetime_series.clip(upper=median).max() == median

    bounded = datetime_series.clip(-0.5, 0.5)
    via_numpy = np.clip(datetime_series, -0.5, 0.5)
    tm.assert_series_equal(bounded, via_numpy)
    assert isinstance(via_numpy, Series)
def test_clip_types_and_nulls(self):
    """Deprecated clip helpers on float, object and datetime Series."""
    candidates = [
        Series([nan, 1.0, 2.0, 3.0]),
        Series([None, 'a', 'b', 'c']),
        Series(pd.to_datetime([nan, 1, 2, 3], unit='D')),
    ]

    for ser in candidates:
        thresh = ser[2]
        with tm.assert_produces_warning(FutureWarning):
            lower = ser.clip_lower(thresh)
        with tm.assert_produces_warning(FutureWarning):
            upper = ser.clip_upper(thresh)

        assert lower[notna(lower)].min() == thresh
        assert upper[notna(upper)].max() == thresh

        # missing entries must stay exactly where they were
        missing = list(isna(ser))
        assert missing == list(isna(lower))
        assert missing == list(isna(upper))
def test_clip_with_na_args(self):
    """NaN passed as a clip bound, either as a scalar or in a list."""
    # GH # 17276
    ser = Series([1, 2, 3])
    unchanged = Series([1, 2, 3])
    tm.assert_series_equal(ser.clip(nan), unchanged)
    tm.assert_series_equal(ser.clip(upper=nan, lower=nan), unchanged)

    # GH #19992
    tm.assert_series_equal(ser.clip(lower=[0, 4, nan]),
                           Series([1, 4, nan]))
    tm.assert_series_equal(ser.clip(upper=[1, nan, 1]),
                           Series([1, nan, 1]))
def test_clip_against_series(self):
    """Clip a Series using other Series as element-wise bounds."""
    # GH #6966
    ser = Series([1.0, 1.0, 4.0])
    threshold = Series([1.0, 2.0, 3.0])

    with tm.assert_produces_warning(FutureWarning):
        tm.assert_series_equal(ser.clip_lower(threshold),
                               Series([1.0, 2.0, 4.0]))
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_series_equal(ser.clip_upper(threshold),
                               Series([1.0, 1.0, 3.0]))

    floor = Series([1.0, 2.0, 3.0])
    ceiling = Series([1.5, 2.5, 3.5])
    tm.assert_series_equal(ser.clip(floor, ceiling),
                           Series([1.0, 2.0, 3.5]))
    tm.assert_series_equal(ser.clip(1.5, ceiling),
                           Series([1.5, 1.5, 3.5]))
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])])
def test_clip_against_list_like(self, inplace, upper):
    """Clip with a list or ndarray upper bound, in place or not."""
    # GH #15390
    original = Series([5, 6, 7])
    returned = original.clip(upper=upper, inplace=inplace)

    # for the in-place call the clipped data is read from `original`
    clipped = original if inplace else returned
    assert_series_equal(clipped, Series([1, 2, 3]), check_exact=True)
def test_clip_with_datetimes(self):
    """Clip a datetime Series with a Timestamp upper bound."""
    # GH 11838
    # naive and tz-aware datetimes
    for zone in (None, 'US/Eastern'):
        bound = Timestamp('2015-12-01 09:30:30', tz=zone)
        ser = Series([Timestamp('2015-12-01 09:30:00', tz=zone),
                      Timestamp('2015-12-01 09:31:00', tz=zone)])
        wanted = Series([Timestamp('2015-12-01 09:30:00', tz=zone),
                         Timestamp('2015-12-01 09:30:30', tz=zone)])
        tm.assert_series_equal(ser.clip(upper=bound), wanted)
def test_cummethods_bool(self):
    """Cumulative methods on boolean Series, with and without NaN."""
    # GH 6270
    a = Series([False, False, False, True, True, False, False])
    b = ~a
    c = Series([False] * len(b))
    d = ~c
    numpy_funcs = {'cumsum': np.cumsum,
                   'cumprod': np.cumprod,
                   'cummin': np.minimum.accumulate,
                   'cummax': np.maximum.accumulate}

    # NaN-free input: each method must match its numpy counterpart
    for ser in (a, b, c, d):
        for method_name in numpy_funcs:
            wanted = Series(numpy_funcs[method_name](ser.values))
            got = getattr(ser, method_name)()
            tm.assert_series_equal(got, wanted)

    # input containing a NaN: compare with hand-written results
    with_nan = Series([False, True, nan, False])
    wanted_by_method = {
        'cumsum': Series([0, 1, nan, 1], dtype=object),
        'cumprod': Series([False, 0, nan, 0]),
        'cummin': Series([False, False, nan, False]),
        'cummax': Series([False, True, nan, True]),
    }
    for method_name in numpy_funcs:
        got = getattr(with_nan, method_name)()
        tm.assert_series_equal(got, wanted_by_method[method_name])
def test_isin(self):
    """``Series.isin`` on a short Series and on a 1.1M-element Series."""
    letters = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])
    wanted = Series([True, False, True, False, False, False, True, True])
    tm.assert_series_equal(letters.isin(['A', 'C']), wanted)

    # GH: 16012
    # This specific issue has to have a series over 1e6 in len, but the
    # comparison array (in_list) must be large enough so that numpy doesn't
    # do a manual masking trick that will avoid this issue altogether
    big = Series(list('abcdefghijk' * 10 ** 5))
    # If numpy doesn't do the manual comparison/mask, these
    # unorderable mixed types are what cause the exception in numpy
    mixed = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E',
             'K', 'E', 'S', 'I', 'R', 'R']
    in_list = mixed * 6

    hits = big.isin(in_list).sum()
    assert hits == 200000
def test_isin_with_string_scalar(self):
    """Passing a bare string to ``isin`` must raise ``TypeError``."""
    # GH4763
    msg = (r"only list-like objects are allowed to be passed to isin\(\),"
           r" you passed a \[str\]")
    cases = [
        (Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']), 'a'),
        (Series(['aaa', 'b', 'c']), 'aaa'),
    ]
    for ser, scalar in cases:
        with pytest.raises(TypeError, match=msg):
            ser.isin(scalar)
def test_isin_with_i8(self):
    """``isin`` on datetime64 and timedelta64 Series."""
    # GH 5021
    first_two = Series([True, True, False, False, False])
    second_only = Series([False, True, False, False, False])

    # datetime64[ns]
    dates = Series(date_range('jan-01-2013', 'jan-05-2013'))

    tm.assert_series_equal(dates.isin(dates[0:2]), first_two)
    tm.assert_series_equal(dates.isin(dates[0:2].values), first_two)

    # fails on dtype conversion in the first place
    as_days = dates[0:2].values.astype('datetime64[D]')
    tm.assert_series_equal(dates.isin(as_days), first_two)

    tm.assert_series_equal(dates.isin([dates[1]]), second_only)
    tm.assert_series_equal(dates.isin([np.datetime64(dates[1])]),
                           second_only)

    tm.assert_series_equal(dates.isin(set(dates[0:2])), first_two)

    # timedelta64[ns]
    deltas = Series(pd.to_timedelta(lrange(5), unit='d'))
    tm.assert_series_equal(deltas.isin(deltas[0:2]), first_two)
@pytest.mark.parametrize("empty", [[], Series(), np.array([])])
def test_isin_empty(self, empty):
    """``isin`` with an empty container matches nothing."""
    # see gh-16991
    ser = Series(["a", "b"])
    all_false = Series([False, False])
    assert_series_equal(all_false, ser.isin(empty))
def test_ptp(self):
    """``Series.ptp`` / ``np.ptp``: values, warnings and error cases."""

    def future_warning():
        # every ptp call in this test is expected to warn
        return tm.assert_produces_warning(FutureWarning,
                                          check_stacklevel=False)

    # GH21614
    N = 1000
    arr = np.random.randn(N)
    ser = Series(arr)
    with future_warning():
        assert np.ptp(ser) == np.ptp(arr)

    # GH11163
    # NOTE(review): nesting reconstructed -- both asserts are assumed to
    # sit inside the warning context.
    s = Series([3, 5, nan, -3, 10])
    with future_warning():
        assert s.ptp() == 13
        assert isna(s.ptp(skipna=False))

    # per-level ptp on a MultiIndex
    mi = MultiIndex.from_product([['a', 'b'], [1, 2, 3]])
    s = Series([1, nan, 7, 3, 5, nan], index=mi)

    wanted = Series([6, 2], index=['a', 'b'], dtype=np.float64)
    with future_warning():
        assert_series_equal(s.ptp(level=0), wanted)

    wanted = Series([nan, nan], index=['a', 'b'])
    with future_warning():
        assert_series_equal(s.ptp(level=0, skipna=False), wanted)

    # invalid axis
    msg = r"No axis named 1 for object type <(class|type) 'type'>"
    with pytest.raises(ValueError, match=msg):
        with future_warning():
            s.ptp(axis=1)

    # strings cannot be subtracted
    s = Series(['a', 'b', 'c', 'd', 'e'])
    msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"
    with pytest.raises(TypeError, match=msg):
        with future_warning():
            s.ptp()

    # numeric_only is not implemented
    msg = r"Series\.ptp does not implement numeric_only\."
    with pytest.raises(NotImplementedError, match=msg):
        with future_warning():
            s.ptp(numeric_only=True)
def test_repeat(self):
    """``Series.repeat`` with a scalar count and with per-element counts."""
    ser = Series(np.random.randn(3), index=['a', 'b', 'c'])

    for counts in (5, [2, 3, 4]):
        wanted = Series(ser.values.repeat(counts),
                        index=ser.index.values.repeat(counts))
        tm.assert_series_equal(ser.repeat(counts), wanted)
def test_numpy_repeat(self):
    """``np.repeat`` on a Series; the ``axis`` keyword is rejected."""
    ser = Series(np.arange(3), name='x')
    wanted = Series(ser.values.repeat(2),
                    name='x',
                    index=ser.index.values.repeat(2))
    tm.assert_series_equal(np.repeat(ser, 2), wanted)

    with pytest.raises(ValueError,
                       match="the 'axis' parameter is not supported"):
        np.repeat(ser, 2, axis=0)
def test_searchsorted(self):
    """``searchsorted`` with a scalar returns a scalar for both sides."""
    ser = Series([1, 2, 3])
    for side, position in (('left', 0), ('right', 1)):
        found = ser.searchsorted(1, side=side)
        assert is_scalar(found)
        assert found == position
def test_searchsorted_numeric_dtypes_scalar(self):
    """Scalar input gives a scalar; a one-element list gives an array."""
    ser = Series([1, 2, 90, 1000, 3e9])

    position = ser.searchsorted(30)
    assert is_scalar(position)
    assert position == 2

    positions = ser.searchsorted([30])
    tm.assert_numpy_array_equal(positions, np.array([2], dtype=np.intp))
def test_searchsorted_numeric_dtypes_vector(self):
    """A list of numeric values gives an ``intp`` array of positions."""
    ser = Series([1, 2, 90, 1000, 3e9])
    positions = ser.searchsorted([91, 2e6])
    tm.assert_numpy_array_equal(positions,
                                np.array([3, 4], dtype=np.intp))
def test_search_sorted_datetime64_scalar(self):
    """A single Timestamp gives a scalar position."""
    ser = Series(date_range('20120101', periods=10, freq='2D'))
    position = ser.searchsorted(Timestamp('20120102'))
    assert is_scalar(position)
    assert position == 1
def test_search_sorted_datetime64_list(self):
    """A list of Timestamps gives an ``intp`` array of positions."""
    ser = Series(date_range('20120101', periods=10, freq='2D'))
    needles = [Timestamp('20120102'), Timestamp('20120104')]
    positions = ser.searchsorted(needles)
    tm.assert_numpy_array_equal(positions,
                                np.array([1, 2], dtype=np.intp))
def test_searchsorted_sorter(self):
    """``searchsorted`` on unsorted data using an ``argsort`` sorter."""
    # GH8490
    ser = Series([3, 1, 2])
    order = np.argsort(ser)
    positions = ser.searchsorted([0, 3], sorter=order)
    tm.assert_numpy_array_equal(positions,
                                np.array([0, 2], dtype=np.intp))
def test_is_monotonic(self):
    """Monotonicity flags on integer and datetime Series."""
    shuffled = Series(np.random.randint(0, 10, size=1000))
    assert not shuffled.is_monotonic

    rising = Series(np.arange(1000))
    assert rising.is_monotonic is True
    assert rising.is_monotonic_increasing is True

    falling = Series(np.arange(1000, 0, -1))
    assert falling.is_monotonic_decreasing is True

    dates = Series(date_range('20130101', periods=10))
    assert dates.is_monotonic is True
    assert dates.is_monotonic_increasing is True

    dates_reversed = Series(list(reversed(dates.tolist())))
    assert dates_reversed.is_monotonic is False
    assert dates_reversed.is_monotonic_decreasing is True
def test_sort_index_level(self):
    """``sort_index(level=...)`` with and without ``sort_remaining``."""
    mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
    ser = Series([1, 2], mi)
    swapped = ser.iloc[[1, 0]]

    for level in ('A', ['A', 'B']):
        # default: the two rows are expected to come back swapped
        tm.assert_series_equal(swapped, ser.sort_index(level=level))

    for level in ('A', ['A', 'B']):
        # sort_remaining=False: the original order is expected
        kept = ser.sort_index(level=level, sort_remaining=False)
        tm.assert_series_equal(ser, kept)
- def test_apply_categorical(self):
- values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
- ordered=True)
- s = pd.Series(values, name='XX', index=list('abcdefg'))
- result = s.apply(lambda x: x.lower())
- # should be categorical dtype when the number of categories are
- # the same
- values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
- ordered=True)
- exp = pd.Series(values, name='XX', index=list('abcdefg'))
- tm.assert_series_equal(result, exp)
- tm.assert_categorical_equal(result.values, exp.values)
- result = s.apply(lambda x: 'A')
- exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
- tm.assert_series_equal(result, exp)
- assert result.dtype == np.object
def test_shift_int(self, datetime_series):
    """Shifting integer data matches shifting the float-cast data."""
    as_int = datetime_series.astype(int)
    result = as_int.shift(1)
    # Expected result: cast to float first, then shift.
    assert_series_equal(result, as_int.astype(float).shift(1))
- def test_shift_categorical(self):
- # GH 9416
- s = pd.Series(['a', 'b', 'c', 'd'], dtype='category')
- assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna())
- sp1 = s.shift(1)
- assert_index_equal(s.index, sp1.index)
- assert np.all(sp1.values.codes[:1] == -1)
- assert np.all(s.values.codes[:-1] == sp1.values.codes[1:])
- sn2 = s.shift(-2)
- assert_index_equal(s.index, sn2.index)
- assert np.all(sn2.values.codes[-2:] == -1)
- assert np.all(s.values.codes[2:] == sn2.values.codes[:-2])
- assert_index_equal(s.values.categories, sp1.values.categories)
- assert_index_equal(s.values.categories, sn2.values.categories)
- def test_unstack(self):
- from numpy import nan
- index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']],
- codes=[[1, 1, 0, 0], [0, 1, 0, 2]])
- s = Series(np.arange(4.), index=index)
- unstacked = s.unstack()
- expected = DataFrame([[2., nan, 3.], [0., 1., nan]],
- index=['bar', 'foo'],
- columns=['one', 'three', 'two'])
- assert_frame_equal(unstacked, expected)
- unstacked = s.unstack(level=0)
- assert_frame_equal(unstacked, expected.T)
- index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
- codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
- [0, 1, 0, 1, 0, 1]])
- s = Series(np.random.randn(6), index=index)
- exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]],
- codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
- expected = DataFrame({'bar': s.values},
- index=exp_index).sort_index(level=0)
- unstacked = s.unstack(0).sort_index()
- assert_frame_equal(unstacked, expected)
- # GH5873
- idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]])
- ts = pd.Series([1, 2], index=idx)
- left = ts.unstack()
- right = DataFrame([[nan, 1], [2, nan]], index=[101, 102],
- columns=[nan, 3.5])
- assert_frame_equal(left, right)
- idx = pd.MultiIndex.from_arrays([['cat', 'cat', 'cat', 'dog', 'dog'
- ], ['a', 'a', 'b', 'a', 'b'],
- [1, 2, 1, 1, np.nan]])
- ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx)
- right = DataFrame([[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]],
- columns=['cat', 'dog'])
- tpls = [('a', 1), ('a', 2), ('b', nan), ('b', 1)]
- right.index = pd.MultiIndex.from_tuples(tpls)
- assert_frame_equal(ts.unstack(level=0), right)
- def test_value_counts_datetime(self):
- # most dtypes are tested in test_base.py
- values = [pd.Timestamp('2011-01-01 09:00'),
- pd.Timestamp('2011-01-01 10:00'),
- pd.Timestamp('2011-01-01 11:00'),
- pd.Timestamp('2011-01-01 09:00'),
- pd.Timestamp('2011-01-01 09:00'),
- pd.Timestamp('2011-01-01 11:00')]
- exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00',
- '2011-01-01 10:00'])
- exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
- s = pd.Series(values, name='xxx')
- tm.assert_series_equal(s.value_counts(), exp)
- # check DatetimeIndex outputs the same result
- idx = pd.DatetimeIndex(values, name='xxx')
- tm.assert_series_equal(idx.value_counts(), exp)
- # normalize
- exp = pd.Series(np.array([3., 2., 1]) / 6.,
- index=exp_idx, name='xxx')
- tm.assert_series_equal(s.value_counts(normalize=True), exp)
- tm.assert_series_equal(idx.value_counts(normalize=True), exp)
- def test_value_counts_datetime_tz(self):
- values = [pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
- pd.Timestamp('2011-01-01 10:00', tz='US/Eastern'),
- pd.Timestamp('2011-01-01 11:00', tz='US/Eastern'),
- pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
- pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
- pd.Timestamp('2011-01-01 11:00', tz='US/Eastern')]
- exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00',
- '2011-01-01 10:00'], tz='US/Eastern')
- exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
- s = pd.Series(values, name='xxx')
- tm.assert_series_equal(s.value_counts(), exp)
- idx = pd.DatetimeIndex(values, name='xxx')
- tm.assert_series_equal(idx.value_counts(), exp)
- exp = pd.Series(np.array([3., 2., 1]) / 6.,
- index=exp_idx, name='xxx')
- tm.assert_series_equal(s.value_counts(normalize=True), exp)
- tm.assert_series_equal(idx.value_counts(normalize=True), exp)
- def test_value_counts_period(self):
- values = [pd.Period('2011-01', freq='M'),
- pd.Period('2011-02', freq='M'),
- pd.Period('2011-03', freq='M'),
- pd.Period('2011-01', freq='M'),
- pd.Period('2011-01', freq='M'),
- pd.Period('2011-03', freq='M')]
- exp_idx = pd.PeriodIndex(['2011-01', '2011-03', '2011-02'], freq='M')
- exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
- s = pd.Series(values, name='xxx')
- tm.assert_series_equal(s.value_counts(), exp)
- # check DatetimeIndex outputs the same result
- idx = pd.PeriodIndex(values, name='xxx')
- tm.assert_series_equal(idx.value_counts(), exp)
- # normalize
- exp = pd.Series(np.array([3., 2., 1]) / 6.,
- index=exp_idx, name='xxx')
- tm.assert_series_equal(s.value_counts(normalize=True), exp)
- tm.assert_series_equal(idx.value_counts(normalize=True), exp)
- def test_value_counts_categorical_ordered(self):
- # most dtypes are tested in test_base.py
- values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True)
- exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3],
- ordered=True)
- exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
- s = pd.Series(values, name='xxx')
- tm.assert_series_equal(s.value_counts(), exp)
- # check CategoricalIndex outputs the same result
- idx = pd.CategoricalIndex(values, name='xxx')
- tm.assert_series_equal(idx.value_counts(), exp)
- # normalize
- exp = pd.Series(np.array([3., 2., 1]) / 6.,
- index=exp_idx, name='xxx')
- tm.assert_series_equal(s.value_counts(normalize=True), exp)
- tm.assert_series_equal(idx.value_counts(normalize=True), exp)
- def test_value_counts_categorical_not_ordered(self):
- values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False)
- exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3],
- ordered=False)
- exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')
- s = pd.Series(values, name='xxx')
- tm.assert_series_equal(s.value_counts(), exp)
- # check CategoricalIndex outputs the same result
- idx = pd.CategoricalIndex(values, name='xxx')
- tm.assert_series_equal(idx.value_counts(), exp)
- # normalize
- exp = pd.Series(np.array([3., 2., 1]) / 6.,
- index=exp_idx, name='xxx')
- tm.assert_series_equal(s.value_counts(normalize=True), exp)
- tm.assert_series_equal(idx.value_counts(normalize=True), exp)
@pytest.mark.parametrize("func", [np.any, np.all])
@pytest.mark.parametrize("kwargs", [
    dict(keepdims=True),
    dict(out=object()),
])
@td.skip_if_np_lt_115
def test_validate_any_all_out_keepdims_raises(self, kwargs, func):
    """np.any / np.all on a Series reject ``keepdims`` and ``out``."""
    ser = pd.Series([1, 2])
    # Each parametrized dict holds a single key: the rejected argument.
    rejected = next(iter(kwargs))
    template = (r"the '{arg}' parameter is not "
                r"supported in the pandas "
                r"implementation of {fname}\(\)")
    msg = template.format(arg=rejected, fname=func.__name__)
    with pytest.raises(ValueError, match=msg):
        func(ser, **kwargs)
@td.skip_if_np_lt_115
def test_validate_sum_initial(self):
    """np.sum on a Series rejects the ``initial`` argument."""
    ser = pd.Series([1, 2])
    expected_message = (r"the 'initial' parameter is not "
                        r"supported in the pandas "
                        r"implementation of sum\(\)")
    with pytest.raises(ValueError, match=expected_message):
        np.sum(ser, initial=10)
def test_validate_median_initial(self):
    """Series.median rejects the ``overwrite_input`` argument."""
    ser = pd.Series([1, 2])
    expected_message = (r"the 'overwrite_input' parameter is not "
                        r"supported in the pandas "
                        r"implementation of median\(\)")
    with pytest.raises(ValueError, match=expected_message):
        # It seems like np.median doesn't dispatch, so we use the
        # method instead of the ufunc.
        ser.median(overwrite_input=True)
@td.skip_if_np_lt_115
def test_validate_stat_keepdims(self):
    """np.sum on a Series rejects the ``keepdims`` argument."""
    ser = pd.Series([1, 2])
    expected_message = (r"the 'keepdims' parameter is not "
                        r"supported in the pandas "
                        r"implementation of sum\(\)")
    with pytest.raises(ValueError, match=expected_message):
        np.sum(ser, keepdims=True)
# Column names of the ``s_main_dtypes`` frame; also used as the params
# of the ``s_main_dtypes_split`` fixture.
main_dtypes = [
    'datetime', 'datetimetz', 'timedelta',
    'int8', 'int16', 'int32', 'int64',
    'float32', 'float64',
    'uint8', 'uint16', 'uint32', 'uint64',
]
@pytest.fixture
def s_main_dtypes():
    """Build a five-row DataFrame with one column per dtype.

    Columns, each named after its dtype:

    * datetime
    * datetimetz
    * timedelta
    * [u]int{8,16,32,64}
    * float{32,64}
    """
    years = ['2003', '2002',
             '2001', '2002',
             '2005']
    frame = pd.DataFrame({
        'datetime': pd.to_datetime(years),
        'datetimetz': pd.to_datetime(years).tz_localize('US/Eastern'),
        'timedelta': pd.to_timedelta(['3d', '2d', '1d',
                                      '2d', '5d']),
    })

    # The numeric columns all hold the same values, cast to each dtype.
    numeric_dtypes = ['int8', 'int16', 'int32', 'int64',
                      'float32', 'float64',
                      'uint8', 'uint16', 'uint32', 'uint64']
    for name in numeric_dtypes:
        frame[name] = Series([3, 2, 1, 2, 5], dtype=name)

    return frame
@pytest.fixture(params=main_dtypes)
def s_main_dtypes_split(request, s_main_dtypes):
    """Return one column of ``s_main_dtypes`` per parametrized dtype name."""
    column = request.param
    return s_main_dtypes[column]
def assert_check_nselect_boundary(vals, dtype, method):
    """Assert that ``method`` (nsmallest / nlargest) picks 3 boundary values.

    Helper for the 'test_boundary_{dtype}' tests: builds a Series from
    ``vals`` with ``dtype``, calls ``method(3)`` on it and compares the
    result with the rows at the expected labels.
    """
    ser = Series(vals, dtype=dtype)
    select = getattr(ser, method)
    result = select(3)

    # nsmallest expects the first three labels; anything else is treated
    # as nlargest and expects labels 3, 2, 1.
    if method == 'nsmallest':
        labels = [0, 1, 2]
    else:
        labels = [3, 2, 1]

    tm.assert_series_equal(result, ser.loc[labels])
class TestNLargestNSmallest(object):
    """Tests for ``Series.nlargest`` and ``Series.nsmallest``."""

    @pytest.mark.parametrize(
        "r", [Series([3., 2, 1, 2, '5'], dtype='object'),
              Series([3., 2, 1, 2, 5], dtype='object'),
              # complex256 is left out: not supported on some archs
              Series([3., 2, 1, 2, 5], dtype='complex128'),
              Series(list('abcde')),
              Series(list('abcde'), dtype='category')])
    def test_error(self, r):
        """Unsupported dtypes raise TypeError for every ``n`` tried."""
        msg = ("Cannot use method 'n(larg|small)est' with "
               "dtype {dt}".format(dt=r.dtype))
        sizes = (2, len(r), 0, -1)
        for method in (r.nlargest, r.nsmallest):
            for n in sizes:
                with pytest.raises(TypeError, match=msg):
                    method(n)

    def test_nsmallest_nlargest(self, s_main_dtypes_split):
        """Check selection order, ``keep='last'`` and out-of-range ``n``."""
        ser = s_main_dtypes_split
        size = len(ser)

        assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]])
        assert_series_equal(ser.nsmallest(2, keep='last'), ser.iloc[[2, 3]])

        # Zero or negative n selects nothing.
        empty = ser.iloc[0:0]
        for select in (ser.nsmallest, ser.nlargest):
            for n in (0, -1):
                assert_series_equal(select(n), empty)

        # n at or beyond the length returns everything, fully ordered.
        for n in (size, size + 1):
            assert_series_equal(ser.nsmallest(n), ser.sort_values())
        for n in (size, size + 1):
            assert_series_equal(ser.nlargest(n), ser.iloc[[4, 0, 1, 3, 2]])

    def test_misc(self):
        """NaN input, invalid ``keep`` and all-equal values."""
        ser = Series([3., np.nan, 1, 2, 5])

        # The NaN row (position 1) is absent from both expected results.
        assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]])
        assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]])

        msg = 'keep must be either "first", "last"'
        for select in (ser.nsmallest, ser.nlargest):
            with pytest.raises(ValueError, match=msg):
                select(keep='invalid')

        # GH 15297
        ser = Series([1] * 5, index=[1, 2, 3, 4, 5])
        expected_first = Series([1] * 3, index=[1, 2, 3])
        expected_last = Series([1] * 3, index=[5, 4, 3])
        # With every value equal, only ``keep`` decides which labels win.
        for select in (ser.nsmallest, ser.nlargest):
            assert_series_equal(select(3), expected_first)
            assert_series_equal(select(3, keep='last'), expected_last)

    @pytest.mark.parametrize('n', range(1, 5))
    def test_n(self, n):
        """Results match sort_values().head(n) with a duplicated index."""
        # GH 13412
        ser = Series([1, 4, 3, 2], index=[0, 0, 1, 1])

        largest = ser.nlargest(n)
        assert_series_equal(largest,
                            ser.sort_values(ascending=False).head(n))

        smallest = ser.nsmallest(n)
        assert_series_equal(smallest, ser.sort_values().head(n))

    def test_boundary_integer(self, nselect_method, any_int_dtype):
        """Selection works at the min/max of each integer dtype."""
        # GH 21426
        limits = np.iinfo(any_int_dtype)
        low, high = limits.min, limits.max
        vals = [low, low + 1, high - 1, high]
        assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)

    def test_boundary_float(self, nselect_method, float_dtype):
        """Selection works at the min/max of each float dtype."""
        # GH 21426
        limits = np.finfo(float_dtype)
        low, high = limits.min, limits.max
        # Step each extreme one representable value towards zero.
        low_next, high_next = np.nextafter(
            [low, high], 0, dtype=float_dtype)
        vals = [low, low_next, high_next, high]
        assert_check_nselect_boundary(vals, float_dtype, nselect_method)

    @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
    def test_boundary_datetimelike(self, nselect_method, dtype):
        """Selection works at the int64 bounds of datetimelike dtypes."""
        # GH 21426
        # use int64 bounds and +1 to min_val since true minimum is NaT
        # (include min_val/NaT at end to maintain same expected_idxr)
        limits = np.iinfo('int64')
        low, high = limits.min, limits.max
        vals = [low + 1, low + 2, high - 1, high, low]
        assert_check_nselect_boundary(vals, dtype, nselect_method)

    def test_duplicate_keep_all_ties(self):
        """``keep='all'`` returns every row tied with the last selected."""
        # see gh-16818
        ser = Series([10, 9, 8, 7, 7, 7, 7, 6])

        top = ser.nlargest(4, keep='all')
        assert_series_equal(top, Series([10, 9, 8, 7, 7, 7, 7]))

        bottom = ser.nsmallest(2, keep='all')
        assert_series_equal(
            bottom, Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]))
class TestCategoricalSeriesAnalytics(object):
    """Analytics tests (count, value_counts, duplicates) on categorical Series."""

    @staticmethod
    def _check_duplicate_handling(ser, cases):
        """Assert ``duplicated`` / ``drop_duplicates`` agree with each case.

        Parameters
        ----------
        ser : Series
            The Series under test; it is never modified.
        cases : iterable of (dict, list of bool)
            Keyword arguments forwarded to ``duplicated`` and
            ``drop_duplicates``, paired with the duplicate flags expected
            for them.
        """
        for kwargs, flags in cases:
            expected = Series(flags)
            tm.assert_series_equal(ser.duplicated(**kwargs), expected)
            tm.assert_series_equal(ser.drop_duplicates(**kwargs),
                                   ser[~expected])
            # The in-place variant runs on a copy and must match too.
            sc = ser.copy()
            sc.drop_duplicates(inplace=True, **kwargs)
            tm.assert_series_equal(sc, ser[~expected])

    def test_count(self):
        """``count`` returns the number of non-missing values."""
        s = Series(Categorical([np.nan, 1, 2, np.nan],
                               categories=[5, 4, 3, 2, 1], ordered=True))
        result = s.count()
        assert result == 2

    def test_value_counts(self):
        """value_counts keeps unused categories and the Series name."""
        # GH 12835
        cats = Categorical(list('abcccb'), categories=list('cabd'))
        s = Series(cats, name='xxx')

        # Unsorted: the result follows category order, unused 'd' included.
        res = s.value_counts(sort=False)
        exp_index = CategoricalIndex(list('cabd'), categories=cats.categories)
        exp = Series([3, 1, 2, 0], name='xxx', index=exp_index)
        tm.assert_series_equal(res, exp)

        # Sorted: ordered by descending count.
        res = s.value_counts(sort=True)
        exp_index = CategoricalIndex(list('cbad'), categories=cats.categories)
        exp = Series([3, 2, 1, 0], name='xxx', index=exp_index)
        tm.assert_series_equal(res, exp)

        # check object dtype handles the Series.name as the same
        # (tested in test_base.py)
        s = Series(["a", "b", "c", "c", "c", "b"], name='xxx')
        res = s.value_counts()
        exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"])
        tm.assert_series_equal(res, exp)

    def test_value_counts_with_nan(self):
        """value_counts with ``dropna`` on categoricals with missing values."""
        # see gh-9443

        # sanity check: with no missing values present, both settings of
        # ``dropna`` must give the same counts.
        s = Series(["a", "b", "a"], dtype="category")
        exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
        for dropna in (True, False):
            res = s.value_counts(dropna=dropna)
            tm.assert_series_equal(res, exp)

        # same Series via two different constructions --> same behaviour
        series = [
            Series(["a", "b", None, "a", None, None], dtype="category"),
            Series(Categorical(["a", "b", None, "a", None, None],
                               categories=["a", "b"]))
        ]

        for s in series:
            # None is a NaN value, so we exclude its count here
            exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
            res = s.value_counts(dropna=True)
            tm.assert_series_equal(res, exp)

            # we don't exclude the count of None and sort by counts
            exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
            res = s.value_counts(dropna=False)
            tm.assert_series_equal(res, exp)

            # When we aren't sorting by counts, and np.nan isn't a
            # category, it should be last.
            exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
            res = s.value_counts(dropna=False, sort=False)
            tm.assert_series_equal(res, exp)

    @pytest.mark.parametrize(
        "dtype",
        ["int_", "uint", "float_", "unicode_", "timedelta64[h]",
         pytest.param("datetime64[D]",
                      marks=pytest.mark.xfail(reason="GH#7996"))]
    )
    @pytest.mark.parametrize("is_ordered", [True, False])
    def test_drop_duplicates_categorical_non_bool(self, dtype, is_ordered):
        """Duplicate handling on categoricals built from non-bool dtypes."""
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        # Test case 1
        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
        tc1 = Series(Categorical(input1, categories=cat_array,
                                 ordered=is_ordered))
        self._check_duplicate_handling(tc1, [
            ({}, [False, False, False, True]),
            ({'keep': 'last'}, [False, False, True, False]),
            ({'keep': False}, [False, False, True, True]),
        ])

        # Test case 2
        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
        tc2 = Series(Categorical(
            input2, categories=cat_array, ordered=is_ordered)
        )
        self._check_duplicate_handling(tc2, [
            ({}, [False, False, False, False, True, True, False]),
            ({'keep': 'last'},
             [False, True, True, False, False, False, False]),
            ({'keep': False},
             [False, True, True, False, True, True, False]),
        ])

    @pytest.mark.parametrize("is_ordered", [True, False])
    def test_drop_duplicates_categorical_bool(self, is_ordered):
        """Duplicate handling on a categorical built from booleans."""
        tc = Series(Categorical([True, False, True, False],
                                categories=[True, False], ordered=is_ordered))
        self._check_duplicate_handling(tc, [
            ({}, [False, False, True, True]),
            ({'keep': 'last'}, [True, True, False, False]),
            ({'keep': False}, [True, True, True, True]),
        ])
|