# NOTE: extraction artifact removed here (file-size banner and a concatenated
# line-number gutter from the original rendering; no source content was lost).
  1. # coding=utf-8
  2. # pylint: disable-msg=E1101,W0612
  3. from distutils.version import LooseVersion
  4. from itertools import product
  5. import operator
  6. import numpy as np
  7. from numpy import nan
  8. import pytest
  9. from pandas.compat import PY35, lrange, range
  10. import pandas.util._test_decorators as td
  11. import pandas as pd
  12. from pandas import (
  13. Categorical, CategoricalIndex, DataFrame, Series, compat, date_range, isna,
  14. notna)
  15. from pandas.api.types import is_scalar
  16. from pandas.core.index import MultiIndex
  17. from pandas.core.indexes.datetimes import Timestamp
  18. import pandas.util.testing as tm
  19. from pandas.util.testing import (
  20. assert_almost_equal, assert_frame_equal, assert_index_equal,
  21. assert_series_equal)
  22. class TestSeriesAnalytics(object):
  23. def test_describe(self):
  24. s = Series([0, 1, 2, 3, 4], name='int_data')
  25. result = s.describe()
  26. expected = Series([5, 2, s.std(), 0, 1, 2, 3, 4],
  27. name='int_data',
  28. index=['count', 'mean', 'std', 'min', '25%',
  29. '50%', '75%', 'max'])
  30. tm.assert_series_equal(result, expected)
  31. s = Series([True, True, False, False, False], name='bool_data')
  32. result = s.describe()
  33. expected = Series([5, 2, False, 3], name='bool_data',
  34. index=['count', 'unique', 'top', 'freq'])
  35. tm.assert_series_equal(result, expected)
  36. s = Series(['a', 'a', 'b', 'c', 'd'], name='str_data')
  37. result = s.describe()
  38. expected = Series([5, 4, 'a', 2], name='str_data',
  39. index=['count', 'unique', 'top', 'freq'])
  40. tm.assert_series_equal(result, expected)
  41. def test_describe_with_tz(self, tz_naive_fixture):
  42. # GH 21332
  43. tz = tz_naive_fixture
  44. name = str(tz_naive_fixture)
  45. start = Timestamp(2018, 1, 1)
  46. end = Timestamp(2018, 1, 5)
  47. s = Series(date_range(start, end, tz=tz), name=name)
  48. result = s.describe()
  49. expected = Series(
  50. [5, 5, s.value_counts().index[0], 1, start.tz_localize(tz),
  51. end.tz_localize(tz)
  52. ],
  53. name=name,
  54. index=['count', 'unique', 'top', 'freq', 'first', 'last']
  55. )
  56. tm.assert_series_equal(result, expected)
    def test_argsort(self, datetime_series):
        """argsort should follow numpy semantics and mark NaT/NaN with -1."""
        self._check_accum_op('argsort', datetime_series, check_dtype=False)
        argsorted = datetime_series.argsort()
        assert issubclass(argsorted.dtype.type, np.integer)

        # GH 2967 (introduced bug in 0.11-dev I think)
        s = Series([Timestamp('201301%02d' % (i + 1)) for i in range(5)])
        assert s.dtype == 'datetime64[ns]'
        shifted = s.shift(-1)
        assert shifted.dtype == 'datetime64[ns]'
        assert isna(shifted[4])

        # Values are already ascending, so argsort is the identity.
        result = s.argsort()
        expected = Series(lrange(5), dtype='int64')
        assert_series_equal(result, expected)

        # The trailing NaT produced by shift gets the sentinel position -1.
        result = shifted.argsort()
        expected = Series(lrange(4) + [-1], dtype='int64')
        assert_series_equal(result, expected)
  73. def test_argsort_stable(self):
  74. s = Series(np.random.randint(0, 100, size=10000))
  75. mindexer = s.argsort(kind='mergesort')
  76. qindexer = s.argsort()
  77. mexpected = np.argsort(s.values, kind='mergesort')
  78. qexpected = np.argsort(s.values, kind='quicksort')
  79. tm.assert_series_equal(mindexer, Series(mexpected),
  80. check_dtype=False)
  81. tm.assert_series_equal(qindexer, Series(qexpected),
  82. check_dtype=False)
  83. msg = (r"ndarray Expected type <(class|type) 'numpy\.ndarray'>,"
  84. r" found <class 'pandas\.core\.series\.Series'> instead")
  85. with pytest.raises(AssertionError, match=msg):
  86. tm.assert_numpy_array_equal(qindexer, mindexer)
    def test_cumsum(self, datetime_series):
        # Delegates to the shared accumulation-op checker (_check_accum_op).
        self._check_accum_op('cumsum', datetime_series)
    def test_cumprod(self, datetime_series):
        # Delegates to the shared accumulation-op checker (_check_accum_op).
        self._check_accum_op('cumprod', datetime_series)
  91. def test_cummin(self, datetime_series):
  92. tm.assert_numpy_array_equal(datetime_series.cummin().values,
  93. np.minimum
  94. .accumulate(np.array(datetime_series)))
  95. ts = datetime_series.copy()
  96. ts[::2] = np.NaN
  97. result = ts.cummin()[1::2]
  98. expected = np.minimum.accumulate(ts.dropna())
  99. tm.assert_series_equal(result, expected)
  100. def test_cummax(self, datetime_series):
  101. tm.assert_numpy_array_equal(datetime_series.cummax().values,
  102. np.maximum
  103. .accumulate(np.array(datetime_series)))
  104. ts = datetime_series.copy()
  105. ts[::2] = np.NaN
  106. result = ts.cummax()[1::2]
  107. expected = np.maximum.accumulate(ts.dropna())
  108. tm.assert_series_equal(result, expected)
  109. def test_cummin_datetime64(self):
  110. s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1',
  111. 'NaT', '2000-1-3']))
  112. expected = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT',
  113. '2000-1-1', 'NaT', '2000-1-1']))
  114. result = s.cummin(skipna=True)
  115. tm.assert_series_equal(expected, result)
  116. expected = pd.Series(pd.to_datetime(
  117. ['NaT', '2000-1-2', '2000-1-2', '2000-1-1', '2000-1-1', '2000-1-1'
  118. ]))
  119. result = s.cummin(skipna=False)
  120. tm.assert_series_equal(expected, result)
  121. def test_cummax_datetime64(self):
  122. s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1',
  123. 'NaT', '2000-1-3']))
  124. expected = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT',
  125. '2000-1-2', 'NaT', '2000-1-3']))
  126. result = s.cummax(skipna=True)
  127. tm.assert_series_equal(expected, result)
  128. expected = pd.Series(pd.to_datetime(
  129. ['NaT', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-3'
  130. ]))
  131. result = s.cummax(skipna=False)
  132. tm.assert_series_equal(expected, result)
  133. def test_cummin_timedelta64(self):
  134. s = pd.Series(pd.to_timedelta(['NaT',
  135. '2 min',
  136. 'NaT',
  137. '1 min',
  138. 'NaT',
  139. '3 min', ]))
  140. expected = pd.Series(pd.to_timedelta(['NaT',
  141. '2 min',
  142. 'NaT',
  143. '1 min',
  144. 'NaT',
  145. '1 min', ]))
  146. result = s.cummin(skipna=True)
  147. tm.assert_series_equal(expected, result)
  148. expected = pd.Series(pd.to_timedelta(['NaT',
  149. '2 min',
  150. '2 min',
  151. '1 min',
  152. '1 min',
  153. '1 min', ]))
  154. result = s.cummin(skipna=False)
  155. tm.assert_series_equal(expected, result)
  156. def test_cummax_timedelta64(self):
  157. s = pd.Series(pd.to_timedelta(['NaT',
  158. '2 min',
  159. 'NaT',
  160. '1 min',
  161. 'NaT',
  162. '3 min', ]))
  163. expected = pd.Series(pd.to_timedelta(['NaT',
  164. '2 min',
  165. 'NaT',
  166. '2 min',
  167. 'NaT',
  168. '3 min', ]))
  169. result = s.cummax(skipna=True)
  170. tm.assert_series_equal(expected, result)
  171. expected = pd.Series(pd.to_timedelta(['NaT',
  172. '2 min',
  173. '2 min',
  174. '2 min',
  175. '2 min',
  176. '3 min', ]))
  177. result = s.cummax(skipna=False)
  178. tm.assert_series_equal(expected, result)
    def test_npdiff(self):
        """np.diff on a Series returns an ndarray now, so this is skipped."""
        pytest.skip("skipping due to Series no longer being an "
                    "ndarray")

        # no longer works as the return type of np.diff is now nd.array
        s = Series(np.arange(5))
        r = np.diff(s)
        assert_series_equal(Series([nan, 0, 0, 0, nan]), r)
    def _check_accum_op(self, name, datetime_series_, check_dtype=True):
        """Check that Series accumulation op `name` matches its numpy twin.

        Verifies both the dense case and the NaN-masked case (every other
        element missing), where the Series op is expected to skip NaNs.
        """
        func = getattr(np, name)
        tm.assert_numpy_array_equal(func(datetime_series_).values,
                                    func(np.array(datetime_series_)),
                                    check_dtype=check_dtype)

        # with missing values
        ts = datetime_series_.copy()
        ts[::2] = np.NaN

        # Accumulation skips NaN: the odd positions must equal the
        # accumulation over the non-NaN values alone.
        result = func(ts)[1::2]
        expected = func(np.array(ts.dropna()))
        tm.assert_numpy_array_equal(result.values, expected,
                                    check_dtype=False)
  198. def test_compress(self):
  199. cond = [True, False, True, False, False]
  200. s = Series([1, -1, 5, 8, 7],
  201. index=list('abcde'), name='foo')
  202. expected = Series(s.values.compress(cond),
  203. index=list('ac'), name='foo')
  204. with tm.assert_produces_warning(FutureWarning):
  205. result = s.compress(cond)
  206. tm.assert_series_equal(result, expected)
  207. def test_numpy_compress(self):
  208. cond = [True, False, True, False, False]
  209. s = Series([1, -1, 5, 8, 7],
  210. index=list('abcde'), name='foo')
  211. expected = Series(s.values.compress(cond),
  212. index=list('ac'), name='foo')
  213. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  214. tm.assert_series_equal(np.compress(cond, s), expected)
  215. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  216. msg = "the 'axis' parameter is not supported"
  217. with pytest.raises(ValueError, match=msg):
  218. np.compress(cond, s, axis=1)
  219. msg = "the 'out' parameter is not supported"
  220. with pytest.raises(ValueError, match=msg):
  221. np.compress(cond, s, out=s)
  222. def test_round(self, datetime_series):
  223. datetime_series.index.name = "index_name"
  224. result = datetime_series.round(2)
  225. expected = Series(np.round(datetime_series.values, 2),
  226. index=datetime_series.index, name='ts')
  227. assert_series_equal(result, expected)
  228. assert result.name == datetime_series.name
  229. def test_numpy_round(self):
  230. # See gh-12600
  231. s = Series([1.53, 1.36, 0.06])
  232. out = np.round(s, decimals=0)
  233. expected = Series([2., 1., 0.])
  234. assert_series_equal(out, expected)
  235. msg = "the 'out' parameter is not supported"
  236. with pytest.raises(ValueError, match=msg):
  237. np.round(s, decimals=0, out=s)
    def test_built_in_round(self):
        """Built-in round() delegates to Series.__round__ on Python 3."""
        if not compat.PY3:
            pytest.skip(
                'build in round cannot be overridden prior to Python 3')

        s = Series([1.123, 2.123, 3.123], index=lrange(3))

        # round() with no ndigits rounds to whole numbers
        result = round(s)
        expected_rounded0 = Series([1., 2., 3.], index=lrange(3))
        tm.assert_series_equal(result, expected_rounded0)

        # round() with explicit ndigits
        decimals = 2
        expected_rounded = Series([1.12, 2.12, 3.12], index=lrange(3))
        result = round(s, decimals)
        tm.assert_series_equal(result, expected_rounded)
  250. def test_prod_numpy16_bug(self):
  251. s = Series([1., 1., 1.], index=lrange(3))
  252. result = s.prod()
  253. assert not isinstance(result, Series)
    @td.skip_if_no_scipy
    def test_corr(self, datetime_series):
        """Pearson correlation: overlap handling, NaN cases, scipy parity."""
        import scipy.stats as stats

        # full overlap
        tm.assert_almost_equal(datetime_series.corr(datetime_series), 1)

        # partial overlap
        tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]),
                               1)

        # too few overlapping observations for min_periods -> NaN
        assert isna(datetime_series[:15].corr(datetime_series[5:],
                                              min_periods=12))

        ts1 = datetime_series[:15].reindex(datetime_series.index)
        ts2 = datetime_series[5:].reindex(datetime_series.index)
        assert isna(ts1.corr(ts2, min_periods=12))

        # No overlap
        assert np.isnan(datetime_series[::2].corr(datetime_series[1::2]))

        # all NA
        cp = datetime_series[:10].copy()
        cp[:] = np.nan
        assert isna(cp.corr(cp))

        # result must agree with scipy's pearsonr
        A = tm.makeTimeSeries()
        B = tm.makeTimeSeries()
        result = A.corr(B)
        expected, _ = stats.pearsonr(A, B)
        tm.assert_almost_equal(result, expected)
    @td.skip_if_no_scipy
    def test_corr_rank(self):
        """Rank correlations (kendall/spearman) agree with scipy and R."""
        import scipy
        import scipy.stats as stats

        # kendall and spearman
        A = tm.makeTimeSeries()
        B = tm.makeTimeSeries()
        A[-5:] = A[:5]  # introduce ties so the rank methods are exercised
        result = A.corr(B, method='kendall')
        expected = stats.kendalltau(A, B)[0]
        tm.assert_almost_equal(result, expected)

        result = A.corr(B, method='spearman')
        expected = stats.spearmanr(A, B)[0]
        tm.assert_almost_equal(result, expected)

        # these methods got rewritten in 0.8
        if LooseVersion(scipy.__version__) < LooseVersion('0.9'):
            pytest.skip("skipping corr rank because of scipy version "
                        "{0}".format(scipy.__version__))

        # results from R
        A = Series(
            [-0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310,
             -0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606])
        B = Series(
            [-1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292,
             1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375])
        kexp = 0.4319297
        sexp = 0.5853767
        tm.assert_almost_equal(A.corr(B, method='kendall'), kexp)
        tm.assert_almost_equal(A.corr(B, method='spearman'), sexp)
  307. def test_corr_invalid_method(self):
  308. # GH PR #22298
  309. s1 = pd.Series(np.random.randn(10))
  310. s2 = pd.Series(np.random.randn(10))
  311. msg = ("method must be either 'pearson', 'spearman', "
  312. "or 'kendall'")
  313. with pytest.raises(ValueError, match=msg):
  314. s1.corr(s2, method="____")
  315. def test_corr_callable_method(self, datetime_series):
  316. # simple correlation example
  317. # returns 1 if exact equality, 0 otherwise
  318. my_corr = lambda a, b: 1. if (a == b).all() else 0.
  319. # simple example
  320. s1 = Series([1, 2, 3, 4, 5])
  321. s2 = Series([5, 4, 3, 2, 1])
  322. expected = 0
  323. tm.assert_almost_equal(
  324. s1.corr(s2, method=my_corr),
  325. expected)
  326. # full overlap
  327. tm.assert_almost_equal(datetime_series.corr(
  328. datetime_series, method=my_corr), 1.)
  329. # partial overlap
  330. tm.assert_almost_equal(datetime_series[:15].corr(
  331. datetime_series[5:], method=my_corr), 1.)
  332. # No overlap
  333. assert np.isnan(datetime_series[::2].corr(
  334. datetime_series[1::2], method=my_corr))
  335. # dataframe example
  336. df = pd.DataFrame([s1, s2])
  337. expected = pd.DataFrame([
  338. {0: 1., 1: 0}, {0: 0, 1: 1.}])
  339. tm.assert_almost_equal(
  340. df.transpose().corr(method=my_corr), expected)
  341. def test_cov(self, datetime_series):
  342. # full overlap
  343. tm.assert_almost_equal(datetime_series.cov(datetime_series),
  344. datetime_series.std() ** 2)
  345. # partial overlap
  346. tm.assert_almost_equal(datetime_series[:15].cov(datetime_series[5:]),
  347. datetime_series[5:15].std() ** 2)
  348. # No overlap
  349. assert np.isnan(datetime_series[::2].cov(datetime_series[1::2]))
  350. # all NA
  351. cp = datetime_series[:10].copy()
  352. cp[:] = np.nan
  353. assert isna(cp.cov(cp))
  354. # min_periods
  355. assert isna(datetime_series[:15].cov(datetime_series[5:],
  356. min_periods=12))
  357. ts1 = datetime_series[:15].reindex(datetime_series.index)
  358. ts2 = datetime_series[5:].reindex(datetime_series.index)
  359. assert isna(ts1.cov(ts2, min_periods=12))
  360. def test_count(self, datetime_series):
  361. assert datetime_series.count() == len(datetime_series)
  362. datetime_series[::2] = np.NaN
  363. assert datetime_series.count() == np.isfinite(datetime_series).sum()
  364. mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
  365. ts = Series(np.arange(len(mi)), index=mi)
  366. left = ts.count(level=1)
  367. right = Series([2, 3, 1], index=[1, 2, nan])
  368. assert_series_equal(left, right)
  369. ts.iloc[[0, 3, 5]] = nan
  370. assert_series_equal(ts.count(level=1), right - 1)
  371. def test_dot(self):
  372. a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
  373. b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'],
  374. columns=['p', 'q', 'r', 's']).T
  375. result = a.dot(b)
  376. expected = Series(np.dot(a.values, b.values), index=['1', '2', '3'])
  377. assert_series_equal(result, expected)
  378. # Check index alignment
  379. b2 = b.reindex(index=reversed(b.index))
  380. result = a.dot(b)
  381. assert_series_equal(result, expected)
  382. # Check ndarray argument
  383. result = a.dot(b.values)
  384. assert np.all(result == expected.values)
  385. assert_almost_equal(a.dot(b['2'].values), expected['2'])
  386. # Check series argument
  387. assert_almost_equal(a.dot(b['1']), expected['1'])
  388. assert_almost_equal(a.dot(b2['1']), expected['1'])
  389. msg = r"Dot product shape mismatch, \(4L?,\) vs \(3L?,\)"
  390. # exception raised is of type Exception
  391. with pytest.raises(Exception, match=msg):
  392. a.dot(a.values[:3])
  393. msg = "matrices are not aligned"
  394. with pytest.raises(ValueError, match=msg):
  395. a.dot(b.T)
    @pytest.mark.skipif(not PY35,
                        reason='matmul supported for Python>=3.5')
    def test_matmul(self):
        # matmul test is for GH #10259
        """Operator @ between Series, DataFrame, ndarrays and nested lists.

        The __rmatmul__ cases (non-pandas left operand) are GH 21530.
        """
        a = Series(np.random.randn(4), index=['p', 'q', 'r', 's'])
        b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'],
                      columns=['p', 'q', 'r', 's']).T

        # Series @ DataFrame
        result = operator.matmul(a, b)
        expected = Series(np.dot(a.values, b.values), index=['1', '2', '3'])
        assert_series_equal(result, expected)

        # DataFrame @ Series
        result = operator.matmul(b.T, a)
        expected = Series(np.dot(b.T.values, a.T.values),
                          index=['1', '2', '3'])
        assert_series_equal(result, expected)

        # Series @ Series (scalar result)
        result = operator.matmul(a, a)
        expected = np.dot(a.values, a.values)
        assert_almost_equal(result, expected)

        # GH 21530
        # vector (1D np.array) @ Series (__rmatmul__)
        result = operator.matmul(a.values, a)
        expected = np.dot(a.values, a.values)
        assert_almost_equal(result, expected)

        # GH 21530
        # vector (1D list) @ Series (__rmatmul__)
        result = operator.matmul(a.values.tolist(), a)
        expected = np.dot(a.values, a.values)
        assert_almost_equal(result, expected)

        # GH 21530
        # matrix (2D np.array) @ Series (__rmatmul__)
        result = operator.matmul(b.T.values, a)
        expected = np.dot(b.T.values, a.values)
        assert_almost_equal(result, expected)

        # GH 21530
        # matrix (2D nested lists) @ Series (__rmatmul__)
        result = operator.matmul(b.T.values.tolist(), a)
        expected = np.dot(b.T.values, a.values)
        assert_almost_equal(result, expected)

        # mixed dtype DataFrame @ Series
        a['p'] = int(a.p)
        result = operator.matmul(b.T, a)
        expected = Series(np.dot(b.T.values, a.T.values),
                          index=['1', '2', '3'])
        assert_series_equal(result, expected)

        # different dtypes DataFrame @ Series
        a = a.astype(int)
        result = operator.matmul(b.T, a)
        expected = Series(np.dot(b.T.values, a.T.values),
                          index=['1', '2', '3'])
        assert_series_equal(result, expected)

        # shape mismatch raises (plain Exception from Series.dot)
        msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)"
        # exception raised is of type Exception
        with pytest.raises(Exception, match=msg):
            a.dot(a.values[:3])

        # disjoint labels cannot be aligned
        msg = "matrices are not aligned"
        with pytest.raises(ValueError, match=msg):
            a.dot(b.T)
    def test_clip(self, datetime_series):
        """clip bounds values; deprecated clip_lower/clip_upper must warn."""
        val = datetime_series.median()

        # one-sided variants are deprecated in favor of clip(lower=/upper=)
        with tm.assert_produces_warning(FutureWarning):
            assert datetime_series.clip_lower(val).min() == val
        with tm.assert_produces_warning(FutureWarning):
            assert datetime_series.clip_upper(val).max() == val

        assert datetime_series.clip(lower=val).min() == val
        assert datetime_series.clip(upper=val).max() == val

        # two-sided clip matches numpy's behavior and returns a Series
        result = datetime_series.clip(-0.5, 0.5)
        expected = np.clip(datetime_series, -0.5, 0.5)
        assert_series_equal(result, expected)
        assert isinstance(expected, Series)
    def test_clip_types_and_nulls(self):
        """clip_lower/clip_upper across float, object and datetime dtypes.

        Nulls must survive clipping unchanged in every dtype.
        """
        sers = [Series([np.nan, 1.0, 2.0, 3.0]), Series([None, 'a', 'b', 'c']),
                Series(pd.to_datetime(
                    [np.nan, 1, 2, 3], unit='D'))]

        for s in sers:
            thresh = s[2]
            with tm.assert_produces_warning(FutureWarning):
                lower = s.clip_lower(thresh)
            with tm.assert_produces_warning(FutureWarning):
                upper = s.clip_upper(thresh)
            assert lower[notna(lower)].min() == thresh
            assert upper[notna(upper)].max() == thresh
            # null positions are preserved, not clipped away
            assert list(isna(s)) == list(isna(lower))
            assert list(isna(s)) == list(isna(upper))
  481. def test_clip_with_na_args(self):
  482. """Should process np.nan argument as None """
  483. # GH # 17276
  484. s = Series([1, 2, 3])
  485. assert_series_equal(s.clip(np.nan), Series([1, 2, 3]))
  486. assert_series_equal(s.clip(upper=np.nan, lower=np.nan),
  487. Series([1, 2, 3]))
  488. # GH #19992
  489. assert_series_equal(s.clip(lower=[0, 4, np.nan]),
  490. Series([1, 4, np.nan]))
  491. assert_series_equal(s.clip(upper=[1, np.nan, 1]),
  492. Series([1, np.nan, 1]))
    def test_clip_against_series(self):
        # GH #6966
        """Element-wise clipping against Series thresholds."""
        s = Series([1.0, 1.0, 4.0])
        threshold = Series([1.0, 2.0, 3.0])

        # deprecated one-sided variants still accept Series thresholds
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(s.clip_lower(threshold),
                                Series([1.0, 2.0, 4.0]))
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(s.clip_upper(threshold),
                                Series([1.0, 1.0, 3.0]))

        lower = Series([1.0, 2.0, 3.0])
        upper = Series([1.5, 2.5, 3.5])

        # two-sided clip, including a scalar lower with Series upper
        assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5]))
        assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5]))
  507. @pytest.mark.parametrize("inplace", [True, False])
  508. @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])])
  509. def test_clip_against_list_like(self, inplace, upper):
  510. # GH #15390
  511. original = pd.Series([5, 6, 7])
  512. result = original.clip(upper=upper, inplace=inplace)
  513. expected = pd.Series([1, 2, 3])
  514. if inplace:
  515. result = original
  516. tm.assert_series_equal(result, expected, check_exact=True)
  517. def test_clip_with_datetimes(self):
  518. # GH 11838
  519. # naive and tz-aware datetimes
  520. t = Timestamp('2015-12-01 09:30:30')
  521. s = Series([Timestamp('2015-12-01 09:30:00'),
  522. Timestamp('2015-12-01 09:31:00')])
  523. result = s.clip(upper=t)
  524. expected = Series([Timestamp('2015-12-01 09:30:00'),
  525. Timestamp('2015-12-01 09:30:30')])
  526. assert_series_equal(result, expected)
  527. t = Timestamp('2015-12-01 09:30:30', tz='US/Eastern')
  528. s = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'),
  529. Timestamp('2015-12-01 09:31:00', tz='US/Eastern')])
  530. result = s.clip(upper=t)
  531. expected = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'),
  532. Timestamp('2015-12-01 09:30:30', tz='US/Eastern')])
  533. assert_series_equal(result, expected)
  534. def test_cummethods_bool(self):
  535. # GH 6270
  536. a = pd.Series([False, False, False, True, True, False, False])
  537. b = ~a
  538. c = pd.Series([False] * len(b))
  539. d = ~c
  540. methods = {'cumsum': np.cumsum,
  541. 'cumprod': np.cumprod,
  542. 'cummin': np.minimum.accumulate,
  543. 'cummax': np.maximum.accumulate}
  544. args = product((a, b, c, d), methods)
  545. for s, method in args:
  546. expected = Series(methods[method](s.values))
  547. result = getattr(s, method)()
  548. assert_series_equal(result, expected)
  549. e = pd.Series([False, True, nan, False])
  550. cse = pd.Series([0, 1, nan, 1], dtype=object)
  551. cpe = pd.Series([False, 0, nan, 0])
  552. cmin = pd.Series([False, False, nan, False])
  553. cmax = pd.Series([False, True, nan, True])
  554. expecteds = {'cumsum': cse,
  555. 'cumprod': cpe,
  556. 'cummin': cmin,
  557. 'cummax': cmax}
  558. for method in methods:
  559. res = getattr(e, method)()
  560. assert_series_equal(res, expecteds[method])
  561. def test_isin(self):
  562. s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])
  563. result = s.isin(['A', 'C'])
  564. expected = Series([True, False, True, False, False, False, True, True])
  565. assert_series_equal(result, expected)
  566. # GH: 16012
  567. # This specific issue has to have a series over 1e6 in len, but the
  568. # comparison array (in_list) must be large enough so that numpy doesn't
  569. # do a manual masking trick that will avoid this issue altogether
  570. s = Series(list('abcdefghijk' * 10 ** 5))
  571. # If numpy doesn't do the manual comparison/mask, these
  572. # unorderable mixed types are what cause the exception in numpy
  573. in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E',
  574. 'K', 'E', 'S', 'I', 'R', 'R'] * 6
  575. assert s.isin(in_list).sum() == 200000
  576. def test_isin_with_string_scalar(self):
  577. # GH4763
  578. s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])
  579. msg = (r"only list-like objects are allowed to be passed to isin\(\),"
  580. r" you passed a \[str\]")
  581. with pytest.raises(TypeError, match=msg):
  582. s.isin('a')
  583. s = Series(['aaa', 'b', 'c'])
  584. with pytest.raises(TypeError, match=msg):
  585. s.isin('aaa')
  586. def test_isin_with_i8(self):
  587. # GH 5021
  588. expected = Series([True, True, False, False, False])
  589. expected2 = Series([False, True, False, False, False])
  590. # datetime64[ns]
  591. s = Series(date_range('jan-01-2013', 'jan-05-2013'))
  592. result = s.isin(s[0:2])
  593. assert_series_equal(result, expected)
  594. result = s.isin(s[0:2].values)
  595. assert_series_equal(result, expected)
  596. # fails on dtype conversion in the first place
  597. result = s.isin(s[0:2].values.astype('datetime64[D]'))
  598. assert_series_equal(result, expected)
  599. result = s.isin([s[1]])
  600. assert_series_equal(result, expected2)
  601. result = s.isin([np.datetime64(s[1])])
  602. assert_series_equal(result, expected2)
  603. result = s.isin(set(s[0:2]))
  604. assert_series_equal(result, expected)
  605. # timedelta64[ns]
  606. s = Series(pd.to_timedelta(lrange(5), unit='d'))
  607. result = s.isin(s[0:2])
  608. assert_series_equal(result, expected)
  609. @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
  610. def test_isin_empty(self, empty):
  611. # see gh-16991
  612. s = Series(["a", "b"])
  613. expected = Series([False, False])
  614. result = s.isin(empty)
  615. tm.assert_series_equal(expected, result)
    def test_ptp(self):
        """Series.ptp (deprecated): range, NaN handling, level/axis args."""
        # GH21614
        N = 1000
        arr = np.random.randn(N)
        ser = Series(arr)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            assert np.ptp(ser) == np.ptp(arr)

        # GH11163: skipna behavior
        s = Series([3, 5, np.nan, -3, 10])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            assert s.ptp() == 13
            assert pd.isna(s.ptp(skipna=False))

        # level-wise ptp over a MultiIndex
        mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2, 3]])
        s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi)

        expected = pd.Series([6, 2], index=['a', 'b'], dtype=np.float64)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            tm.assert_series_equal(s.ptp(level=0), expected)

        expected = pd.Series([np.nan, np.nan], index=['a', 'b'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            tm.assert_series_equal(s.ptp(level=0, skipna=False), expected)

        # invalid axis is rejected
        msg = r"No axis named 1 for object type <(class|type) 'type'>"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                s.ptp(axis=1)

        # non-numeric dtype cannot compute a range
        s = pd.Series(['a', 'b', 'c', 'd', 'e'])
        msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"
        with pytest.raises(TypeError, match=msg):
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                s.ptp()

        # numeric_only is not implemented for Series.ptp
        msg = r"Series\.ptp does not implement numeric_only\."
        with pytest.raises(NotImplementedError, match=msg):
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                s.ptp(numeric_only=True)
  652. def test_repeat(self):
  653. s = Series(np.random.randn(3), index=['a', 'b', 'c'])
  654. reps = s.repeat(5)
  655. exp = Series(s.values.repeat(5), index=s.index.values.repeat(5))
  656. assert_series_equal(reps, exp)
  657. to_rep = [2, 3, 4]
  658. reps = s.repeat(to_rep)
  659. exp = Series(s.values.repeat(to_rep),
  660. index=s.index.values.repeat(to_rep))
  661. assert_series_equal(reps, exp)
  662. def test_numpy_repeat(self):
  663. s = Series(np.arange(3), name='x')
  664. expected = Series(s.values.repeat(2), name='x',
  665. index=s.index.values.repeat(2))
  666. assert_series_equal(np.repeat(s, 2), expected)
  667. msg = "the 'axis' parameter is not supported"
  668. with pytest.raises(ValueError, match=msg):
  669. np.repeat(s, 2, axis=0)
  670. def test_searchsorted(self):
  671. s = Series([1, 2, 3])
  672. result = s.searchsorted(1, side='left')
  673. assert is_scalar(result)
  674. assert result == 0
  675. result = s.searchsorted(1, side='right')
  676. assert is_scalar(result)
  677. assert result == 1
  678. def test_searchsorted_numeric_dtypes_scalar(self):
  679. s = Series([1, 2, 90, 1000, 3e9])
  680. r = s.searchsorted(30)
  681. assert is_scalar(r)
  682. assert r == 2
  683. r = s.searchsorted([30])
  684. e = np.array([2], dtype=np.intp)
  685. tm.assert_numpy_array_equal(r, e)
  686. def test_searchsorted_numeric_dtypes_vector(self):
  687. s = Series([1, 2, 90, 1000, 3e9])
  688. r = s.searchsorted([91, 2e6])
  689. e = np.array([3, 4], dtype=np.intp)
  690. tm.assert_numpy_array_equal(r, e)
  691. def test_search_sorted_datetime64_scalar(self):
  692. s = Series(pd.date_range('20120101', periods=10, freq='2D'))
  693. v = pd.Timestamp('20120102')
  694. r = s.searchsorted(v)
  695. assert is_scalar(r)
  696. assert r == 1
  697. def test_search_sorted_datetime64_list(self):
  698. s = Series(pd.date_range('20120101', periods=10, freq='2D'))
  699. v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')]
  700. r = s.searchsorted(v)
  701. e = np.array([1, 2], dtype=np.intp)
  702. tm.assert_numpy_array_equal(r, e)
  703. def test_searchsorted_sorter(self):
  704. # GH8490
  705. s = Series([3, 1, 2])
  706. r = s.searchsorted([0, 3], sorter=np.argsort(s))
  707. e = np.array([0, 2], dtype=np.intp)
  708. tm.assert_numpy_array_equal(r, e)
    def test_is_monotonic(self):
        # 1000 random draws from 10 values are (virtually certainly) unsorted
        s = Series(np.random.randint(0, 10, size=1000))
        assert not s.is_monotonic
        # the properties must return the bool singletons, hence `is True`
        s = Series(np.arange(1000))
        assert s.is_monotonic is True
        assert s.is_monotonic_increasing is True
        s = Series(np.arange(1000, 0, -1))
        assert s.is_monotonic_decreasing is True

        # datetime values follow the same rules
        s = Series(pd.date_range('20130101', periods=10))
        assert s.is_monotonic is True
        assert s.is_monotonic_increasing is True
        s = Series(list(reversed(s.tolist())))
        assert s.is_monotonic is False
        assert s.is_monotonic_decreasing is True
    def test_sort_index_level(self):
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        s = Series([1, 2], mi)
        backwards = s.iloc[[1, 0]]

        # sorting by one (or a subset of) level(s) sorts the remaining
        # levels too by default, which reverses this series
        res = s.sort_index(level='A')
        assert_series_equal(backwards, res)

        res = s.sort_index(level=['A', 'B'])
        assert_series_equal(backwards, res)

        # sort_remaining=False leaves ties on the requested levels untouched
        res = s.sort_index(level='A', sort_remaining=False)
        assert_series_equal(s, res)

        res = s.sort_index(level=['A', 'B'], sort_remaining=False)
        assert_series_equal(s, res)
    def test_apply_categorical(self):
        values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
                                ordered=True)
        s = pd.Series(values, name='XX', index=list('abcdefg'))
        result = s.apply(lambda x: x.lower())

        # should be categorical dtype when the number of categories are
        # the same
        values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
                                ordered=True)
        exp = pd.Series(values, name='XX', index=list('abcdefg'))
        tm.assert_series_equal(result, exp)
        tm.assert_categorical_equal(result.values, exp.values)

        # a constant-returning function collapses to object dtype instead
        result = s.apply(lambda x: 'A')
        exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
        tm.assert_series_equal(result, exp)
        assert result.dtype == np.object
  751. def test_shift_int(self, datetime_series):
  752. ts = datetime_series.astype(int)
  753. shifted = ts.shift(1)
  754. expected = ts.astype(float).shift(1)
  755. assert_series_equal(shifted, expected)
    def test_shift_categorical(self):
        # GH 9416
        s = pd.Series(['a', 'b', 'c', 'd'], dtype='category')

        # shifting forward then back only loses the NaN hole
        assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna())

        sp1 = s.shift(1)
        assert_index_equal(s.index, sp1.index)
        # positions vacated by the shift are coded -1 (missing)
        assert np.all(sp1.values.codes[:1] == -1)
        assert np.all(s.values.codes[:-1] == sp1.values.codes[1:])

        sn2 = s.shift(-2)
        assert_index_equal(s.index, sn2.index)
        assert np.all(sn2.values.codes[-2:] == -1)
        assert np.all(s.values.codes[2:] == sn2.values.codes[:-2])

        # shifting must not alter the categories themselves
        assert_index_equal(s.values.categories, sp1.values.categories)
        assert_index_equal(s.values.categories, sn2.values.categories)
    def test_unstack(self):
        from numpy import nan

        index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']],
                           codes=[[1, 1, 0, 0], [0, 1, 0, 2]])

        s = Series(np.arange(4.), index=index)
        unstacked = s.unstack()

        # missing (level-0, level-1) combinations become NaN cells
        expected = DataFrame([[2., nan, 3.], [0., 1., nan]],
                             index=['bar', 'foo'],
                             columns=['one', 'three', 'two'])

        assert_frame_equal(unstacked, expected)

        # unstacking the first level is the transpose of unstacking the last
        unstacked = s.unstack(level=0)
        assert_frame_equal(unstacked, expected.T)

        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                                  [0, 1, 0, 1, 0, 1]])
        s = Series(np.random.randn(6), index=index)
        exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]],
                               codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
        expected = DataFrame({'bar': s.values},
                             index=exp_index).sort_index(level=0)
        unstacked = s.unstack(0).sort_index()
        assert_frame_equal(unstacked, expected)

        # GH5873: NaN in an index level becomes a NaN column label
        idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]])
        ts = pd.Series([1, 2], index=idx)
        left = ts.unstack()
        right = DataFrame([[nan, 1], [2, nan]], index=[101, 102],
                          columns=[nan, 3.5])
        assert_frame_equal(left, right)

        idx = pd.MultiIndex.from_arrays([['cat', 'cat', 'cat', 'dog', 'dog'
                                          ], ['a', 'a', 'b', 'a', 'b'],
                                         [1, 2, 1, 1, np.nan]])
        ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx)
        right = DataFrame([[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]],
                          columns=['cat', 'dog'])
        tpls = [('a', 1), ('a', 2), ('b', nan), ('b', 1)]
        right.index = pd.MultiIndex.from_tuples(tpls)
        assert_frame_equal(ts.unstack(level=0), right)
    def test_value_counts_datetime(self):
        # most dtypes are tested in test_base.py
        values = [pd.Timestamp('2011-01-01 09:00'),
                  pd.Timestamp('2011-01-01 10:00'),
                  pd.Timestamp('2011-01-01 11:00'),
                  pd.Timestamp('2011-01-01 09:00'),
                  pd.Timestamp('2011-01-01 09:00'),
                  pd.Timestamp('2011-01-01 11:00')]

        # result is sorted by count descending: 09:00 x3, 11:00 x2, 10:00 x1
        exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00',
                                    '2011-01-01 10:00'])
        exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')

        s = pd.Series(values, name='xxx')
        tm.assert_series_equal(s.value_counts(), exp)
        # check DatetimeIndex outputs the same result
        idx = pd.DatetimeIndex(values, name='xxx')
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = pd.Series(np.array([3., 2., 1]) / 6.,
                        index=exp_idx, name='xxx')
        tm.assert_series_equal(s.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
    def test_value_counts_datetime_tz(self):
        values = [pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
                  pd.Timestamp('2011-01-01 10:00', tz='US/Eastern'),
                  pd.Timestamp('2011-01-01 11:00', tz='US/Eastern'),
                  pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
                  pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'),
                  pd.Timestamp('2011-01-01 11:00', tz='US/Eastern')]

        # the timezone must survive into the resulting index
        exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00',
                                    '2011-01-01 10:00'], tz='US/Eastern')
        exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')

        s = pd.Series(values, name='xxx')
        tm.assert_series_equal(s.value_counts(), exp)
        # same result via a tz-aware DatetimeIndex
        idx = pd.DatetimeIndex(values, name='xxx')
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = pd.Series(np.array([3., 2., 1]) / 6.,
                        index=exp_idx, name='xxx')
        tm.assert_series_equal(s.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
    def test_value_counts_period(self):
        values = [pd.Period('2011-01', freq='M'),
                  pd.Period('2011-02', freq='M'),
                  pd.Period('2011-03', freq='M'),
                  pd.Period('2011-01', freq='M'),
                  pd.Period('2011-01', freq='M'),
                  pd.Period('2011-03', freq='M')]

        # sorted by count descending: 2011-01 x3, 2011-03 x2, 2011-02 x1
        exp_idx = pd.PeriodIndex(['2011-01', '2011-03', '2011-02'], freq='M')
        exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')

        s = pd.Series(values, name='xxx')
        tm.assert_series_equal(s.value_counts(), exp)
        # check PeriodIndex outputs the same result
        idx = pd.PeriodIndex(values, name='xxx')
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = pd.Series(np.array([3., 2., 1]) / 6.,
                        index=exp_idx, name='xxx')
        tm.assert_series_equal(s.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
    def test_value_counts_categorical_ordered(self):
        # most dtypes are tested in test_base.py
        values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True)

        # orderedness carries through to the result's CategoricalIndex
        exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3],
                                      ordered=True)
        exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')

        s = pd.Series(values, name='xxx')
        tm.assert_series_equal(s.value_counts(), exp)
        # check CategoricalIndex outputs the same result
        idx = pd.CategoricalIndex(values, name='xxx')
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = pd.Series(np.array([3., 2., 1]) / 6.,
                        index=exp_idx, name='xxx')
        tm.assert_series_equal(s.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
    def test_value_counts_categorical_not_ordered(self):
        values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False)

        # unordered input -> unordered CategoricalIndex in the result
        exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3],
                                      ordered=False)
        exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx')

        s = pd.Series(values, name='xxx')
        tm.assert_series_equal(s.value_counts(), exp)
        # check CategoricalIndex outputs the same result
        idx = pd.CategoricalIndex(values, name='xxx')
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = pd.Series(np.array([3., 2., 1]) / 6.,
                        index=exp_idx, name='xxx')
        tm.assert_series_equal(s.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)
    @pytest.mark.parametrize("func", [np.any, np.all])
    @pytest.mark.parametrize("kwargs", [
        dict(keepdims=True),
        dict(out=object()),
    ])
    @td.skip_if_np_lt_115
    def test_validate_any_all_out_keepdims_raises(self, kwargs, func):
        # numpy reductions dispatched to pandas must reject numpy-only
        # keyword arguments (out / keepdims) with an informative error
        s = pd.Series([1, 2])
        param = list(kwargs)[0]
        name = func.__name__

        msg = (r"the '{arg}' parameter is not "
               r"supported in the pandas "
               r"implementation of {fname}\(\)").format(arg=param, fname=name)
        with pytest.raises(ValueError, match=msg):
            func(s, **kwargs)
    @td.skip_if_np_lt_115
    def test_validate_sum_initial(self):
        # np.sum's 'initial' kwarg (numpy >= 1.15) is not supported by
        # the pandas implementation and must raise
        s = pd.Series([1, 2])
        msg = (r"the 'initial' parameter is not "
               r"supported in the pandas "
               r"implementation of sum\(\)")
        with pytest.raises(ValueError, match=msg):
            np.sum(s, initial=10)
    def test_validate_median_initial(self):
        # numpy-only 'overwrite_input' kwarg is rejected by Series.median
        s = pd.Series([1, 2])
        msg = (r"the 'overwrite_input' parameter is not "
               r"supported in the pandas "
               r"implementation of median\(\)")
        with pytest.raises(ValueError, match=msg):
            # It seems like np.median doesn't dispatch, so we use the
            # method instead of the ufunc.
            s.median(overwrite_input=True)
    @td.skip_if_np_lt_115
    def test_validate_stat_keepdims(self):
        # numpy-only 'keepdims' kwarg is rejected when np.sum dispatches
        # to the pandas implementation
        s = pd.Series([1, 2])
        msg = (r"the 'keepdims' parameter is not "
               r"supported in the pandas "
               r"implementation of sum\(\)")
        with pytest.raises(ValueError, match=msg):
            np.sum(s, keepdims=True)
# dtype labels; each matches a column name produced by the s_main_dtypes
# fixture below and parametrizes s_main_dtypes_split
main_dtypes = [
    'datetime',
    'datetimetz',
    'timedelta',
    'int8',
    'int16',
    'int32',
    'int64',
    'float32',
    'float64',
    'uint8',
    'uint16',
    'uint32',
    'uint64'
]
@pytest.fixture
def s_main_dtypes():
    """A DataFrame with many dtypes

    * datetime
    * datetimetz
    * timedelta
    * [u]int{8,16,32,64}
    * float{32,64}

    The columns are the name of the dtype.
    """
    df = pd.DataFrame(
        {'datetime': pd.to_datetime(['2003', '2002',
                                     '2001', '2002',
                                     '2005']),
         'datetimetz': pd.to_datetime(
             ['2003', '2002',
              '2001', '2002',
              '2005']).tz_localize('US/Eastern'),
         'timedelta': pd.to_timedelta(['3d', '2d', '1d',
                                       '2d', '5d'])})

    # numeric columns share the same relative ordering [3, 2, 1, 2, 5] so
    # nlargest/nsmallest expectations are identical across all dtypes
    for dtype in ['int8', 'int16', 'int32', 'int64',
                  'float32', 'float64',
                  'uint8', 'uint16', 'uint32', 'uint64']:
        df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype)
    return df
@pytest.fixture(params=main_dtypes)
def s_main_dtypes_split(request, s_main_dtypes):
    """Each series in s_main_dtypes."""
    # one fixture instantiation per dtype label in main_dtypes
    return s_main_dtypes[request.param]
  981. def assert_check_nselect_boundary(vals, dtype, method):
  982. # helper function for 'test_boundary_{dtype}' tests
  983. s = Series(vals, dtype=dtype)
  984. result = getattr(s, method)(3)
  985. expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
  986. expected = s.loc[expected_idxr]
  987. tm.assert_series_equal(result, expected)
class TestNLargestNSmallest(object):

    @pytest.mark.parametrize(
        "r", [Series([3., 2, 1, 2, '5'], dtype='object'),
              Series([3., 2, 1, 2, 5], dtype='object'),
              # not supported on some archs
              # Series([3., 2, 1, 2, 5], dtype='complex256'),
              Series([3., 2, 1, 2, 5], dtype='complex128'),
              Series(list('abcde')),
              Series(list('abcde'), dtype='category')])
    def test_error(self, r):
        # object/complex/str/category dtypes are rejected regardless of n
        dt = r.dtype
        msg = ("Cannot use method 'n(larg|small)est' with "
               "dtype {dt}".format(dt=dt))
        args = 2, len(r), 0, -1
        methods = r.nlargest, r.nsmallest
        for method, arg in product(methods, args):
            with pytest.raises(TypeError, match=msg):
                method(arg)

    def test_nsmallest_nlargest(self, s_main_dtypes_split):
        # float, int, datetime64 (use i8), timedelta64 (same),
        # object that are numbers, object that are strings
        s = s_main_dtypes_split

        assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
        assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]])

        # n <= 0 yields an empty result
        empty = s.iloc[0:0]
        assert_series_equal(s.nsmallest(0), empty)
        assert_series_equal(s.nsmallest(-1), empty)
        assert_series_equal(s.nlargest(0), empty)
        assert_series_equal(s.nlargest(-1), empty)

        # n >= len(s) is clipped to the fully sorted series
        assert_series_equal(s.nsmallest(len(s)), s.sort_values())
        assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values())
        assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]])
        assert_series_equal(s.nlargest(len(s) + 1),
                            s.iloc[[4, 0, 1, 3, 2]])

    def test_misc(self):
        # NaNs never appear among the selected values
        s = Series([3., np.nan, 1, 2, 5])
        assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
        assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])

        msg = 'keep must be either "first", "last"'
        with pytest.raises(ValueError, match=msg):
            s.nsmallest(keep='invalid')
        with pytest.raises(ValueError, match=msg):
            s.nlargest(keep='invalid')

        # GH 15297: with an all-tie input, keep='first'/'last' decides
        # which index labels survive
        s = Series([1] * 5, index=[1, 2, 3, 4, 5])
        expected_first = Series([1] * 3, index=[1, 2, 3])
        expected_last = Series([1] * 3, index=[5, 4, 3])

        result = s.nsmallest(3)
        assert_series_equal(result, expected_first)

        result = s.nsmallest(3, keep='last')
        assert_series_equal(result, expected_last)

        result = s.nlargest(3)
        assert_series_equal(result, expected_first)

        result = s.nlargest(3, keep='last')
        assert_series_equal(result, expected_last)

    @pytest.mark.parametrize('n', range(1, 5))
    def test_n(self, n):
        # GH 13412: duplicate index labels must not confuse the selection
        s = Series([1, 4, 3, 2], index=[0, 0, 1, 1])
        result = s.nlargest(n)
        expected = s.sort_values(ascending=False).head(n)
        assert_series_equal(result, expected)

        result = s.nsmallest(n)
        expected = s.sort_values().head(n)
        assert_series_equal(result, expected)

    def test_boundary_integer(self, nselect_method, any_int_dtype):
        # GH 21426: extreme representable values must not wrap around
        dtype_info = np.iinfo(any_int_dtype)
        min_val, max_val = dtype_info.min, dtype_info.max
        vals = [min_val, min_val + 1, max_val - 1, max_val]
        assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)

    def test_boundary_float(self, nselect_method, float_dtype):
        # GH 21426
        dtype_info = np.finfo(float_dtype)
        min_val, max_val = dtype_info.min, dtype_info.max
        min_2nd, max_2nd = np.nextafter(
            [min_val, max_val], 0, dtype=float_dtype)
        vals = [min_val, min_2nd, max_2nd, max_val]
        assert_check_nselect_boundary(vals, float_dtype, nselect_method)

    @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
    def test_boundary_datetimelike(self, nselect_method, dtype):
        # GH 21426
        # use int64 bounds and +1 to min_val since true minimum is NaT
        # (include min_val/NaT at end to maintain same expected_idxr)
        dtype_info = np.iinfo('int64')
        min_val, max_val = dtype_info.min, dtype_info.max
        vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
        assert_check_nselect_boundary(vals, dtype, nselect_method)

    def test_duplicate_keep_all_ties(self):
        # see gh-16818: keep='all' retains every value tied with the n-th
        s = Series([10, 9, 8, 7, 7, 7, 7, 6])
        result = s.nlargest(4, keep='all')
        expected = Series([10, 9, 8, 7, 7, 7, 7])
        assert_series_equal(result, expected)

        result = s.nsmallest(2, keep='all')
        expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6])
        assert_series_equal(result, expected)
class TestCategoricalSeriesAnalytics(object):

    def test_count(self):
        # count() excludes NaN entries even when categories are declared
        s = Series(Categorical([np.nan, 1, 2, np.nan],
                               categories=[5, 4, 3, 2, 1], ordered=True))
        result = s.count()
        assert result == 2
    def test_value_counts(self):
        # GH 12835
        cats = Categorical(list('abcccb'), categories=list('cabd'))
        s = Series(cats, name='xxx')
        res = s.value_counts(sort=False)

        # sort=False keeps category order; unused category 'd' counts 0
        exp_index = CategoricalIndex(list('cabd'), categories=cats.categories)
        exp = Series([3, 1, 2, 0], name='xxx', index=exp_index)
        tm.assert_series_equal(res, exp)

        # sort=True orders by descending count
        res = s.value_counts(sort=True)

        exp_index = CategoricalIndex(list('cbad'), categories=cats.categories)
        exp = Series([3, 2, 1, 0], name='xxx', index=exp_index)
        tm.assert_series_equal(res, exp)

        # check object dtype handles the Series.name as the same
        # (tested in test_base.py)
        s = Series(["a", "b", "c", "c", "c", "b"], name='xxx')
        res = s.value_counts()
        exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"])
        tm.assert_series_equal(res, exp)
    def test_value_counts_with_nan(self):
        # see gh-9443

        # sanity check
        s = Series(["a", "b", "a"], dtype="category")
        exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))

        res = s.value_counts(dropna=True)
        tm.assert_series_equal(res, exp)

        res = s.value_counts(dropna=True)
        tm.assert_series_equal(res, exp)

        # same Series via two different constructions --> same behaviour
        series = [
            Series(["a", "b", None, "a", None, None], dtype="category"),
            Series(Categorical(["a", "b", None, "a", None, None],
                               categories=["a", "b"]))
        ]

        for s in series:
            # None is a NaN value, so we exclude its count here
            exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
            res = s.value_counts(dropna=True)
            tm.assert_series_equal(res, exp)

            # we don't exclude the count of None and sort by counts
            exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
            res = s.value_counts(dropna=False)
            tm.assert_series_equal(res, exp)

            # When we aren't sorting by counts, and np.nan isn't a
            # category, it should be last.
            exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
            res = s.value_counts(dropna=False, sort=False)
            tm.assert_series_equal(res, exp)
    @pytest.mark.parametrize(
        "dtype",
        ["int_", "uint", "float_", "unicode_", "timedelta64[h]",
         pytest.param("datetime64[D]",
                      marks=pytest.mark.xfail(reason="GH#7996"))]
    )
    @pytest.mark.parametrize("is_ordered", [True, False])
    def test_drop_duplicates_categorical_non_bool(self, dtype, is_ordered):
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        # Test case 1
        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
        tc1 = Series(Categorical(input1, categories=cat_array,
                                 ordered=is_ordered))

        # keep='first' (default): later occurrences are flagged
        expected = Series([False, False, False, True])
        tm.assert_series_equal(tc1.duplicated(), expected)
        tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        # keep='last': earlier occurrences are flagged
        expected = Series([False, False, True, False])
        tm.assert_series_equal(tc1.duplicated(keep='last'), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep='last'),
                               tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep='last', inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        # keep=False: every duplicated entry is flagged
        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc1.duplicated(keep=False), expected)
        tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
        sc = tc1.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc1[~expected])

        # Test case 2
        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
        tc2 = Series(Categorical(
            input2, categories=cat_array, ordered=is_ordered)
        )

        expected = Series([False, False, False, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(), expected)
        tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, False, False, False])
        tm.assert_series_equal(tc2.duplicated(keep='last'), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep='last'),
                               tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep='last', inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])

        expected = Series([False, True, True, False, True, True, False])
        tm.assert_series_equal(tc2.duplicated(keep=False), expected)
        tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
        sc = tc2.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc2[~expected])
    @pytest.mark.parametrize("is_ordered", [True, False])
    def test_drop_duplicates_categorical_bool(self, is_ordered):
        tc = Series(Categorical([True, False, True, False],
                                categories=[True, False], ordered=is_ordered))

        # keep='first' (default)
        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc.duplicated(), expected)
        tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
        sc = tc.copy()
        sc.drop_duplicates(inplace=True)
        tm.assert_series_equal(sc, tc[~expected])

        # keep='last'
        expected = Series([True, True, False, False])
        tm.assert_series_equal(tc.duplicated(keep='last'), expected)
        tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected])
        sc = tc.copy()
        sc.drop_duplicates(keep='last', inplace=True)
        tm.assert_series_equal(sc, tc[~expected])

        # keep=False: both values occur twice, so everything is flagged
        expected = Series([True, True, True, True])
        tm.assert_series_equal(tc.duplicated(keep=False), expected)
        tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
        sc = tc.copy()
        sc.drop_duplicates(keep=False, inplace=True)
        tm.assert_series_equal(sc, tc[~expected])