# -*- coding: utf-8 -*-
from datetime import timedelta
import operator
from string import ascii_lowercase
import warnings

import numpy as np
import pytest

from pandas.compat import PY35, lrange
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    Categorical, DataFrame, MultiIndex, Series, Timestamp, compat, date_range,
    isna, notna, to_datetime, to_timedelta)
import pandas.core.algorithms as algorithms
import pandas.core.nanops as nanops
import pandas.util.testing as tm


def assert_stat_op_calc(opname, alternative, frame, has_skipna=True,
                        check_dtype=True, check_dates=False,
                        check_less_precise=False, skipna_alternative=None):
    """
    Check that operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    check_dtype : bool, default True
        Whether the dtypes of the result of "frame.opname()" and
        "alternative(frame)" should be checked.
    check_dates : bool, default False
        Whether opname should be tested on a Datetime Series
    check_less_precise : bool, default False
        Whether results should only be compared approximately;
        passed on to tm.assert_series_equal
    skipna_alternative : function, default None
        NaN-safe version of alternative
    """
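    # Illustrative usage (a hedged sketch that mirrors how the tests below
    # call this helper; `float_frame_with_na` is a fixture defined elsewhere):
    #
    #     assert_stat_op_calc('sum', np.sum, float_frame_with_na,
    #                         skipna_alternative=np.nansum)
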
    f = getattr(frame, opname)

    if check_dates:
        df = DataFrame({'b': date_range('1/1/2001', periods=2)})
        result = getattr(df, opname)()
        assert isinstance(result, Series)

        df['a'] = lrange(len(df))
        result = getattr(df, opname)()
        assert isinstance(result, Series)
        assert len(result)

    if has_skipna:
        def wrapper(x):
            return alternative(x.values)

        skipna_wrapper = tm._make_skipna_wrapper(alternative,
                                                 skipna_alternative)
        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(result0, frame.apply(wrapper),
                               check_dtype=check_dtype,
                               check_less_precise=check_less_precise)
        # HACK: win32
        tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
                               check_dtype=False,
                               check_less_precise=check_less_precise)
    else:
        skipna_wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
                           check_dtype=check_dtype,
                           check_less_precise=check_less_precise)

    if opname in ['sum', 'prod']:
        expected = frame.apply(skipna_wrapper, axis=1)
        tm.assert_series_equal(result1, expected, check_dtype=False,
                               check_less_precise=check_less_precise)

    # check dtypes
    if check_dtype:
        lcd_dtype = frame.values.dtype
        assert lcd_dtype == result0.dtype
        assert lcd_dtype == result1.dtype

    # bad axis
    with pytest.raises(ValueError, match='No axis named 2'):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname in ['sum', 'prod']:
            unit = 1 if opname == 'prod' else 0  # result for empty sum/prod
            expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
            tm.assert_series_equal(r0, expected)
            expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
            tm.assert_series_equal(r1, expected)


def assert_stat_op_api(opname, float_frame, float_string_frame,
                       has_numeric_only=False):
    """
    Check that API for operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    float_frame : DataFrame
        DataFrame with columns of type float
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_numeric_only : bool, default False
        Whether the method "opname" has the kwarg "numeric_only"
    """
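    # Illustrative usage (mirrors the calls made by the tests below;
    # `float_frame` and `float_string_frame` are fixtures defined elsewhere):
    #
    #     assert_stat_op_api('sum', float_frame, float_string_frame,
    #                        has_numeric_only=True)
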
    # make sure works on mixed-type frame
    getattr(float_string_frame, opname)(axis=0)
    getattr(float_string_frame, opname)(axis=1)

    if has_numeric_only:
        getattr(float_string_frame, opname)(axis=0, numeric_only=True)
        getattr(float_string_frame, opname)(axis=1, numeric_only=True)
        getattr(float_frame, opname)(axis=0, numeric_only=False)
        getattr(float_frame, opname)(axis=1, numeric_only=False)


def assert_bool_op_calc(opname, alternative, frame, has_skipna=True):
    """
    Check that bool operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    """
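    # Illustrative usage (mirrors test_any_all below; `bool_frame_with_na`
    # is a fixture defined elsewhere):
    #
    #     assert_bool_op_calc('any', np.any, bool_frame_with_na,
    #                         has_skipna=True)
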
    f = getattr(frame, opname)

    if has_skipna:
        def skipna_wrapper(x):
            nona = x.dropna().values
            return alternative(nona)

        def wrapper(x):
            return alternative(x.values)

        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(result0, frame.apply(wrapper))
        tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
                               check_dtype=False)  # HACK: win32
    else:
        skipna_wrapper = alternative
        wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
    tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
                           check_dtype=False)

    # bad axis
    with pytest.raises(ValueError, match='No axis named 2'):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname == 'any':
            assert not r0.any()
            assert not r1.any()
        else:
            assert r0.all()
            assert r1.all()


def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
                       has_bool_only=False):
    """
    Check that API for boolean operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    bool_frame_with_na : DataFrame
        DataFrame with boolean columns and NA values
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_bool_only : bool, default False
        Whether the method "opname" has the kwarg "bool_only"
    """
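    # Illustrative usage (mirrors test_any_all below):
    #
    #     assert_bool_op_api('all', bool_frame_with_na, float_string_frame,
    #                        has_bool_only=True)
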
    # make sure op works on mixed-type frame
    mixed = float_string_frame
    mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5
    getattr(mixed, opname)(axis=0)
    getattr(mixed, opname)(axis=1)

    if has_bool_only:
        getattr(mixed, opname)(axis=0, bool_only=True)
        getattr(mixed, opname)(axis=1, bool_only=True)
        getattr(bool_frame_with_na, opname)(axis=0, bool_only=False)
        getattr(bool_frame_with_na, opname)(axis=1, bool_only=False)


class TestDataFrameAnalytics():

    # ----------------------------------------------------------------------
    # Correlation and covariance

    @td.skip_if_no_scipy
    def test_corr_pearson(self, float_frame):
        float_frame['A'][:5] = np.nan
        float_frame['B'][5:10] = np.nan

        self._check_method(float_frame, 'pearson')

    @td.skip_if_no_scipy
    def test_corr_kendall(self, float_frame):
        float_frame['A'][:5] = np.nan
        float_frame['B'][5:10] = np.nan

        self._check_method(float_frame, 'kendall')

    @td.skip_if_no_scipy
    def test_corr_spearman(self, float_frame):
        float_frame['A'][:5] = np.nan
        float_frame['B'][5:10] = np.nan

        self._check_method(float_frame, 'spearman')

    def _check_method(self, frame, method='pearson'):
        correls = frame.corr(method=method)
        expected = frame['A'].corr(frame['C'], method=method)
        tm.assert_almost_equal(correls['A']['C'], expected)

    @td.skip_if_no_scipy
    def test_corr_non_numeric(self, float_frame, float_string_frame):
        float_frame['A'][:5] = np.nan
        float_frame['B'][5:10] = np.nan

        # exclude non-numeric types
        result = float_string_frame.corr()
        expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].corr()
        tm.assert_frame_equal(result, expected)

    @td.skip_if_no_scipy
    @pytest.mark.parametrize('meth', ['pearson', 'kendall', 'spearman'])
    def test_corr_nooverlap(self, meth):
        # nothing in common
        df = DataFrame({'A': [1, 1.5, 1, np.nan, np.nan, np.nan],
                        'B': [np.nan, np.nan, np.nan, 1, 1.5, 1],
                        'C': [np.nan, np.nan, np.nan, np.nan,
                              np.nan, np.nan]})
        rs = df.corr(meth)
        assert isna(rs.loc['A', 'B'])
        assert isna(rs.loc['B', 'A'])
        assert rs.loc['A', 'A'] == 1
        assert rs.loc['B', 'B'] == 1
        assert isna(rs.loc['C', 'C'])

    @td.skip_if_no_scipy
    @pytest.mark.parametrize('meth', ['pearson', 'spearman'])
    def test_corr_constant(self, meth):
        # constant --> all NA
        df = DataFrame({'A': [1, 1, 1, np.nan, np.nan, np.nan],
                        'B': [np.nan, np.nan, np.nan, 1, 1, 1]})
        rs = df.corr(meth)
        assert isna(rs.values).all()

    def test_corr_int(self):
        # dtypes other than float64 #1761
        df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})

        df3.cov()
        df3.corr()

    @td.skip_if_no_scipy
    def test_corr_int_and_boolean(self):
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it needs to be properly handled
        df = DataFrame({"a": [True, False], "b": [1, 0]})

        expected = DataFrame(np.ones((2, 2)), index=[
                             'a', 'b'], columns=['a', 'b'])
        for meth in ['pearson', 'kendall', 'spearman']:
            with warnings.catch_warnings(record=True):
                warnings.simplefilter("ignore", RuntimeWarning)
                result = df.corr(meth)
            tm.assert_frame_equal(result, expected)

    def test_corr_cov_independent_index_column(self):
        # GH 14617
        df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4),
                          columns=list("abcd"))
        for method in ['cov', 'corr']:
            result = getattr(df, method)()
            assert result.index is not result.columns
            assert result.index.equals(result.columns)

    def test_corr_invalid_method(self):
        # GH 22298
        df = pd.DataFrame(np.random.normal(size=(10, 2)))
        msg = ("method must be either 'pearson', 'spearman', "
               "or 'kendall'")
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")

    def test_cov(self, float_frame, float_string_frame):
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))

        tm.assert_frame_equal(expected, result)

        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()

        # with NAs
        frame = float_frame.copy()
        frame['A'][:5] = np.nan
        frame['B'][5:10] = np.nan
        result = float_frame.cov(min_periods=len(float_frame) - 8)
        expected = float_frame.cov()
        expected.loc['A', 'B'] = np.nan
        expected.loc['B', 'A'] = np.nan

        # regular
        float_frame['A'][:5] = np.nan
        float_frame['B'][:10] = np.nan
        cov = float_frame.cov()

        tm.assert_almost_equal(cov['A']['C'],
                               float_frame['A'].cov(float_frame['C']))

        # exclude non-numeric types
        result = float_string_frame.cov()
        expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].cov()
        tm.assert_frame_equal(result, expected)

        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
                             index=df.columns, columns=df.columns)
        tm.assert_frame_equal(result, expected)
        df.loc[0] = np.nan
        result = df.cov()
        expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)),
                             index=df.columns, columns=df.columns)
        tm.assert_frame_equal(result, expected)

    def test_corrwith(self, datetime_frame):
        a = datetime_frame
        noise = Series(np.random.randn(len(a)), index=a.index)

        b = datetime_frame.add(noise, axis=0)

        # make sure order does not matter
        b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
        del b['B']

        colcorr = a.corrwith(b, axis=0)
        tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))

        rowcorr = a.corrwith(b, axis=1)
        tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))

        dropped = a.corrwith(b, axis=0, drop=True)
        tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
        assert 'B' not in dropped

        dropped = a.corrwith(b, axis=1, drop=True)
        assert a.index[-1] not in dropped.index

        # non time-series data
        index = ['a', 'b', 'c', 'd', 'e']
        columns = ['one', 'two', 'three', 'four']
        df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
        df2 = DataFrame(np.random.randn(4, 4),
                        index=index[:4], columns=columns)
        correls = df1.corrwith(df2, axis=1)
        for row in index[:4]:
            tm.assert_almost_equal(correls[row],
                                   df1.loc[row].corr(df2.loc[row]))

    def test_corrwith_with_objects(self):
        df1 = tm.makeTimeDataFrame()
        df2 = tm.makeTimeDataFrame()
        cols = ['A', 'B', 'C', 'D']

        df1['obj'] = 'foo'
        df2['obj'] = 'bar'

        result = df1.corrwith(df2)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
        tm.assert_series_equal(result, expected)

        result = df1.corrwith(df2, axis=1)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
        tm.assert_series_equal(result, expected)

    def test_corrwith_series(self, datetime_frame):
        result = datetime_frame.corrwith(datetime_frame['A'])
        expected = datetime_frame.apply(datetime_frame['A'].corr)

        tm.assert_series_equal(result, expected)

    def test_corrwith_matches_corrcoef(self):
        df1 = DataFrame(np.arange(10000), columns=['a'])
        df2 = DataFrame(np.arange(10000) ** 2, columns=['a'])
        c1 = df1.corrwith(df2)['a']
        c2 = np.corrcoef(df1['a'], df2['a'])[0][1]

        tm.assert_almost_equal(c1, c2)
        assert c1 < 1

    def test_corrwith_mixed_dtypes(self):
        # GH 18570
        df = pd.DataFrame({'a': [1, 4, 3, 2], 'b': [4, 6, 7, 3],
                           'c': ['a', 'b', 'c', 'd']})
        s = pd.Series([0, 6, 7, 3])
        result = df.corrwith(s)
        corrs = [df['a'].corr(s), df['b'].corr(s)]
        expected = pd.Series(data=corrs, index=['a', 'b'])
        tm.assert_series_equal(result, expected)

    def test_corrwith_index_intersection(self):
        df1 = pd.DataFrame(np.random.random(size=(10, 2)),
                           columns=["a", "b"])
        df2 = pd.DataFrame(np.random.random(size=(10, 3)),
                           columns=["a", "b", "c"])

        result = df1.corrwith(df2, drop=True).index.sort_values()
        expected = df1.columns.intersection(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_index_union(self):
        df1 = pd.DataFrame(np.random.random(size=(10, 2)),
                           columns=["a", "b"])
        df2 = pd.DataFrame(np.random.random(size=(10, 3)),
                           columns=["a", "b", "c"])

        result = df1.corrwith(df2, drop=False).index.sort_values()
        expected = df1.columns.union(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_dup_cols(self):
        # GH 21925
        df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T)
        df2 = df1.copy()
        df2 = pd.concat((df2, df2[0]), axis=1)

        result = df1.corrwith(df2)
        expected = pd.Series(np.ones(4), index=[0, 0, 1, 2])
        tm.assert_series_equal(result, expected)

    @td.skip_if_no_scipy
    def test_corrwith_spearman(self):
        # GH 21925
        df = pd.DataFrame(np.random.random(size=(100, 3)))
        result = df.corrwith(df**2, method="spearman")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    @td.skip_if_no_scipy
    def test_corrwith_kendall(self):
        # GH 21925
        df = pd.DataFrame(np.random.random(size=(100, 3)))
        result = df.corrwith(df**2, method="kendall")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_bool_describe_in_mixed_frame(self):
        df = DataFrame({
            'string_data': ['a', 'b', 'c', 'd', 'e'],
            'bool_data': [True, True, False, False, False],
            'int_data': [10, 20, 30, 40, 50],
        })

        # Integer data are included in .describe() output,
        # Boolean and string data are not.
        result = df.describe()
        expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
                                           10, 20, 30, 40, 50]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'])
        tm.assert_frame_equal(result, expected)

        # Top value is a boolean value that is False
        result = df.describe(include=['bool'])

        expected = DataFrame({'bool_data': [5, 2, False, 3]},
                             index=['count', 'unique', 'top', 'freq'])
        tm.assert_frame_equal(result, expected)

    def test_describe_bool_frame(self):
        # GH 13891
        df = pd.DataFrame({
            'bool_data_1': [False, False, True, True],
            'bool_data_2': [False, True, True, True]
        })
        result = df.describe()
        expected = DataFrame({'bool_data_1': [4, 2, True, 2],
                              'bool_data_2': [4, 2, True, 3]},
                             index=['count', 'unique', 'top', 'freq'])
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame({
            'bool_data': [False, False, True, True, False],
            'int_data': [0, 1, 2, 3, 4]
        })
        result = df.describe()
        expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
                                           2, 3, 4]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'])
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame({
            'bool_data': [False, False, True, True],
            'str_data': ['a', 'b', 'c', 'a']
        })
        result = df.describe()
        expected = DataFrame({'bool_data': [4, 2, True, 2],
                              'str_data': [4, 3, 'a', 2]},
                             index=['count', 'unique', 'top', 'freq'])
        tm.assert_frame_equal(result, expected)

    def test_describe_categorical(self):
        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)
        cat = df

        # Categoricals should not show up together with numerical columns
        result = cat.describe()
        assert len(result.columns) == 1

        # In a frame, describe() for the cat should be the same as for string
        # arrays (count, unique, top, freq)
        cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
                          ordered=True)
        s = Series(cat)
        result = s.describe()
        expected = Series([4, 2, "b", 3],
                          index=['count', 'unique', 'top', 'freq'])
        tm.assert_series_equal(result, expected)

        cat = Series(Categorical(["a", "b", "c", "c"]))
        df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
        result = df3.describe()
        tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)

    def test_describe_categorical_columns(self):
        # GH 11558
        columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
                                      ordered=True, name='XXX')
        df = DataFrame({'int1': [10, 20, 30, 40, 50],
                        'int2': [10, 20, 30, 40, 50],
                        'obj': ['A', 0, None, 'X', 1]},
                       columns=columns)
        result = df.describe()

        exp_columns = pd.CategoricalIndex(['int1', 'int2'],
                                          categories=['int1', 'int2', 'obj'],
                                          ordered=True, name='XXX')
        expected = DataFrame({'int1': [5, 30, df.int1.std(),
                                       10, 20, 30, 40, 50],
                              'int2': [5, 30, df.int2.std(),
                                       10, 20, 30, 40, 50]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'],
                             columns=exp_columns)
        tm.assert_frame_equal(result, expected)
        tm.assert_categorical_equal(result.columns.values,
                                    expected.columns.values)

    def test_describe_datetime_columns(self):
        columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
                                   freq='MS', tz='US/Eastern', name='XXX')
        df = DataFrame({0: [10, 20, 30, 40, 50],
                        1: [10, 20, 30, 40, 50],
                        2: ['A', 0, None, 'X', 1]})
        df.columns = columns
        result = df.describe()

        exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
                                       freq='MS', tz='US/Eastern', name='XXX')
        expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
                                  10, 20, 30, 40, 50],
                              1: [5, 30, df.iloc[:, 1].std(),
                                  10, 20, 30, 40, 50]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'])
        expected.columns = exp_columns
        tm.assert_frame_equal(result, expected)
        assert result.columns.freq == 'MS'
        assert result.columns.tz == expected.columns.tz

    def test_describe_timedelta_values(self):
        # GH 6145
        t1 = pd.timedelta_range('1 days', freq='D', periods=5)
        t2 = pd.timedelta_range('1 hours', freq='H', periods=5)
        df = pd.DataFrame({'t1': t1, 't2': t2})

        expected = DataFrame({'t1': [5, pd.Timedelta('3 days'),
                                     df.iloc[:, 0].std(),
                                     pd.Timedelta('1 days'),
                                     pd.Timedelta('2 days'),
                                     pd.Timedelta('3 days'),
                                     pd.Timedelta('4 days'),
                                     pd.Timedelta('5 days')],
                              't2': [5, pd.Timedelta('3 hours'),
                                     df.iloc[:, 1].std(),
                                     pd.Timedelta('1 hours'),
                                     pd.Timedelta('2 hours'),
                                     pd.Timedelta('3 hours'),
                                     pd.Timedelta('4 hours'),
                                     pd.Timedelta('5 hours')]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'])

        result = df.describe()
        tm.assert_frame_equal(result, expected)

        exp_repr = (" t1 t2\n"
                    "count 5 5\n"
                    "mean 3 days 00:00:00 0 days 03:00:00\n"
                    "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
                    "min 1 days 00:00:00 0 days 01:00:00\n"
                    "25% 2 days 00:00:00 0 days 02:00:00\n"
                    "50% 3 days 00:00:00 0 days 03:00:00\n"
                    "75% 4 days 00:00:00 0 days 04:00:00\n"
                    "max 5 days 00:00:00 0 days 05:00:00")
        assert repr(result) == exp_repr

    def test_describe_tz_values(self, tz_naive_fixture):
        # GH 21332
        tz = tz_naive_fixture
        s1 = Series(range(5))
        start = Timestamp(2018, 1, 1)
        end = Timestamp(2018, 1, 5)
        s2 = Series(date_range(start, end, tz=tz))
        df = pd.DataFrame({'s1': s1, 's2': s2})

        expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan,
                                     2, 1.581139, 0, 1, 2, 3, 4],
                              's2': [5, 5, s2.value_counts().index[0], 1,
                                     start.tz_localize(tz),
                                     end.tz_localize(tz), np.nan, np.nan,
                                     np.nan, np.nan, np.nan, np.nan, np.nan]},
                             index=['count', 'unique', 'top', 'freq', 'first',
                                    'last', 'mean', 'std', 'min', '25%', '50%',
                                    '75%', 'max']
                             )
        result = df.describe(include='all')
        tm.assert_frame_equal(result, expected)

    def test_reduce_mixed_frame(self):
        # GH 6806
        df = DataFrame({
            'bool_data': [True, True, False, False, False],
            'int_data': [10, 20, 30, 40, 50],
            'string_data': ['a', 'b', 'c', 'd', 'e'],
        })
        df.reindex(columns=['bool_data', 'int_data', 'string_data'])
        test = df.sum(axis=0)
        tm.assert_numpy_array_equal(test.values,
                                    np.array([2, 150, 'abcde'], dtype=object))
        tm.assert_series_equal(test, df.T.sum(axis=1))

    def test_count(self, float_frame_with_na, float_frame,
                   float_string_frame):
        f = lambda s: notna(s).sum()
        assert_stat_op_calc('count', f, float_frame_with_na, has_skipna=False,
                            check_dtype=False, check_dates=True)
        assert_stat_op_api('count', float_frame, float_string_frame,
                           has_numeric_only=True)

        # corner case
        frame = DataFrame()
        ct1 = frame.count(1)
        assert isinstance(ct1, Series)

        ct2 = frame.count(0)
        assert isinstance(ct2, Series)

        # GH 423
        df = DataFrame(index=lrange(10))
        result = df.count(1)
        expected = Series(0, index=df.index)
        tm.assert_series_equal(result, expected)

        df = DataFrame(columns=lrange(10))
        result = df.count(0)
        expected = Series(0, index=df.columns)
        tm.assert_series_equal(result, expected)

        df = DataFrame()
        result = df.count()
        expected = Series(0, index=[])
        tm.assert_series_equal(result, expected)

    def test_nunique(self, float_frame_with_na, float_frame,
                     float_string_frame):
        f = lambda s: len(algorithms.unique1d(s.dropna()))
        assert_stat_op_calc('nunique', f, float_frame_with_na,
                            has_skipna=False, check_dtype=False,
                            check_dates=True)
        assert_stat_op_api('nunique', float_frame, float_string_frame)

        df = DataFrame({'A': [1, 1, 1],
                        'B': [1, 2, 3],
                        'C': [1, np.nan, 3]})
        tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2}))
        tm.assert_series_equal(df.nunique(dropna=False),
                               Series({'A': 1, 'B': 3, 'C': 3}))
        tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
        tm.assert_series_equal(df.nunique(axis=1, dropna=False),
                               Series({0: 1, 1: 3, 2: 2}))

    def test_sum(self, float_frame_with_na, mixed_float_frame,
                 float_frame, float_string_frame):
        assert_stat_op_api('sum', float_frame, float_string_frame,
                           has_numeric_only=True)
        assert_stat_op_calc('sum', np.sum, float_frame_with_na,
                            skipna_alternative=np.nansum)

        # mixed types (with upcasting happening)
        assert_stat_op_calc('sum', np.sum, mixed_float_frame.astype('float32'),
                            check_dtype=False, check_less_precise=True)

    @pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var',
                                        'std', 'skew', 'min', 'max'])
    def test_stat_operators_attempt_obj_array(self, method):
        # GH 676
        data = {
            'a': [-0.00049987540199591344, -0.0016467257772919831,
                  0.00067695870775883013],
            'b': [-0, -0, 0.0],
            'c': [0.00031111847529610595, 0.0014902627951905339,
                  -0.00094099200035979691]
        }
        df1 = DataFrame(data, index=['foo', 'bar', 'baz'], dtype='O')

        df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
                         2: [np.nan, 4]}, dtype=object)

        for df in [df1, df2]:
            assert df.values.dtype == np.object_
            result = getattr(df, method)(1)
            expected = getattr(df.astype('f8'), method)(1)

            if method in ['sum', 'prod']:
                tm.assert_series_equal(result, expected)

    def test_mean(self, float_frame_with_na, float_frame, float_string_frame):
        assert_stat_op_calc('mean', np.mean, float_frame_with_na,
                            check_dates=True)
        assert_stat_op_api('mean', float_frame, float_string_frame)

    @pytest.mark.parametrize('tz', [None, 'UTC'])
    def test_mean_mixed_datetime_numeric(self, tz):
        # https://github.com/pandas-dev/pandas/issues/24752
        df = pd.DataFrame({"A": [1, 1],
                           "B": [pd.Timestamp('2000', tz=tz)] * 2})
        result = df.mean()
        expected = pd.Series([1.0], index=['A'])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize('tz', [None, 'UTC'])
    def test_mean_excludes_datetimes(self, tz):
        # https://github.com/pandas-dev/pandas/issues/24752
        # Our long-term desired behavior is unclear, but the behavior in
        # 0.24.0rc1 was buggy.
        df = pd.DataFrame({"A": [pd.Timestamp('2000', tz=tz)] * 2})
        result = df.mean()
        expected = pd.Series()
        tm.assert_series_equal(result, expected)

    def test_product(self, float_frame_with_na, float_frame,
                     float_string_frame):
        assert_stat_op_calc('product', np.prod, float_frame_with_na)
        assert_stat_op_api('product', float_frame, float_string_frame)

    # TODO: Ensure warning isn't emitted in the first place
    @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
    def test_median(self, float_frame_with_na, float_frame,
                    float_string_frame):
        def wrapper(x):
            if isna(x).any():
                return np.nan
            return np.median(x)

        assert_stat_op_calc('median', wrapper, float_frame_with_na,
                            check_dates=True)
        assert_stat_op_api('median', float_frame, float_string_frame)

    def test_min(self, float_frame_with_na, int_frame,
                 float_frame, float_string_frame):
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("ignore", RuntimeWarning)
            assert_stat_op_calc('min', np.min, float_frame_with_na,
                                check_dates=True)
        assert_stat_op_calc('min', np.min, int_frame)
        assert_stat_op_api('min', float_frame, float_string_frame)

    def test_cummin(self, datetime_frame):
        datetime_frame.loc[5:10, 0] = np.nan
        datetime_frame.loc[10:15, 1] = np.nan
        datetime_frame.loc[15:, 2] = np.nan

        # axis = 0
        cummin = datetime_frame.cummin()
        expected = datetime_frame.apply(Series.cummin)
        tm.assert_frame_equal(cummin, expected)

        # axis = 1
        cummin = datetime_frame.cummin(axis=1)
        expected = datetime_frame.apply(Series.cummin, axis=1)
        tm.assert_frame_equal(cummin, expected)

        # it works
        df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
        result = df.cummin()  # noqa

        # fix issue
        cummin_xs = datetime_frame.cummin(axis=1)
        assert np.shape(cummin_xs) == np.shape(datetime_frame)

    def test_cummax(self, datetime_frame):
        datetime_frame.loc[5:10, 0] = np.nan
        datetime_frame.loc[10:15, 1] = np.nan
        datetime_frame.loc[15:, 2] = np.nan

        # axis = 0
        cummax = datetime_frame.cummax()
        expected = datetime_frame.apply(Series.cummax)
        tm.assert_frame_equal(cummax, expected)

        # axis = 1
        cummax = datetime_frame.cummax(axis=1)
        expected = datetime_frame.apply(Series.cummax, axis=1)
        tm.assert_frame_equal(cummax, expected)

        # it works
        df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
        result = df.cummax()  # noqa

        # fix issue
        cummax_xs = datetime_frame.cummax(axis=1)
        assert np.shape(cummax_xs) == np.shape(datetime_frame)

    def test_max(self, float_frame_with_na, int_frame,
                 float_frame, float_string_frame):
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("ignore", RuntimeWarning)
            assert_stat_op_calc('max', np.max, float_frame_with_na,
                                check_dates=True)
        assert_stat_op_calc('max', np.max, int_frame)
        assert_stat_op_api('max', float_frame, float_string_frame)

    def test_mad(self, float_frame_with_na, float_frame, float_string_frame):
        f = lambda x: np.abs(x - x.mean()).mean()
        assert_stat_op_calc('mad', f, float_frame_with_na)
        assert_stat_op_api('mad', float_frame, float_string_frame)

    def test_var_std(self, float_frame_with_na, datetime_frame, float_frame,
                     float_string_frame):
        alt = lambda x: np.var(x, ddof=1)
        assert_stat_op_calc('var', alt, float_frame_with_na)
        assert_stat_op_api('var', float_frame, float_string_frame)

        alt = lambda x: np.std(x, ddof=1)
        assert_stat_op_calc('std', alt, float_frame_with_na)
        assert_stat_op_api('std', float_frame, float_string_frame)

        result = datetime_frame.std(ddof=4)
        expected = datetime_frame.apply(lambda x: x.std(ddof=4))
        tm.assert_almost_equal(result, expected)

        result = datetime_frame.var(ddof=4)
        expected = datetime_frame.apply(lambda x: x.var(ddof=4))
        tm.assert_almost_equal(result, expected)

        arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
        result = nanops.nanvar(arr, axis=0)
        assert not (result < 0).any()

        with pd.option_context('use_bottleneck', False):
            result = nanops.nanvar(arr, axis=0)
            assert not (result < 0).any()

    @pytest.mark.parametrize(
        "meth", ['sem', 'var', 'std'])
    def test_numeric_only_flag(self, meth):
        # GH 9201
        df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
        # set one entry to a number in str format
        df1.loc[0, 'foo'] = '100'

        df2 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
        # set one entry to a non-number str
        df2.loc[0, 'foo'] = 'a'

        result = getattr(df1, meth)(axis=1, numeric_only=True)
        expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
        tm.assert_series_equal(expected, result)

        result = getattr(df2, meth)(axis=1, numeric_only=True)
        expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
        tm.assert_series_equal(expected, result)

        # df1 has all numbers, df2 has a letter inside
        pytest.raises(TypeError, lambda: getattr(df1, meth)(
            axis=1, numeric_only=False))
        pytest.raises(TypeError, lambda: getattr(df2, meth)(
            axis=1, numeric_only=False))

    @pytest.mark.parametrize('op', ['mean', 'std', 'var',
                                    'skew', 'kurt', 'sem'])
    def test_mixed_ops(self, op):
        # GH 16116
        df = DataFrame({'int': [1, 2, 3, 4],
                        'float': [1., 2., 3., 4.],
                        'str': ['a', 'b', 'c', 'd']})

        result = getattr(df, op)()
        assert len(result) == 2

        with pd.option_context('use_bottleneck', False):
            result = getattr(df, op)()
            assert len(result) == 2

    def test_cumsum(self, datetime_frame):
        datetime_frame.loc[5:10, 0] = np.nan
        datetime_frame.loc[10:15, 1] = np.nan
        datetime_frame.loc[15:, 2] = np.nan

        # axis = 0
        cumsum = datetime_frame.cumsum()
        expected = datetime_frame.apply(Series.cumsum)
        tm.assert_frame_equal(cumsum, expected)

        # axis = 1
        cumsum = datetime_frame.cumsum(axis=1)
        expected = datetime_frame.apply(Series.cumsum, axis=1)
        tm.assert_frame_equal(cumsum, expected)

        # works
        df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
        result = df.cumsum()  # noqa

        # fix issue
        cumsum_xs = datetime_frame.cumsum(axis=1)
        assert np.shape(cumsum_xs) == np.shape(datetime_frame)

    def test_cumprod(self, datetime_frame):
        datetime_frame.loc[5:10, 0] = np.nan
        datetime_frame.loc[10:15, 1] = np.nan
        datetime_frame.loc[15:, 2] = np.nan

        # axis = 0
        cumprod = datetime_frame.cumprod()
        expected = datetime_frame.apply(Series.cumprod)
        tm.assert_frame_equal(cumprod, expected)

        # axis = 1
        cumprod = datetime_frame.cumprod(axis=1)
        expected = datetime_frame.apply(Series.cumprod, axis=1)
        tm.assert_frame_equal(cumprod, expected)

        # fix issue
        cumprod_xs = datetime_frame.cumprod(axis=1)
        assert np.shape(cumprod_xs) == np.shape(datetime_frame)

        # ints
        df = datetime_frame.fillna(0).astype(int)
        df.cumprod(0)
        df.cumprod(1)

        # ints32
        df = datetime_frame.fillna(0).astype(np.int32)
        df.cumprod(0)
        df.cumprod(1)

    def test_sem(self, float_frame_with_na, datetime_frame,
                 float_frame, float_string_frame):
        alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
        assert_stat_op_calc('sem', alt, float_frame_with_na)
        assert_stat_op_api('sem', float_frame, float_string_frame)

        result = datetime_frame.sem(ddof=4)
        expected = datetime_frame.apply(
            lambda x: x.std(ddof=4) / np.sqrt(len(x)))
        tm.assert_almost_equal(result, expected)

        arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
        result = nanops.nansem(arr, axis=0)
        assert not (result < 0).any()

        with pd.option_context('use_bottleneck', False):
            result = nanops.nansem(arr, axis=0)
            assert not (result < 0).any()

    @td.skip_if_no_scipy
    def test_skew(self, float_frame_with_na, float_frame, float_string_frame):
        from scipy.stats import skew

        def alt(x):
            if len(x) < 3:
                return np.nan
            return skew(x, bias=False)

        assert_stat_op_calc('skew', alt, float_frame_with_na)
        assert_stat_op_api('skew', float_frame, float_string_frame)

    @td.skip_if_no_scipy
    def test_kurt(self, float_frame_with_na, float_frame, float_string_frame):
        from scipy.stats import kurtosis

        def alt(x):
            if len(x) < 4:
                return np.nan
            return kurtosis(x, bias=False)

        assert_stat_op_calc('kurt', alt, float_frame_with_na)
        assert_stat_op_api('kurt', float_frame, float_string_frame)

        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           codes=[[0, 0, 0, 0, 0, 0],
                                  [0, 1, 2, 0, 1, 2],
                                  [0, 1, 0, 1, 0, 1]])
        df = DataFrame(np.random.randn(6, 3), index=index)

        kurt = df.kurt()
        kurt2 = df.kurt(level=0).xs('bar')
        tm.assert_series_equal(kurt, kurt2, check_names=False)
        assert kurt.name is None
        assert kurt2.name == 'bar'

    @pytest.mark.parametrize("dropna, expected", [
        (True, {'A': [12],
                'B': [10.0],
                'C': [1.0],
                'D': ['a'],
                'E': Categorical(['a'], categories=['a']),
                'F': to_datetime(['2000-1-2']),
                'G': to_timedelta(['1 days'])}),
        (False, {'A': [12],
                 'B': [10.0],
                 'C': [np.nan],
                 'D': np.array([np.nan], dtype=object),
                 'E': Categorical([np.nan], categories=['a']),
                 'F': [pd.NaT],
                 'G': to_timedelta([pd.NaT])}),
        (True, {'H': [8, 9, np.nan, np.nan],
                'I': [8, 9, np.nan, np.nan],
                'J': [1, np.nan, np.nan, np.nan],
                'K': Categorical(['a', np.nan, np.nan, np.nan],
                                 categories=['a']),
                'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
                'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
                'N': [0, 1, 2, 3]}),
        (False, {'H': [8, 9, np.nan, np.nan],
                 'I': [8, 9, np.nan, np.nan],
                 'J': [1, np.nan, np.nan, np.nan],
                 'K': Categorical([np.nan, 'a', np.nan, np.nan],
                                  categories=['a']),
                 'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
                 'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
                 'N': [0, 1, 2, 3]})
    ])
    def test_mode_dropna(self, dropna, expected):
        df = DataFrame({"A": [12, 12, 19, 11],
                        "B": [10, 10, np.nan, 3],
                        "C": [1, np.nan, np.nan, np.nan],
                        "D": [np.nan, np.nan, 'a', np.nan],
                        "E": Categorical([np.nan, np.nan, 'a', np.nan]),
                        "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
                        "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
                        "H": [8, 8, 9, 9],
                        "I": [9, 9, 8, 8],
                        "J": [1, 1, np.nan, np.nan],
                        "K": Categorical(['a', np.nan, 'a', np.nan]),
                        "L": to_datetime(['2000-1-2', '2000-1-2',
                                          'NaT', 'NaT']),
                        "M": to_timedelta(['1 days', 'nan',
                                           '1 days', 'nan']),
                        "N": np.arange(4, dtype='int64')})

        result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
        expected = DataFrame(expected)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.skipif(not compat.PY3, reason="only PY3")
    def test_mode_sortwarning(self):
        # Check for the warning that is raised when the mode
        # results cannot be sorted
        df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
        expected = DataFrame({'A': ['a', np.nan]})

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            result = df.mode(dropna=False)
            result = result.sort_values(by='A').reset_index(drop=True)

        tm.assert_frame_equal(result, expected)

    def test_operators_timedelta64(self):
        df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),
                            B=date_range('2012-1-2', periods=3, freq='D'),
                            C=Timestamp('20120101') -
                            timedelta(minutes=5, seconds=5)))

        diffs = DataFrame(dict(A=df['A'] - df['C'],
                               B=df['A'] - df['B']))

        # min
        result = diffs.min()
        assert result[0] == diffs.loc[0, 'A']
        assert result[1] == diffs.loc[0, 'B']

        result = diffs.min(axis=1)
        assert (result == diffs.loc[0, 'B']).all()

        # max
        result = diffs.max()
        assert result[0] == diffs.loc[2, 'A']
        assert result[1] == diffs.loc[2, 'B']

        result = diffs.max(axis=1)
        assert (result == diffs['A']).all()

        # abs
        result = diffs.abs()
        result2 = abs(diffs)
        expected = DataFrame(dict(A=df['A'] - df['C'],
                                  B=df['B'] - df['A']))
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # mixed frame
        mixed = diffs.copy()
        mixed['C'] = 'foo'
        mixed['D'] = 1
        mixed['E'] = 1.
        mixed['F'] = Timestamp('20130101')

        # results in an object array
        result = mixed.min()
        expected = Series([pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
                           pd.Timedelta(timedelta(days=-1)),
                           'foo', 1, 1.0,
                           Timestamp('20130101')],
                          index=mixed.columns)
        tm.assert_series_equal(result, expected)

        # excludes numeric
        result = mixed.min(axis=1)
        expected = Series([1, 1, 1.], index=[0, 1, 2])
        tm.assert_series_equal(result, expected)

        # works when only those columns are selected
        result = mixed[['A', 'B']].min(1)
        expected = Series([timedelta(days=-1)] * 3)
        tm.assert_series_equal(result, expected)

        result = mixed[['A', 'B']].min()
        expected = Series([timedelta(seconds=5 * 60 + 5),
                           timedelta(days=-1)], index=['A', 'B'])
        tm.assert_series_equal(result, expected)

        # GH 3106
        df = DataFrame({'time': date_range('20130102', periods=5),
                        'time2': date_range('20130105', periods=5)})
        df['off1'] = df['time2'] - df['time']
        assert df['off1'].dtype == 'timedelta64[ns]'

        df['off2'] = df['time'] - df['time2']
        df._consolidate_inplace()
        assert df['off1'].dtype == 'timedelta64[ns]'
        assert df['off2'].dtype == 'timedelta64[ns]'

    def test_sum_corner(self, empty_frame):
        axis0 = empty_frame.sum(0)
        axis1 = empty_frame.sum(1)
        assert isinstance(axis0, Series)
        assert isinstance(axis1, Series)
        assert len(axis0) == 0
        assert len(axis1) == 0

    @pytest.mark.parametrize('method, unit', [
        ('sum', 0),
        ('prod', 1),
    ])
    def test_sum_prod_nanops(self, method, unit):
        idx = ['a', 'b', 'c']
        df = pd.DataFrame({"a": [unit, unit],
                           "b": [unit, np.nan],
                           "c": [np.nan, np.nan]})
        # The default
        result = getattr(df, method)()
        expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
        tm.assert_series_equal(result, expected)

        # min_count=1
        result = getattr(df, method)(min_count=1)
        expected = pd.Series([unit, unit, np.nan], index=idx)
        tm.assert_series_equal(result, expected)

        # min_count=0
        result = getattr(df, method)(min_count=0)
        expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
        tm.assert_series_equal(result, expected)

        result = getattr(df.iloc[1:], method)(min_count=1)
        expected = pd.Series([unit, np.nan, np.nan], index=idx)
        tm.assert_series_equal(result, expected)

        # min_count > 1
        df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
        result = getattr(df, method)(min_count=5)
        expected = pd.Series(result, index=['A', 'B'])
        tm.assert_series_equal(result, expected)

        result = getattr(df, method)(min_count=6)
        expected = pd.Series(result, index=['A', 'B'])
        tm.assert_series_equal(result, expected)

    def test_sum_nanops_timedelta(self):
        # prod isn't defined on timedeltas
        idx = ['a', 'b', 'c']
        df = pd.DataFrame({"a": [0, 0],
                           "b": [0, np.nan],
                           "c": [np.nan, np.nan]})

        df2 = df.apply(pd.to_timedelta)

        # 0 by default
        result = df2.sum()
        expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx)
        tm.assert_series_equal(result, expected)

        # min_count=0
        result = df2.sum(min_count=0)
        tm.assert_series_equal(result, expected)

        # min_count=1
        result = df2.sum(min_count=1)
        expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx)
        tm.assert_series_equal(result, expected)

    def test_sum_object(self, float_frame):
        values = float_frame.values.astype(int)
        frame = DataFrame(values, index=float_frame.index,
                          columns=float_frame.columns)
        deltas = frame * timedelta(1)
        deltas.sum()

    def test_sum_bool(self, float_frame):
        # ensure this works, bug report
        bools = np.isnan(float_frame)
        bools.sum(1)
        bools.sum(0)

    def test_mean_corner(self, float_frame, float_string_frame):
        # unit test when have object data
        the_mean = float_string_frame.mean(axis=0)
        the_sum = float_string_frame.sum(axis=0, numeric_only=True)
        tm.assert_index_equal(the_sum.index, the_mean.index)
        assert len(the_mean.index) < len(float_string_frame.columns)

        # xs sum mixed type, just want to know it works...
        the_mean = float_string_frame.mean(axis=1)
        the_sum = float_string_frame.sum(axis=1, numeric_only=True)
        tm.assert_index_equal(the_sum.index, the_mean.index)

        # take mean of boolean column
        float_frame['bool'] = float_frame['A'] > 0
        means = float_frame.mean(0)
        assert means['bool'] == float_frame['bool'].values.mean()

    def test_stats_mixed_type(self, float_string_frame):
        # don't blow up
        float_string_frame.std(1)
        float_string_frame.var(1)
        float_string_frame.mean(1)
        float_string_frame.skew(1)

    # TODO: Ensure warning isn't emitted in the first place
    @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
    def test_median_corner(self, int_frame, float_frame, float_string_frame):
        def wrapper(x):
            if isna(x).any():
                return np.nan
            return np.median(x)

        assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False,
                            check_dates=True)
        assert_stat_op_api('median', float_frame, float_string_frame)

    # Miscellanea

    def test_count_objects(self, float_string_frame):
        dm = DataFrame(float_string_frame._series)
        df = DataFrame(float_string_frame._series)

        tm.assert_series_equal(dm.count(), df.count())
        tm.assert_series_equal(dm.count(1), df.count(1))

    def test_cumsum_corner(self):
        dm = DataFrame(np.arange(20).reshape(4, 5),
                       index=lrange(4), columns=lrange(5))
        # ?(wesm)
        result = dm.cumsum()  # noqa

    def test_sum_bools(self):
        df = DataFrame(index=lrange(1), columns=lrange(10))
        bools = isna(df)
        assert bools.sum(axis=1)[0] == 10

  1125. # Index of max / min
  1126. def test_idxmin(self, float_frame, int_frame):
  1127. frame = float_frame
  1128. frame.loc[5:10] = np.nan
  1129. frame.loc[15:20, -2:] = np.nan
  1130. for skipna in [True, False]:
  1131. for axis in [0, 1]:
  1132. for df in [frame, int_frame]:
  1133. result = df.idxmin(axis=axis, skipna=skipna)
  1134. expected = df.apply(Series.idxmin, axis=axis,
  1135. skipna=skipna)
  1136. tm.assert_series_equal(result, expected)
  1137. pytest.raises(ValueError, frame.idxmin, axis=2)
  1138. def test_idxmax(self, float_frame, int_frame):
  1139. frame = float_frame
  1140. frame.loc[5:10] = np.nan
  1141. frame.loc[15:20, -2:] = np.nan
  1142. for skipna in [True, False]:
  1143. for axis in [0, 1]:
  1144. for df in [frame, int_frame]:
  1145. result = df.idxmax(axis=axis, skipna=skipna)
  1146. expected = df.apply(Series.idxmax, axis=axis,
  1147. skipna=skipna)
  1148. tm.assert_series_equal(result, expected)
  1149. pytest.raises(ValueError, frame.idxmax, axis=2)
  1150. # ----------------------------------------------------------------------
  1151. # Logical reductions
  1152. @pytest.mark.parametrize('opname', ['any', 'all'])
  1153. def test_any_all(self, opname, bool_frame_with_na, float_string_frame):
  1154. assert_bool_op_calc(opname, getattr(np, opname), bool_frame_with_na,
  1155. has_skipna=True)
  1156. assert_bool_op_api(opname, bool_frame_with_na, float_string_frame,
  1157. has_bool_only=True)
  1158. def test_any_all_extra(self):
  1159. df = DataFrame({
  1160. 'A': [True, False, False],
  1161. 'B': [True, True, False],
  1162. 'C': [True, True, True],
  1163. }, index=['a', 'b', 'c'])
  1164. result = df[['A', 'B']].any(1)
  1165. expected = Series([True, True, False], index=['a', 'b', 'c'])
  1166. tm.assert_series_equal(result, expected)
  1167. result = df[['A', 'B']].any(1, bool_only=True)
  1168. tm.assert_series_equal(result, expected)
  1169. result = df.all(1)
  1170. expected = Series([True, False, False], index=['a', 'b', 'c'])
  1171. tm.assert_series_equal(result, expected)
  1172. result = df.all(1, bool_only=True)
  1173. tm.assert_series_equal(result, expected)
  1174. # Axis is None
  1175. result = df.all(axis=None).item()
  1176. assert result is False
  1177. result = df.any(axis=None).item()
  1178. assert result is True
  1179. result = df[['C']].all(axis=None).item()
  1180. assert result is True
  1181. def test_any_datetime(self):
  1182. # GH 23070
  1183. float_data = [1, np.nan, 3, np.nan]
  1184. datetime_data = [pd.Timestamp('1960-02-15'),
  1185. pd.Timestamp('1960-02-16'),
  1186. pd.NaT,
  1187. pd.NaT]
  1188. df = DataFrame({
  1189. "A": float_data,
  1190. "B": datetime_data
  1191. })
  1192. result = df.any(1)
  1193. expected = Series([True, True, True, False])
  1194. tm.assert_series_equal(result, expected)
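        # NaN and NaT are both treated as missing, so only the last row,
        # where every entry is missing, reduces to False.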

    def test_any_all_bool_only(self):
        # GH 25101
        df = DataFrame({"col1": [1, 2, 3],
                        "col2": [4, 5, 6],
                        "col3": [None, None, None]})

        result = df.all(bool_only=True)
        expected = Series(dtype=np.bool)
        tm.assert_series_equal(result, expected)

        df = DataFrame({"col1": [1, 2, 3],
                        "col2": [4, 5, 6],
                        "col3": [None, None, None],
                        "col4": [False, False, True]})

        result = df.all(bool_only=True)
        expected = Series({"col4": False})
        tm.assert_series_equal(result, expected)
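        # bool_only=True restricts the reduction to bool-dtype columns; the
        # object column of Nones is dropped rather than coerced.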

    @pytest.mark.parametrize('func, data, expected', [
        (np.any, {}, False),
        (np.all, {}, True),
        (np.any, {'A': []}, False),
        (np.all, {'A': []}, True),
        (np.any, {'A': [False, False]}, False),
        (np.all, {'A': [False, False]}, False),
        (np.any, {'A': [True, False]}, True),
        (np.all, {'A': [True, False]}, False),
        (np.any, {'A': [True, True]}, True),
        (np.all, {'A': [True, True]}, True),
        (np.any, {'A': [False], 'B': [False]}, False),
        (np.all, {'A': [False], 'B': [False]}, False),
        (np.any, {'A': [False, False], 'B': [False, True]}, True),
        (np.all, {'A': [False, False], 'B': [False, True]}, False),

        # other types
        (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False),
        (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True),
        (np.all, {'A': pd.Series([0, 1], dtype=int)}, False),
        (np.any, {'A': pd.Series([0, 1], dtype=int)}, True),
        pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
                     marks=[td.skip_if_np_lt_115]),
        pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
                     marks=[td.skip_if_np_lt_115]),
        (np.all, {'A': pd.Series([0, 1], dtype='category')}, False),
        (np.any, {'A': pd.Series([0, 1], dtype='category')}, True),
        (np.all, {'A': pd.Series([1, 2], dtype='category')}, True),
        (np.any, {'A': pd.Series([1, 2], dtype='category')}, True),

        # # Mix
        # GH 21484
        # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'),
        #           'B': pd.Series([10, 20], dtype='m8[ns]')}, True),
    ])
    def test_any_all_np_func(self, func, data, expected):
        # GH 19976
        data = DataFrame(data)
        result = func(data)
        assert isinstance(result, np.bool_)
        assert result.item() is expected

        # method version
        result = getattr(DataFrame(data), func.__name__)(axis=None)
        assert isinstance(result, np.bool_)
        assert result.item() is expected
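        # Both the np.any/np.all functions and the axis=None method calls
        # reduce the whole frame to a single NumPy boolean scalar.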

    def test_any_all_object(self):
        # GH 19976
        result = np.all(DataFrame(columns=['a', 'b'])).item()
        assert result is True

        result = np.any(DataFrame(columns=['a', 'b'])).item()
        assert result is False

    @pytest.mark.parametrize('method', ['any', 'all'])
    def test_any_all_level_axis_none_raises(self, method):
        df = DataFrame(
            {"A": 1},
            index=MultiIndex.from_product([['A', 'B'], ['a', 'b']],
                                          names=['out', 'in'])
        )
        xpr = "Must specify 'axis' when aggregating by level."
        with pytest.raises(ValueError, match=xpr):
            getattr(df, method)(axis=None, level='out')

    # ----------------------------------------------------------------------
    # Isin

    def test_isin(self):
        # GH 4211
        df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                        'ids2': ['a', 'n', 'c', 'n']},
                       index=['foo', 'bar', 'baz', 'qux'])
        other = ['a', 'b', 'c']

        result = df.isin(other)
        expected = DataFrame([df.loc[s].isin(other) for s in df.index])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
    def test_isin_empty(self, empty):
        # GH 16991
        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
        expected = DataFrame(False, df.index, df.columns)

        result = df.isin(empty)
        tm.assert_frame_equal(result, expected)

    def test_isin_dict(self):
        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
        d = {'A': ['a']}

        expected = DataFrame(False, df.index, df.columns)
        expected.loc[0, 'A'] = True

        result = df.isin(d)
        tm.assert_frame_equal(result, expected)

        # non unique columns
        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
        df.columns = ['A', 'A']
        expected = DataFrame(False, df.index, df.columns)
        expected.loc[0, 'A'] = True
        result = df.isin(d)
        tm.assert_frame_equal(result, expected)

    def test_isin_with_string_scalar(self):
        # GH 4763
        df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                        'ids2': ['a', 'n', 'c', 'n']},
                       index=['foo', 'bar', 'baz', 'qux'])
        with pytest.raises(TypeError):
            df.isin('a')

        with pytest.raises(TypeError):
            df.isin('aaa')

    def test_isin_df(self):
        df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
        df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
        expected = DataFrame(False, df1.index, df1.columns)
        result = df1.isin(df2)
        expected['A'].loc[[1, 3]] = True
        expected['B'].loc[[0, 2]] = True
        tm.assert_frame_equal(result, expected)

        # partial overlapping columns
        df2.columns = ['A', 'C']
        result = df1.isin(df2)
        expected['B'] = False
        tm.assert_frame_equal(result, expected)

    def test_isin_tuples(self):
        # GH 16394
        df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
        df['C'] = list(zip(df['A'], df['B']))
        result = df['C'].isin([(1, 'a')])
        tm.assert_series_equal(result,
                               Series([True, False, False], name="C"))

    def test_isin_df_dupe_values(self):
        df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
        # just cols duped
        df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
                        columns=['B', 'B'])
        with pytest.raises(ValueError):
            df1.isin(df2)

        # just index duped
        df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
                        columns=['A', 'B'], index=[0, 0, 1, 1])
        with pytest.raises(ValueError):
            df1.isin(df2)

        # cols and index:
        df2.columns = ['B', 'B']
        with pytest.raises(ValueError):
            df1.isin(df2)

    def test_isin_dupe_self(self):
        other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
        df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A', 'A'])
        result = df.isin(other)
        expected = DataFrame(False, index=df.index, columns=df.columns)
        expected.loc[0] = True
        expected.iloc[1, 1] = True
        tm.assert_frame_equal(result, expected)

    def test_isin_against_series(self):
        df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
                          index=['a', 'b', 'c', 'd'])
        s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
        expected = DataFrame(False, index=df.index, columns=df.columns)
        expected['A'].loc['a'] = True
        expected.loc['d'] = True
        result = df.isin(s)
        tm.assert_frame_equal(result, expected)

    def test_isin_multiIndex(self):
        idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
                                      (0, 'b', 'bar'), (0, 'b', 'baz'),
                                      (2, 'a', 'foo'), (2, 'a', 'bar'),
                                      (2, 'c', 'bar'), (2, 'c', 'baz'),
                                      (1, 'b', 'foo'), (1, 'b', 'bar'),
                                      (1, 'c', 'bar'), (1, 'c', 'baz')])
        df1 = DataFrame({'A': np.ones(12),
                         'B': np.zeros(12)}, index=idx)
        df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
                         'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
        # against regular index
        expected = DataFrame(False, index=df1.index, columns=df1.columns)
        result = df1.isin(df2)
        tm.assert_frame_equal(result, expected)

        df2.index = idx
        expected = df2.values.astype(np.bool)
        expected[:, 1] = ~expected[:, 1]
        expected = DataFrame(expected, columns=['A', 'B'], index=idx)

        result = df1.isin(df2)
        tm.assert_frame_equal(result, expected)

    def test_isin_empty_datetimelike(self):
        # GH 15473
        df1_ts = DataFrame({'date':
                            pd.to_datetime(['2014-01-01', '2014-01-02'])})
        df1_td = DataFrame({'date':
                            [pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]})
        df2 = DataFrame({'date': []})
        df3 = DataFrame()

        expected = DataFrame({'date': [False, False]})

        result = df1_ts.isin(df2)
        tm.assert_frame_equal(result, expected)
        result = df1_ts.isin(df3)
        tm.assert_frame_equal(result, expected)

        result = df1_td.isin(df2)
        tm.assert_frame_equal(result, expected)
        result = df1_td.isin(df3)
        tm.assert_frame_equal(result, expected)

    # Rounding

    def test_round(self):
        # GH 2665

        # Test that rounding an empty DataFrame does nothing
        df = DataFrame()
        tm.assert_frame_equal(df, df.round())

        # Here's the test frame we'll be working with
        df = DataFrame({'col1': [1.123, 2.123, 3.123],
                        'col2': [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame(
            {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
        tm.assert_frame_equal(df.round(), expected_rounded)

        # Round with an integer
        decimals = 2
        expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12],
                                      'col2': [1.23, 2.23, 3.23]})
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # This should also work with np.round (since np.round dispatches to
        # df.round)
        tm.assert_frame_equal(np.round(df, decimals), expected_rounded)

        # Round with a list
        round_list = [1, 2]
        with pytest.raises(TypeError):
            df.round(round_list)

        # Round with a dictionary
        expected_rounded = DataFrame(
            {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]})
        round_dict = {'col1': 1, 'col2': 2}
        tm.assert_frame_equal(df.round(round_dict), expected_rounded)

        # Incomplete dict
        expected_partially_rounded = DataFrame(
            {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]})
        partial_round_dict = {'col2': 1}
        tm.assert_frame_equal(df.round(partial_round_dict),
                              expected_partially_rounded)

        # Dict with unknown elements
        wrong_round_dict = {'col3': 2, 'col2': 1}
        tm.assert_frame_equal(df.round(wrong_round_dict),
                              expected_partially_rounded)

        # float input to `decimals`
        non_int_round_dict = {'col1': 1, 'col2': 0.5}
        with pytest.raises(TypeError):
            df.round(non_int_round_dict)

        # String input
        non_int_round_dict = {'col1': 1, 'col2': 'foo'}
        with pytest.raises(TypeError):
            df.round(non_int_round_dict)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        # List input
        non_int_round_dict = {'col1': 1, 'col2': [1, 2]}
        with pytest.raises(TypeError):
            df.round(non_int_round_dict)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        # Non integer Series inputs
        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        # Negative numbers
        negative_round_dict = {'col1': -1, 'col2': -2}
        big_df = df * 100
        expected_neg_rounded = DataFrame(
            {'col1': [110., 210, 310], 'col2': [100., 200, 300]})
        tm.assert_frame_equal(big_df.round(negative_round_dict),
                              expected_neg_rounded)

        # nan in Series round
        nan_round_Series = Series({'col1': np.nan, 'col2': 1})

        # TODO(wesm): unused?
        expected_nan_round = DataFrame({  # noqa
            'col1': [1.123, 2.123, 3.123],
            'col2': [1.2, 2.2, 3.2]})

        with pytest.raises(TypeError):
            df.round(nan_round_Series)

        # Make sure this doesn't break existing Series.round
        tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1'])

        # named columns
        # GH 11986
        decimals = 2
        expected_rounded = DataFrame(
            {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]})
        df.columns.name = "cols"
        expected_rounded.columns.name = "cols"
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # interaction of named columns & series
        tm.assert_series_equal(df['col1'].round(decimals),
                               expected_rounded['col1'])
        tm.assert_series_equal(df.round(decimals)['col1'],
                               expected_rounded['col1'])

    def test_numpy_round(self):
        # GH 12600
        df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
        out = np.round(df, decimals=0)
        expected = DataFrame([[2., 1.], [0., 7.]])
        tm.assert_frame_equal(out, expected)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.round(df, decimals=0, out=df)

    def test_round_mixed_type(self):
        # GH 11885
        df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4],
                        'col2': ['1', 'a', 'c', 'f'],
                        'col3': date_range('20111111', periods=4)})
        round_0 = DataFrame({'col1': [1., 2., 3., 4.],
                             'col2': ['1', 'a', 'c', 'f'],
                             'col3': date_range('20111111', periods=4)})
        tm.assert_frame_equal(df.round(), round_0)
        tm.assert_frame_equal(df.round(1), df)
        tm.assert_frame_equal(df.round({'col1': 1}), df)
        tm.assert_frame_equal(df.round({'col1': 0}), round_0)
        tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0)
        tm.assert_frame_equal(df.round({'col3': 1}), df)
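        # Rounding leaves non-numeric columns (strings, datetimes) untouched,
        # so only 'col1' changes here.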

    def test_round_issue(self):
        # GH 11611
        df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'],
                          index=['first', 'second', 'third'])

        dfs = pd.concat((df, df), axis=1)
        rounded = dfs.round()
        tm.assert_index_equal(rounded.index, dfs.index)

        decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A'])
        pytest.raises(ValueError, df.round, decimals)

    def test_built_in_round(self):
        if not compat.PY3:
            pytest.skip("built-in round cannot be overridden "
                        "prior to Python 3")

        # GH 11763
        # Here's the test frame we'll be working with
        df = DataFrame(
            {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame(
            {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
        tm.assert_frame_equal(round(df), expected_rounded)

    def test_round_nonunique_categorical(self):
        # See GH 21809
        idx = pd.CategoricalIndex(['low'] * 3 + ['hi'] * 3)
        df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc'))

        expected = df.round(3)
        expected.index = idx

        df_categorical = df.copy().set_index(idx)
        assert df_categorical.shape == (6, 3)
        result = df_categorical.round(3)
        assert result.shape == (6, 3)

        tm.assert_frame_equal(result, expected)

    def test_pct_change(self):
        # GH 11150
        pnl = DataFrame([np.arange(0, 40, 10),
                         np.arange(0, 40, 10),
                         np.arange(0, 40, 10)]).astype(np.float64)
        pnl.iat[1, 0] = np.nan
        pnl.iat[1, 1] = np.nan
        pnl.iat[2, 3] = 60

        for axis in range(2):
            expected = (pnl.ffill(axis=axis) /
                        pnl.ffill(axis=axis).shift(axis=axis) - 1)
            result = pnl.pct_change(axis=axis, fill_method='pad')

            tm.assert_frame_equal(result, expected)

    # Clip

    def test_clip(self, float_frame):
        median = float_frame.median().median()
        original = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            capped = float_frame.clip_upper(median)
        assert not (capped.values > median).any()

        with tm.assert_produces_warning(FutureWarning):
            floored = float_frame.clip_lower(median)
        assert not (floored.values < median).any()

        double = float_frame.clip(upper=median, lower=median)
        assert not (double.values != median).any()

        # Verify that float_frame was not changed inplace
        assert (float_frame.values == original.values).all()

    def test_inplace_clip(self, float_frame):
        # GH 15388
        median = float_frame.median().median()
        frame_copy = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            frame_copy.clip_upper(median, inplace=True)
        assert not (frame_copy.values > median).any()

        frame_copy = float_frame.copy()
        with tm.assert_produces_warning(FutureWarning):
            frame_copy.clip_lower(median, inplace=True)
        assert not (frame_copy.values < median).any()

        frame_copy = float_frame.copy()
        frame_copy.clip(upper=median, lower=median, inplace=True)
        assert not (frame_copy.values != median).any()

    def test_dataframe_clip(self):
        # GH 2747
        df = DataFrame(np.random.randn(1000, 2))

        for lb, ub in [(-1, 1), (1, -1)]:
            clipped_df = df.clip(lb, ub)

            lb, ub = min(lb, ub), max(ub, lb)
            lb_mask = df.values <= lb
            ub_mask = df.values >= ub
            mask = ~lb_mask & ~ub_mask
            assert (clipped_df.values[lb_mask] == lb).all()
            assert (clipped_df.values[ub_mask] == ub).all()
            assert (clipped_df.values[mask] == df.values[mask]).all()

    def test_clip_mixed_numeric(self):
        # TODO(jreback)
        # clip on mixed integer or floats
        # with integer clippers coerces to float
        df = DataFrame({'A': [1, 2, 3],
                        'B': [1., np.nan, 3.]})
        result = df.clip(1, 2)
        expected = DataFrame({'A': [1, 2, 2],
                              'B': [1., np.nan, 2.]})
        tm.assert_frame_equal(result, expected, check_like=True)

        # GH 24162, clipping now preserves numeric types per column
        df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]],
                       columns=['foo', 'bar', 'baz'])
        expected = df.dtypes
        result = df.clip(upper=3).dtypes
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("inplace", [True, False])
    def test_clip_against_series(self, inplace):
        # GH 6966
        df = DataFrame(np.random.randn(1000, 2))
        lb = Series(np.random.randn(1000))
        ub = lb + 1

        original = df.copy()
        clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)

        if inplace:
            clipped_df = df

        for i in range(2):
            lb_mask = original.iloc[:, i] <= lb
            ub_mask = original.iloc[:, i] >= ub
            mask = ~lb_mask & ~ub_mask

            result = clipped_df.loc[lb_mask, i]
            tm.assert_series_equal(result, lb[lb_mask], check_names=False)
            assert result.name == i

            result = clipped_df.loc[ub_mask, i]
            tm.assert_series_equal(result, ub[ub_mask], check_names=False)
            assert result.name == i

            tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
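        # Because axis=0 is passed, the Series bounds align on the row index,
        # so every column is clipped by the same per-row thresholds.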

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
    @pytest.mark.parametrize("axis,res", [
        (0, [[2., 2., 3.], [4., 5., 6.], [7., 7., 7.]]),
        (1, [[2., 3., 4.], [4., 5., 6.], [5., 6., 7.]])
    ])
    def test_clip_against_list_like(self, simple_frame,
                                    inplace, lower, axis, res):
        # GH 15390
        original = simple_frame.copy(deep=True)

        result = original.clip(lower=lower, upper=[5, 6, 7],
                               axis=axis, inplace=inplace)

        expected = pd.DataFrame(res,
                                columns=original.columns,
                                index=original.index)
        if inplace:
            result = original
        tm.assert_frame_equal(result, expected, check_exact=True)

    @pytest.mark.parametrize("axis", [0, 1, None])
    def test_clip_against_frame(self, axis):
        df = DataFrame(np.random.randn(1000, 2))
        lb = DataFrame(np.random.randn(1000, 2))
        ub = lb + 1

        clipped_df = df.clip(lb, ub, axis=axis)

        lb_mask = df <= lb
        ub_mask = df >= ub
        mask = ~lb_mask & ~ub_mask

        tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
        tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
        tm.assert_frame_equal(clipped_df[mask], df[mask])

    def test_clip_against_unordered_columns(self):
        # GH 20911
        df1 = DataFrame(np.random.randn(1000, 4),
                        columns=['A', 'B', 'C', 'D'])
        df2 = DataFrame(np.random.randn(1000, 4),
                        columns=['D', 'A', 'B', 'C'])
        df3 = DataFrame(df2.values - 1, columns=['B', 'D', 'C', 'A'])
        result_upper = df1.clip(lower=0, upper=df2)
        expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
        result_lower = df1.clip(lower=df3, upper=3)
        expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
        result_lower_upper = df1.clip(lower=df3, upper=df2)
        expected_lower_upper = df1.clip(lower=df3[df1.columns],
                                        upper=df2[df1.columns])
        tm.assert_frame_equal(result_upper, expected_upper)
        tm.assert_frame_equal(result_lower, expected_lower)
        tm.assert_frame_equal(result_lower_upper, expected_lower_upper)

    def test_clip_with_na_args(self, float_frame):
        """Should process np.nan argument as None"""
        # GH 17276
        tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
        tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan),
                              float_frame)

        # GH 19992
        df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6],
                        'col_2': [7, 8, 9]})

        result = df.clip(lower=[4, 5, np.nan], axis=0)
        expected = DataFrame({'col_0': [4, 5, np.nan],
                              'col_1': [4, 5, np.nan],
                              'col_2': [7, 8, np.nan]})
        tm.assert_frame_equal(result, expected)

        result = df.clip(lower=[4, 5, np.nan], axis=1)
        expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6],
                              'col_2': [np.nan, np.nan, np.nan]})
        tm.assert_frame_equal(result, expected)

    # Matrix-like

    def test_dot(self):
        a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
                      columns=['p', 'q', 'r', 's'])
        b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
                      columns=['one', 'two'])

        result = a.dot(b)
        expected = DataFrame(np.dot(a.values, b.values),
                             index=['a', 'b', 'c'],
                             columns=['one', 'two'])
        tm.assert_frame_equal(result, expected)

        # Check alignment
        b1 = b.reindex(index=reversed(b.index))
        result = a.dot(b1)
        tm.assert_frame_equal(result, expected)

        # Check series argument
        result = a.dot(b['one'])
        tm.assert_series_equal(result, expected['one'], check_names=False)
        assert result.name is None

        result = a.dot(b1['one'])
        tm.assert_series_equal(result, expected['one'], check_names=False)
        assert result.name is None

        # can pass correct-length arrays
        row = a.iloc[0].values

        result = a.dot(row)
        expected = a.dot(a.iloc[0])
        tm.assert_series_equal(result, expected)

        with pytest.raises(ValueError, match='Dot product shape mismatch'):
            a.dot(row[:-1])

        a = np.random.rand(1, 5)
        b = np.random.rand(5, 1)
        A = DataFrame(a)

        # TODO(wesm): unused
        B = DataFrame(b)  # noqa

        # it works
        result = A.dot(b)

        # unaligned
        df = DataFrame(np.random.randn(3, 4),
                       index=[1, 2, 3], columns=lrange(4))
        df2 = DataFrame(np.random.randn(5, 3),
                        index=lrange(5), columns=[1, 2, 3])

        with pytest.raises(ValueError, match='aligned'):
            df.dot(df2)

    @pytest.mark.skipif(not PY35,
                        reason='matmul supported for Python>=3.5')
    def test_matmul(self):
        # matmul test is for GH 10259
        a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
                      columns=['p', 'q', 'r', 's'])
        b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
                      columns=['one', 'two'])

        # DataFrame @ DataFrame
        result = operator.matmul(a, b)
        expected = DataFrame(np.dot(a.values, b.values),
                             index=['a', 'b', 'c'],
                             columns=['one', 'two'])
        tm.assert_frame_equal(result, expected)

        # DataFrame @ Series
        result = operator.matmul(a, b.one)
        expected = Series(np.dot(a.values, b.one.values),
                          index=['a', 'b', 'c'])
        tm.assert_series_equal(result, expected)

        # np.array @ DataFrame
        result = operator.matmul(a.values, b)
        assert isinstance(result, DataFrame)
        assert result.columns.equals(b.columns)
        assert result.index.equals(pd.Index(range(3)))
        expected = np.dot(a.values, b.values)
        tm.assert_almost_equal(result.values, expected)

        # nested list @ DataFrame (__rmatmul__)
        result = operator.matmul(a.values.tolist(), b)
        expected = DataFrame(np.dot(a.values, b.values),
                             index=['a', 'b', 'c'],
                             columns=['one', 'two'])
        tm.assert_almost_equal(result.values, expected.values)

        # mixed dtype DataFrame @ DataFrame
        a['q'] = a.q.round().astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(np.dot(a.values, b.values),
                             index=['a', 'b', 'c'],
                             columns=['one', 'two'])
        tm.assert_frame_equal(result, expected)

        # different dtypes DataFrame @ DataFrame
        a = a.astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(np.dot(a.values, b.values),
                             index=['a', 'b', 'c'],
                             columns=['one', 'two'])
        tm.assert_frame_equal(result, expected)

        # unaligned
        df = DataFrame(np.random.randn(3, 4),
                       index=[1, 2, 3], columns=lrange(4))
        df2 = DataFrame(np.random.randn(5, 3),
                        index=lrange(5), columns=[1, 2, 3])

        with pytest.raises(ValueError, match='aligned'):
            operator.matmul(df, df2)


@pytest.fixture
def df_duplicates():
    return pd.DataFrame({'a': [1, 2, 3, 4, 4],
                         'b': [1, 1, 1, 1, 1],
                         'c': [0, 1, 2, 5, 4]},
                        index=[0, 0, 1, 1, 1])


@pytest.fixture
def df_strings():
    return pd.DataFrame({'a': np.random.permutation(10),
                         'b': list(ascii_lowercase[:10]),
                         'c': np.random.permutation(10).astype('float64')})


@pytest.fixture
def df_main_dtypes():
    return pd.DataFrame(
        {'group': [1, 1, 2],
         'int': [1, 2, 3],
         'float': [4., 5., 6.],
         'string': list('abc'),
         'category_string': pd.Series(list('abc')).astype('category'),
         'category_int': [7, 8, 9],
         'datetime': pd.date_range('20130101', periods=3),
         'datetimetz': pd.date_range('20130101',
                                     periods=3,
                                     tz='US/Eastern'),
         'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
        columns=['group', 'int', 'float', 'string',
                 'category_string', 'category_int',
                 'datetime', 'datetimetz',
                 'timedelta'])


class TestNLargestNSmallest(object):

    dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot "
                                "use method {method!r} with this dtype")

    # ----------------------------------------------------------------------
    # Top / bottom
    @pytest.mark.parametrize('order', [
        ['a'],
        ['c'],
        ['a', 'b'],
        ['a', 'c'],
        ['b', 'a'],
        ['b', 'c'],
        ['a', 'b', 'c'],
        ['c', 'a', 'b'],
        ['c', 'b', 'a'],
        ['b', 'c', 'a'],
        ['b', 'a', 'c'],

        # dups!
        ['b', 'c', 'c']])
    @pytest.mark.parametrize('n', range(1, 11))
    def test_n(self, df_strings, nselect_method, n, order):
        # GH 10393
        df = df_strings
        if 'b' in order:
            error_msg = self.dtype_error_msg_template.format(
                column='b', method=nselect_method, dtype='object')
            with pytest.raises(TypeError, match=error_msg):
                getattr(df, nselect_method)(n, order)
        else:
            ascending = nselect_method == 'nsmallest'
            result = getattr(df, nselect_method)(n, order)
            expected = df.sort_values(order, ascending=ascending).head(n)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('columns', [
        ['group', 'category_string'], ['group', 'string']])
    def test_n_error(self, df_main_dtypes, nselect_method, columns):
        df = df_main_dtypes
        col = columns[1]
        error_msg = self.dtype_error_msg_template.format(
            column=col, method=nselect_method, dtype=df[col].dtype)
        # escape some characters that may be in the repr
        error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
                     .replace("[", "\\[").replace("]", "\\]"))
        with pytest.raises(TypeError, match=error_msg):
            getattr(df, nselect_method)(2, columns)

    def test_n_all_dtypes(self, df_main_dtypes):
        df = df_main_dtypes
        df.nsmallest(2, list(set(df) - {'category_string', 'string'}))
        df.nlargest(2, list(set(df) - {'category_string', 'string'}))

    @pytest.mark.parametrize('method,expected', [
        ('nlargest',
         pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]},
                      index=[2, 1, 0, 3])),
        ('nsmallest',
         pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]},
                      index=[5, 4, 3, 0]))])
    def test_duplicates_on_starter_columns(self, method, expected):
        # regression test for GH 22752
        df = pd.DataFrame({
            'a': [2, 2, 2, 1, 1, 1],
            'b': [1, 2, 3, 3, 2, 1]
        })

        result = getattr(df, method)(4, columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)

    def test_n_identical_values(self):
        # GH 15297
        df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})

        result = df.nlargest(3, 'a')
        expected = pd.DataFrame(
            {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(3, 'a')
        expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('order', [
        ['a', 'b', 'c'],
        ['c', 'b', 'a'],
        ['a'],
        ['b'],
        ['a', 'b'],
        ['c', 'b']])
    @pytest.mark.parametrize('n', range(1, 6))
    def test_n_duplicate_index(self, df_duplicates, n, order):
        # GH 13412
        df = df_duplicates
        result = df.nsmallest(n, order)
        expected = df.sort_values(order).head(n)
        tm.assert_frame_equal(result, expected)

        result = df.nlargest(n, order)
        expected = df.sort_values(order, ascending=False).head(n)
        tm.assert_frame_equal(result, expected)

    def test_duplicate_keep_all_ties(self):
        # GH 16818
        df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3],
                           'b': [10, 9, 8, 7, 5, 50, 10, 20]})
        result = df.nlargest(4, 'a', keep='all')
        expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3,
                                       5: 3, 6: 3, 7: 3},
                                 'b': {0: 10, 1: 9, 2: 8, 4: 5,
                                       5: 50, 6: 10, 7: 20}})
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(2, 'a', keep='all')
        expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
                                 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}})
        tm.assert_frame_equal(result, expected)
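        # keep='all' retains every row tied with the boundary value, so the
        # result can contain more rows than the requested n.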

    def test_series_broadcasting(self):
        # smoke test for numpy warnings
        # GH 16378, GH 16306
        df = DataFrame([1.0, 1.0, 1.0])
        df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]})
        s = Series([1, 1, 1])
        s_nan = Series([np.nan, np.nan, 1])

        with tm.assert_produces_warning(None):
            with tm.assert_produces_warning(FutureWarning):
                df_nan.clip_lower(s, axis=0)
            for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
                getattr(df, op)(s_nan, axis=0)

    def test_series_nat_conversion(self):
        # GH 18521
        # Check rank does not mutate DataFrame
        df = DataFrame(np.random.randn(10, 3), dtype='float64')
        expected = df.copy()
        df.rank()
        result = df
        tm.assert_frame_equal(result, expected)

    def test_multiindex_column_lookup(self):
        # Check whether tuples are correctly treated as multi-level lookups.
        # GH 23033
        df = pd.DataFrame(
            columns=pd.MultiIndex.from_product([['x'], ['a', 'b']]),
            data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]])

        # nsmallest
        result = df.nsmallest(3, ('x', 'a'))
        expected = df.iloc[[2, 0, 3]]
        tm.assert_frame_equal(result, expected)

        # nlargest
        result = df.nlargest(3, ('x', 'b'))
        expected = df.iloc[[3, 2, 1]]
        tm.assert_frame_equal(result, expected)