mstats_basic.py 91 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936
  1. """
  2. An extension of scipy.stats.stats to support masked arrays
  3. """
  4. # Original author (2007): Pierre GF Gerard-Marchant
  5. # TODO : f_value_wilks_lambda looks botched... what are dfnum & dfden for ?
  6. # TODO : ttest_rel looks botched: what are x1,x2,v1,v2 for ?
  7. # TODO : reimplement ksonesamp
  8. from __future__ import division, print_function, absolute_import
  9. __all__ = ['argstoarray',
  10. 'count_tied_groups',
  11. 'describe',
  12. 'f_oneway', 'find_repeats','friedmanchisquare',
  13. 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
  14. 'ks_twosamp','ks_2samp','kurtosis','kurtosistest',
  15. 'linregress',
  16. 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
  17. 'normaltest',
  18. 'obrientransform',
  19. 'pearsonr','plotting_positions','pointbiserialr',
  20. 'rankdata',
  21. 'scoreatpercentile','sem',
  22. 'sen_seasonal_slopes','skew','skewtest','spearmanr',
  23. 'siegelslopes', 'theilslopes',
  24. 'tmax','tmean','tmin','trim','trimboth',
  25. 'trimtail','trima','trimr','trimmed_mean','trimmed_std',
  26. 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
  27. 'ttest_ind','ttest_rel','tvar',
  28. 'variation',
  29. 'winsorize',
  30. 'brunnermunzel',
  31. ]
  32. import numpy as np
  33. from numpy import ndarray
  34. import numpy.ma as ma
  35. from numpy.ma import masked, nomask
  36. from scipy._lib.six import iteritems
  37. import itertools
  38. import warnings
  39. from collections import namedtuple
  40. from . import distributions
  41. import scipy.special as special
  42. from ._stats_mstats_common import (
  43. _find_repeats,
  44. linregress as stats_linregress,
  45. theilslopes as stats_theilslopes,
  46. siegelslopes as stats_siegelslopes
  47. )
  48. genmissingvaldoc = """
  49. Notes
  50. -----
  51. Missing values are considered pair-wise: if a value is missing in x,
  52. the corresponding value in y is masked.
  53. """
  54. def _chk_asarray(a, axis):
  55. # Always returns a masked array, raveled for axis=None
  56. a = ma.asanyarray(a)
  57. if axis is None:
  58. a = ma.ravel(a)
  59. outaxis = 0
  60. else:
  61. outaxis = axis
  62. return a, outaxis
  63. def _chk2_asarray(a, b, axis):
  64. a = ma.asanyarray(a)
  65. b = ma.asanyarray(b)
  66. if axis is None:
  67. a = ma.ravel(a)
  68. b = ma.ravel(b)
  69. outaxis = 0
  70. else:
  71. outaxis = axis
  72. return a, b, outaxis
  73. def _chk_size(a, b):
  74. a = ma.asanyarray(a)
  75. b = ma.asanyarray(b)
  76. (na, nb) = (a.size, b.size)
  77. if na != nb:
  78. raise ValueError("The size of the input array should match!"
  79. " (%s <> %s)" % (na, nb))
  80. return (a, b, na)
  81. def argstoarray(*args):
  82. """
  83. Constructs a 2D array from a group of sequences.
  84. Sequences are filled with missing values to match the length of the longest
  85. sequence.
  86. Parameters
  87. ----------
  88. args : sequences
  89. Group of sequences.
  90. Returns
  91. -------
  92. argstoarray : MaskedArray
  93. A ( `m` x `n` ) masked array, where `m` is the number of arguments and
  94. `n` the length of the longest argument.
  95. Notes
  96. -----
  97. `numpy.ma.row_stack` has identical behavior, but is called with a sequence
  98. of sequences.
  99. """
  100. if len(args) == 1 and not isinstance(args[0], ndarray):
  101. output = ma.asarray(args[0])
  102. if output.ndim != 2:
  103. raise ValueError("The input should be 2D")
  104. else:
  105. n = len(args)
  106. m = max([len(k) for k in args])
  107. output = ma.array(np.empty((n,m), dtype=float), mask=True)
  108. for (k,v) in enumerate(args):
  109. output[k,:len(v)] = v
  110. output[np.logical_not(np.isfinite(output._data))] = masked
  111. return output
  112. def find_repeats(arr):
  113. """Find repeats in arr and return a tuple (repeats, repeat_count).
  114. The input is cast to float64. Masked values are discarded.
  115. Parameters
  116. ----------
  117. arr : sequence
  118. Input array. The array is flattened if it is not 1D.
  119. Returns
  120. -------
  121. repeats : ndarray
  122. Array of repeated values.
  123. counts : ndarray
  124. Array of counts.
  125. """
  126. # Make sure we get a copy. ma.compressed promises a "new array", but can
  127. # actually return a reference.
  128. compr = np.asarray(ma.compressed(arr), dtype=np.float64)
  129. try:
  130. need_copy = np.may_share_memory(compr, arr)
  131. except AttributeError:
  132. # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
  133. # while in numpy 1.8.2 and above it just (correctly) returns False.
  134. need_copy = False
  135. if need_copy:
  136. compr = compr.copy()
  137. return _find_repeats(compr)
  138. def count_tied_groups(x, use_missing=False):
  139. """
  140. Counts the number of tied values.
  141. Parameters
  142. ----------
  143. x : sequence
  144. Sequence of data on which to counts the ties
  145. use_missing : bool, optional
  146. Whether to consider missing values as tied.
  147. Returns
  148. -------
  149. count_tied_groups : dict
  150. Returns a dictionary (nb of ties: nb of groups).
  151. Examples
  152. --------
  153. >>> from scipy.stats import mstats
  154. >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
  155. >>> mstats.count_tied_groups(z)
  156. {2: 1, 3: 2}
  157. In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).
  158. >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
  159. >>> mstats.count_tied_groups(z)
  160. {2: 2, 3: 1}
  161. >>> z[[1,-1]] = np.ma.masked
  162. >>> mstats.count_tied_groups(z, use_missing=True)
  163. {2: 2, 3: 1}
  164. """
  165. nmasked = ma.getmask(x).sum()
  166. # We need the copy as find_repeats will overwrite the initial data
  167. data = ma.compressed(x).copy()
  168. (ties, counts) = find_repeats(data)
  169. nties = {}
  170. if len(ties):
  171. nties = dict(zip(np.unique(counts), itertools.repeat(1)))
  172. nties.update(dict(zip(*find_repeats(counts))))
  173. if nmasked and use_missing:
  174. try:
  175. nties[nmasked] += 1
  176. except KeyError:
  177. nties[nmasked] = 1
  178. return nties
  179. def rankdata(data, axis=None, use_missing=False):
  180. """Returns the rank (also known as order statistics) of each data point
  181. along the given axis.
  182. If some values are tied, their rank is averaged.
  183. If some values are masked, their rank is set to 0 if use_missing is False,
  184. or set to the average rank of the unmasked values if use_missing is True.
  185. Parameters
  186. ----------
  187. data : sequence
  188. Input data. The data is transformed to a masked array
  189. axis : {None,int}, optional
  190. Axis along which to perform the ranking.
  191. If None, the array is first flattened. An exception is raised if
  192. the axis is specified for arrays with a dimension larger than 2
  193. use_missing : bool, optional
  194. Whether the masked values have a rank of 0 (False) or equal to the
  195. average rank of the unmasked values (True).
  196. """
  197. def _rank1d(data, use_missing=False):
  198. n = data.count()
  199. rk = np.empty(data.size, dtype=float)
  200. idx = data.argsort()
  201. rk[idx[:n]] = np.arange(1,n+1)
  202. if use_missing:
  203. rk[idx[n:]] = (n+1)/2.
  204. else:
  205. rk[idx[n:]] = 0
  206. repeats = find_repeats(data.copy())
  207. for r in repeats[0]:
  208. condition = (data == r).filled(False)
  209. rk[condition] = rk[condition].mean()
  210. return rk
  211. data = ma.array(data, copy=False)
  212. if axis is None:
  213. if data.ndim > 1:
  214. return _rank1d(data.ravel(), use_missing).reshape(data.shape)
  215. else:
  216. return _rank1d(data, use_missing)
  217. else:
  218. return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
  219. ModeResult = namedtuple('ModeResult', ('mode', 'count'))
  220. def mode(a, axis=0):
  221. """
  222. Returns an array of the modal (most common) value in the passed array.
  223. Parameters
  224. ----------
  225. a : array_like
  226. n-dimensional array of which to find mode(s).
  227. axis : int or None, optional
  228. Axis along which to operate. Default is 0. If None, compute over
  229. the whole array `a`.
  230. Returns
  231. -------
  232. mode : ndarray
  233. Array of modal values.
  234. count : ndarray
  235. Array of counts for each mode.
  236. Notes
  237. -----
  238. For more details, see `stats.mode`.
  239. Examples
  240. --------
  241. >>> from scipy import stats
  242. >>> from scipy.stats import mstats
  243. >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
  244. >>> stats.mode(m_arr)
  245. ModeResult(mode=array([0]), count=array([4]))
  246. >>> mstats.mode(m_arr)
  247. ModeResult(mode=array([1.]), count=array([2.]))
  248. """
  249. a, axis = _chk_asarray(a, axis)
  250. def _mode1D(a):
  251. (rep,cnt) = find_repeats(a)
  252. if not cnt.ndim:
  253. return (0, 0)
  254. elif cnt.size:
  255. return (rep[cnt.argmax()], cnt.max())
  256. else:
  257. return (a.min(), 1)
  258. if axis is None:
  259. output = _mode1D(ma.ravel(a))
  260. output = (ma.array(output[0]), ma.array(output[1]))
  261. else:
  262. output = ma.apply_along_axis(_mode1D, axis, a)
  263. newshape = list(a.shape)
  264. newshape[axis] = 1
  265. slices = [slice(None)] * output.ndim
  266. slices[axis] = 0
  267. modes = output[tuple(slices)].reshape(newshape)
  268. slices[axis] = 1
  269. counts = output[tuple(slices)].reshape(newshape)
  270. output = (modes, counts)
  271. return ModeResult(*output)
  272. def _betai(a, b, x):
  273. x = np.asanyarray(x)
  274. x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0
  275. return special.betainc(a, b, x)
  276. def msign(x):
  277. """Returns the sign of x, or 0 if x is masked."""
  278. return ma.filled(np.sign(x), 0)
  279. def pearsonr(x,y):
  280. """
  281. Calculates a Pearson correlation coefficient and the p-value for testing
  282. non-correlation.
  283. The Pearson correlation coefficient measures the linear relationship
  284. between two datasets. Strictly speaking, Pearson's correlation requires
  285. that each dataset be normally distributed. Like other correlation
  286. coefficients, this one varies between -1 and +1 with 0 implying no
  287. correlation. Correlations of -1 or +1 imply an exact linear
  288. relationship. Positive correlations imply that as `x` increases, so does
  289. `y`. Negative correlations imply that as `x` increases, `y` decreases.
  290. The p-value roughly indicates the probability of an uncorrelated system
  291. producing datasets that have a Pearson correlation at least as extreme
  292. as the one computed from these datasets. The p-values are not entirely
  293. reliable but are probably reasonable for datasets larger than 500 or so.
  294. Parameters
  295. ----------
  296. x : 1-D array_like
  297. Input
  298. y : 1-D array_like
  299. Input
  300. Returns
  301. -------
  302. pearsonr : float
  303. Pearson's correlation coefficient, 2-tailed p-value.
  304. References
  305. ----------
  306. http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
  307. """
  308. (x, y, n) = _chk_size(x, y)
  309. (x, y) = (x.ravel(), y.ravel())
  310. # Get the common mask and the total nb of unmasked elements
  311. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  312. n -= m.sum()
  313. df = n-2
  314. if df < 0:
  315. return (masked, masked)
  316. (mx, my) = (x.mean(), y.mean())
  317. (xm, ym) = (x-mx, y-my)
  318. r_num = ma.add.reduce(xm*ym)
  319. r_den = ma.sqrt(ma.dot(xm,xm) * ma.dot(ym,ym))
  320. r = r_num / r_den
  321. # Presumably, if r > 1, then it is only some small artifact of floating
  322. # point arithmetic.
  323. r = min(r, 1.0)
  324. r = max(r, -1.0)
  325. if r is masked or abs(r) == 1.0:
  326. prob = 0.
  327. else:
  328. t_squared = (df / ((1.0 - r) * (1.0 + r))) * r * r
  329. prob = _betai(0.5*df, 0.5, df/(df + t_squared))
  330. return r, prob
  331. SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
  332. def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate'):
  333. """
  334. Calculates a Spearman rank-order correlation coefficient and the p-value
  335. to test for non-correlation.
  336. The Spearman correlation is a nonparametric measure of the linear
  337. relationship between two datasets. Unlike the Pearson correlation, the
  338. Spearman correlation does not assume that both datasets are normally
  339. distributed. Like other correlation coefficients, this one varies
  340. between -1 and +1 with 0 implying no correlation. Correlations of -1 or
  341. +1 imply a monotonic relationship. Positive correlations imply that
  342. as `x` increases, so does `y`. Negative correlations imply that as `x`
  343. increases, `y` decreases.
  344. Missing values are discarded pair-wise: if a value is missing in `x`, the
  345. corresponding value in `y` is masked.
  346. The p-value roughly indicates the probability of an uncorrelated system
  347. producing datasets that have a Spearman correlation at least as extreme
  348. as the one computed from these datasets. The p-values are not entirely
  349. reliable but are probably reasonable for datasets larger than 500 or so.
  350. Parameters
  351. ----------
  352. x, y : 1D or 2D array_like, y is optional
  353. One or two 1-D or 2-D arrays containing multiple variables and
  354. observations. When these are 1-D, each represents a vector of
  355. observations of a single variable. For the behavior in the 2-D case,
  356. see under ``axis``, below.
  357. use_ties : bool, optional
  358. DO NOT USE. Does not do anything, keyword is only left in place for
  359. backwards compatibility reasons.
  360. axis : int or None, optional
  361. If axis=0 (default), then each column represents a variable, with
  362. observations in the rows. If axis=1, the relationship is transposed:
  363. each row represents a variable, while the columns contain observations.
  364. If axis=None, then both arrays will be raveled.
  365. nan_policy : {'propagate', 'raise', 'omit'}, optional
  366. Defines how to handle when input contains nan. 'propagate' returns nan,
  367. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  368. values. Default is 'propagate'.
  369. Returns
  370. -------
  371. correlation : float
  372. Spearman correlation coefficient
  373. pvalue : float
  374. 2-tailed p-value.
  375. References
  376. ----------
  377. [CRCProbStat2000] section 14.7
  378. """
  379. if not use_ties:
  380. raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
  381. # Always returns a masked array, raveled if axis=None
  382. x, axisout = _chk_asarray(x, axis)
  383. if y is not None:
  384. # Deal only with 2-D `x` case.
  385. y, _ = _chk_asarray(y, axis)
  386. if axisout == 0:
  387. x = ma.column_stack((x, y))
  388. else:
  389. x = ma.row_stack((x, y))
  390. if axisout == 1:
  391. # To simplify the code that follow (always use `n_obs, n_vars` shape)
  392. x = x.T
  393. if nan_policy == 'omit':
  394. x = ma.masked_invalid(x)
  395. def _spearmanr_2cols(x):
  396. # Mask the same observations for all variables, and then drop those
  397. # observations (can't leave them masked, rankdata is weird).
  398. x = ma.mask_rowcols(x, axis=0)
  399. x = x[~x.mask.any(axis=1), :]
  400. m = ma.getmask(x)
  401. n_obs = x.shape[0]
  402. dof = n_obs - 2 - int(m.sum(axis=0)[0])
  403. if dof < 0:
  404. raise ValueError("The input must have at least 3 entries!")
  405. # Gets the ranks and rank differences
  406. x_ranked = rankdata(x, axis=0)
  407. rs = ma.corrcoef(x_ranked, rowvar=False).data
  408. # rs can have elements equal to 1, so avoid zero division warnings
  409. olderr = np.seterr(divide='ignore')
  410. try:
  411. # clip the small negative values possibly caused by rounding
  412. # errors before taking the square root
  413. t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
  414. finally:
  415. np.seterr(**olderr)
  416. prob = 2 * distributions.t.sf(np.abs(t), dof)
  417. # For backwards compatibility, return scalars when comparing 2 columns
  418. if rs.shape == (2, 2):
  419. return SpearmanrResult(rs[1, 0], prob[1, 0])
  420. else:
  421. return SpearmanrResult(rs, prob)
  422. # Need to do this per pair of variables, otherwise the dropped observations
  423. # in a third column mess up the result for a pair.
  424. n_vars = x.shape[1]
  425. if n_vars == 2:
  426. return _spearmanr_2cols(x)
  427. else:
  428. rs = np.ones((n_vars, n_vars), dtype=float)
  429. prob = np.zeros((n_vars, n_vars), dtype=float)
  430. for var1 in range(n_vars - 1):
  431. for var2 in range(var1+1, n_vars):
  432. result = _spearmanr_2cols(x[:, [var1, var2]])
  433. rs[var1, var2] = result.correlation
  434. rs[var2, var1] = result.correlation
  435. prob[var1, var2] = result.pvalue
  436. prob[var2, var1] = result.pvalue
  437. return SpearmanrResult(rs, prob)
  438. KendalltauResult = namedtuple('KendalltauResult', ('correlation', 'pvalue'))
  439. def kendalltau(x, y, use_ties=True, use_missing=False, method='auto'):
  440. """
  441. Computes Kendall's rank correlation tau on two variables *x* and *y*.
  442. Parameters
  443. ----------
  444. x : sequence
  445. First data list (for example, time).
  446. y : sequence
  447. Second data list.
  448. use_ties : {True, False}, optional
  449. Whether ties correction should be performed.
  450. use_missing : {False, True}, optional
  451. Whether missing data should be allocated a rank of 0 (False) or the
  452. average rank (True)
  453. method: {'auto', 'asymptotic', 'exact'}, optional
  454. Defines which method is used to calculate the p-value [1]_.
  455. 'asymptotic' uses a normal approximation valid for large samples.
  456. 'exact' computes the exact p-value, but can only be used if no ties
  457. are present. 'auto' is the default and selects the appropriate
  458. method based on a trade-off between speed and accuracy.
  459. Returns
  460. -------
  461. correlation : float
  462. Kendall tau
  463. pvalue : float
  464. Approximate 2-side p-value.
  465. References
  466. ----------
  467. .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
  468. Charles Griffin & Co., 1970.
  469. """
  470. (x, y, n) = _chk_size(x, y)
  471. (x, y) = (x.flatten(), y.flatten())
  472. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  473. if m is not nomask:
  474. x = ma.array(x, mask=m, copy=True)
  475. y = ma.array(y, mask=m, copy=True)
  476. # need int() here, otherwise numpy defaults to 32 bit
  477. # integer on all Windows architectures, causing overflow.
  478. # int() will keep it infinite precision.
  479. n -= int(m.sum())
  480. if n < 2:
  481. return KendalltauResult(np.nan, np.nan)
  482. rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
  483. ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
  484. idx = rx.argsort()
  485. (rx, ry) = (rx[idx], ry[idx])
  486. C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
  487. for i in range(len(ry)-1)], dtype=float)
  488. D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
  489. for i in range(len(ry)-1)], dtype=float)
  490. xties = count_tied_groups(x)
  491. yties = count_tied_groups(y)
  492. if use_ties:
  493. corr_x = np.sum([v*k*(k-1) for (k,v) in iteritems(xties)], dtype=float)
  494. corr_y = np.sum([v*k*(k-1) for (k,v) in iteritems(yties)], dtype=float)
  495. denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
  496. else:
  497. denom = n*(n-1)/2.
  498. tau = (C-D) / denom
  499. if method == 'exact' and (xties or yties):
  500. raise ValueError("Ties found, exact method cannot be used.")
  501. if method == 'auto':
  502. if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
  503. method = 'exact'
  504. else:
  505. method = 'asymptotic'
  506. if not xties and not yties and method == 'exact':
  507. # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), Charles Griffin & Co., 1970.
  508. c = int(min(C, (n*(n-1))/2-C))
  509. if n <= 0:
  510. raise ValueError
  511. elif c < 0 or 2*c > n*(n-1):
  512. raise ValueError
  513. elif n == 1:
  514. prob = 1.0
  515. elif n == 2:
  516. prob = 1.0
  517. elif c == 0:
  518. prob = 2.0/np.math.factorial(n)
  519. elif c == 1:
  520. prob = 2.0/np.math.factorial(n-1)
  521. else:
  522. old = [0.0]*(c+1)
  523. new = [0.0]*(c+1)
  524. new[0] = 1.0
  525. new[1] = 1.0
  526. for j in range(3,n+1):
  527. old = new[:]
  528. for k in range(1,min(j,c+1)):
  529. new[k] += new[k-1]
  530. for k in range(j,c+1):
  531. new[k] += new[k-1] - old[k-j]
  532. prob = 2.0*sum(new)/np.math.factorial(n)
  533. elif method == 'asymptotic':
  534. var_s = n*(n-1)*(2*n+5)
  535. if use_ties:
  536. var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in iteritems(xties)])
  537. var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in iteritems(yties)])
  538. v1 = np.sum([v*k*(k-1) for (k, v) in iteritems(xties)], dtype=float) *\
  539. np.sum([v*k*(k-1) for (k, v) in iteritems(yties)], dtype=float)
  540. v1 /= 2.*n*(n-1)
  541. if n > 2:
  542. v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in iteritems(xties)],
  543. dtype=float) * \
  544. np.sum([v*k*(k-1)*(k-2) for (k,v) in iteritems(yties)],
  545. dtype=float)
  546. v2 /= 9.*n*(n-1)*(n-2)
  547. else:
  548. v2 = 0
  549. else:
  550. v1 = v2 = 0
  551. var_s /= 18.
  552. var_s += (v1 + v2)
  553. z = (C-D)/np.sqrt(var_s)
  554. prob = special.erfc(abs(z)/np.sqrt(2))
  555. else:
  556. raise ValueError("Unknown method "+str(method)+" specified, please use auto, exact or asymptotic.")
  557. return KendalltauResult(tau, prob)
  558. def kendalltau_seasonal(x):
  559. """
  560. Computes a multivariate Kendall's rank correlation tau, for seasonal data.
  561. Parameters
  562. ----------
  563. x : 2-D ndarray
  564. Array of seasonal data, with seasons in columns.
  565. """
  566. x = ma.array(x, subok=True, copy=False, ndmin=2)
  567. (n,m) = x.shape
  568. n_p = x.count(0)
  569. S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
  570. S_tot = S_szn.sum()
  571. n_tot = x.count()
  572. ties = count_tied_groups(x.compressed())
  573. corr_ties = sum(v*k*(k-1) for (k,v) in iteritems(ties))
  574. denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
  575. R = rankdata(x, axis=0, use_missing=True)
  576. K = ma.empty((m,m), dtype=int)
  577. covmat = ma.empty((m,m), dtype=float)
  578. denom_szn = ma.empty(m, dtype=float)
  579. for j in range(m):
  580. ties_j = count_tied_groups(x[:,j].compressed())
  581. corr_j = sum(v*k*(k-1) for (k,v) in iteritems(ties_j))
  582. cmb = n_p[j]*(n_p[j]-1)
  583. for k in range(j,m,1):
  584. K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
  585. for i in range(n))
  586. covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
  587. n*(n_p[j]+1)*(n_p[k]+1))/3.
  588. K[k,j] = K[j,k]
  589. covmat[k,j] = covmat[j,k]
  590. denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
  591. var_szn = covmat.diagonal()
  592. z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
  593. z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
  594. z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
  595. prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
  596. prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
  597. prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
  598. chi2_tot = (z_szn*z_szn).sum()
  599. chi2_trd = m * z_szn.mean()**2
  600. output = {'seasonal tau': S_szn/denom_szn,
  601. 'global tau': S_tot/denom_tot,
  602. 'global tau (alt)': S_tot/denom_szn.sum(),
  603. 'seasonal p-value': prob_szn,
  604. 'global p-value (indep)': prob_tot_ind,
  605. 'global p-value (dep)': prob_tot_dep,
  606. 'chi2 total': chi2_tot,
  607. 'chi2 trend': chi2_trd,
  608. }
  609. return output
  610. PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
  611. 'pvalue'))
  612. def pointbiserialr(x, y):
  613. """Calculates a point biserial correlation coefficient and its p-value.
  614. Parameters
  615. ----------
  616. x : array_like of bools
  617. Input array.
  618. y : array_like
  619. Input array.
  620. Returns
  621. -------
  622. correlation : float
  623. R value
  624. pvalue : float
  625. 2-tailed p-value
  626. Notes
  627. -----
  628. Missing values are considered pair-wise: if a value is missing in x,
  629. the corresponding value in y is masked.
  630. For more details on `pointbiserialr`, see `stats.pointbiserialr`.
  631. """
  632. x = ma.fix_invalid(x, copy=True).astype(bool)
  633. y = ma.fix_invalid(y, copy=True).astype(float)
  634. # Get rid of the missing data
  635. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  636. if m is not nomask:
  637. unmask = np.logical_not(m)
  638. x = x[unmask]
  639. y = y[unmask]
  640. n = len(x)
  641. # phat is the fraction of x values that are True
  642. phat = x.sum() / float(n)
  643. y0 = y[~x] # y-values where x is False
  644. y1 = y[x] # y-values where x is True
  645. y0m = y0.mean()
  646. y1m = y1.mean()
  647. rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()
  648. df = n-2
  649. t = rpb*ma.sqrt(df/(1.0-rpb**2))
  650. prob = _betai(0.5*df, 0.5, df/(df+t*t))
  651. return PointbiserialrResult(rpb, prob)
  652. LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
  653. 'rvalue', 'pvalue',
  654. 'stderr'))
  655. def linregress(x, y=None):
  656. """
  657. Linear regression calculation
  658. Note that the non-masked version is used, and that this docstring is
  659. replaced by the non-masked docstring + some info on missing data.
  660. """
  661. if y is None:
  662. x = ma.array(x)
  663. if x.shape[0] == 2:
  664. x, y = x
  665. elif x.shape[1] == 2:
  666. x, y = x.T
  667. else:
  668. msg = ("If only `x` is given as input, it has to be of shape "
  669. "(2, N) or (N, 2), provided shape was %s" % str(x.shape))
  670. raise ValueError(msg)
  671. else:
  672. x = ma.array(x)
  673. y = ma.array(y)
  674. x = x.flatten()
  675. y = y.flatten()
  676. m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
  677. if m is not nomask:
  678. x = ma.array(x, mask=m)
  679. y = ma.array(y, mask=m)
  680. if np.any(~m):
  681. slope, intercept, r, prob, sterrest = stats_linregress(x.data[~m],
  682. y.data[~m])
  683. else:
  684. # All data is masked
  685. return None, None, None, None, None
  686. else:
  687. slope, intercept, r, prob, sterrest = stats_linregress(x.data, y.data)
  688. return LinregressResult(slope, intercept, r, prob, sterrest)
  689. if stats_linregress.__doc__:
  690. linregress.__doc__ = stats_linregress.__doc__ + genmissingvaldoc
  691. def theilslopes(y, x=None, alpha=0.95):
  692. r"""
  693. Computes the Theil-Sen estimator for a set of points (x, y).
  694. `theilslopes` implements a method for robust linear regression. It
  695. computes the slope as the median of all slopes between paired values.
  696. Parameters
  697. ----------
  698. y : array_like
  699. Dependent variable.
  700. x : array_like or None, optional
  701. Independent variable. If None, use ``arange(len(y))`` instead.
  702. alpha : float, optional
  703. Confidence degree between 0 and 1. Default is 95% confidence.
  704. Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
  705. interpreted as "find the 90% confidence interval".
  706. Returns
  707. -------
  708. medslope : float
  709. Theil slope.
  710. medintercept : float
  711. Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
  712. lo_slope : float
  713. Lower bound of the confidence interval on `medslope`.
  714. up_slope : float
  715. Upper bound of the confidence interval on `medslope`.
  716. See also
  717. --------
  718. siegelslopes : a similar technique with repeated medians
  719. Notes
  720. -----
  721. For more details on `theilslopes`, see `stats.theilslopes`.
  722. """
  723. y = ma.asarray(y).flatten()
  724. if x is None:
  725. x = ma.arange(len(y), dtype=float)
  726. else:
  727. x = ma.asarray(x).flatten()
  728. if len(x) != len(y):
  729. raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y),len(x)))
  730. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  731. y._mask = x._mask = m
  732. # Disregard any masked elements of x or y
  733. y = y.compressed()
  734. x = x.compressed().astype(float)
  735. # We now have unmasked arrays so can use `stats.theilslopes`
  736. return stats_theilslopes(y, x, alpha=alpha)
  737. def siegelslopes(y, x=None, method="hierarchical"):
  738. r"""
  739. Computes the Siegel estimator for a set of points (x, y).
  740. `siegelslopes` implements a method for robust linear regression
  741. using repeated medians to fit a line to the points (x, y).
  742. The method is robust to outliers with an asymptotic breakdown point
  743. of 50%.
  744. Parameters
  745. ----------
  746. y : array_like
  747. Dependent variable.
  748. x : array_like or None, optional
  749. Independent variable. If None, use ``arange(len(y))`` instead.
  750. method : {'hierarchical', 'separate'}
  751. If 'hierarchical', estimate the intercept using the estimated
  752. slope ``medslope`` (default option).
  753. If 'separate', estimate the intercept independent of the estimated
  754. slope. See Notes for details.
  755. Returns
  756. -------
  757. medslope : float
  758. Estimate of the slope of the regression line.
  759. medintercept : float
  760. Estimate of the intercept of the regression line.
  761. See also
  762. --------
  763. theilslopes : a similar technique without repeated medians
  764. Notes
  765. -----
  766. For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
  767. """
  768. y = ma.asarray(y).ravel()
  769. if x is None:
  770. x = ma.arange(len(y), dtype=float)
  771. else:
  772. x = ma.asarray(x).ravel()
  773. if len(x) != len(y):
  774. raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
  775. m = ma.mask_or(ma.getmask(x), ma.getmask(y))
  776. y._mask = x._mask = m
  777. # Disregard any masked elements of x or y
  778. y = y.compressed()
  779. x = x.compressed().astype(float)
  780. # We now have unmasked arrays so can use `stats.siegelslopes`
  781. return stats_siegelslopes(y, x)
  782. def sen_seasonal_slopes(x):
  783. x = ma.array(x, subok=True, copy=False, ndmin=2)
  784. (n,_) = x.shape
  785. # Get list of slopes per season
  786. szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
  787. for i in range(n)])
  788. szn_medslopes = ma.median(szn_slopes, axis=0)
  789. medslope = ma.median(szn_slopes, axis=None)
  790. return szn_medslopes, medslope
  791. Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
  792. def ttest_1samp(a, popmean, axis=0):
  793. """
  794. Calculates the T-test for the mean of ONE group of scores.
  795. Parameters
  796. ----------
  797. a : array_like
  798. sample observation
  799. popmean : float or array_like
  800. expected value in null hypothesis, if array_like than it must have the
  801. same shape as `a` excluding the axis dimension
  802. axis : int or None, optional
  803. Axis along which to compute test. If None, compute over the whole
  804. array `a`.
  805. Returns
  806. -------
  807. statistic : float or array
  808. t-statistic
  809. pvalue : float or array
  810. two-tailed p-value
  811. Notes
  812. -----
  813. For more details on `ttest_1samp`, see `stats.ttest_1samp`.
  814. """
  815. a, axis = _chk_asarray(a, axis)
  816. if a.size == 0:
  817. return (np.nan, np.nan)
  818. x = a.mean(axis=axis)
  819. v = a.var(axis=axis, ddof=1)
  820. n = a.count(axis=axis)
  821. # force df to be an array for masked division not to throw a warning
  822. df = ma.asanyarray(n - 1.0)
  823. svar = ((n - 1.0) * v) / df
  824. with np.errstate(divide='ignore', invalid='ignore'):
  825. t = (x - popmean) / ma.sqrt(svar / n)
  826. prob = special.betainc(0.5*df, 0.5, df/(df + t*t))
  827. return Ttest_1sampResult(t, prob)
  828. ttest_onesamp = ttest_1samp
  829. Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
  830. def ttest_ind(a, b, axis=0, equal_var=True):
  831. """
  832. Calculates the T-test for the means of TWO INDEPENDENT samples of scores.
  833. Parameters
  834. ----------
  835. a, b : array_like
  836. The arrays must have the same shape, except in the dimension
  837. corresponding to `axis` (the first, by default).
  838. axis : int or None, optional
  839. Axis along which to compute test. If None, compute over the whole
  840. arrays, `a`, and `b`.
  841. equal_var : bool, optional
  842. If True, perform a standard independent 2 sample test that assumes equal
  843. population variances.
  844. If False, perform Welch's t-test, which does not assume equal population
  845. variance.
  846. .. versionadded:: 0.17.0
  847. Returns
  848. -------
  849. statistic : float or array
  850. The calculated t-statistic.
  851. pvalue : float or array
  852. The two-tailed p-value.
  853. Notes
  854. -----
  855. For more details on `ttest_ind`, see `stats.ttest_ind`.
  856. """
  857. a, b, axis = _chk2_asarray(a, b, axis)
  858. if a.size == 0 or b.size == 0:
  859. return Ttest_indResult(np.nan, np.nan)
  860. (x1, x2) = (a.mean(axis), b.mean(axis))
  861. (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
  862. (n1, n2) = (a.count(axis), b.count(axis))
  863. if equal_var:
  864. # force df to be an array for masked division not to throw a warning
  865. df = ma.asanyarray(n1 + n2 - 2.0)
  866. svar = ((n1-1)*v1+(n2-1)*v2) / df
  867. denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # n-D computation here!
  868. else:
  869. vn1 = v1/n1
  870. vn2 = v2/n2
  871. with np.errstate(divide='ignore', invalid='ignore'):
  872. df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
  873. # If df is undefined, variances are zero.
  874. # It doesn't matter what df is as long as it is not NaN.
  875. df = np.where(np.isnan(df), 1, df)
  876. denom = ma.sqrt(vn1 + vn2)
  877. with np.errstate(divide='ignore', invalid='ignore'):
  878. t = (x1-x2) / denom
  879. probs = special.betainc(0.5*df, 0.5, df/(df + t*t)).reshape(t.shape)
  880. return Ttest_indResult(t, probs.squeeze())
  881. Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
  882. def ttest_rel(a, b, axis=0):
  883. """
  884. Calculates the T-test on TWO RELATED samples of scores, a and b.
  885. Parameters
  886. ----------
  887. a, b : array_like
  888. The arrays must have the same shape.
  889. axis : int or None, optional
  890. Axis along which to compute test. If None, compute over the whole
  891. arrays, `a`, and `b`.
  892. Returns
  893. -------
  894. statistic : float or array
  895. t-statistic
  896. pvalue : float or array
  897. two-tailed p-value
  898. Notes
  899. -----
  900. For more details on `ttest_rel`, see `stats.ttest_rel`.
  901. """
  902. a, b, axis = _chk2_asarray(a, b, axis)
  903. if len(a) != len(b):
  904. raise ValueError('unequal length arrays')
  905. if a.size == 0 or b.size == 0:
  906. return Ttest_relResult(np.nan, np.nan)
  907. n = a.count(axis)
  908. df = ma.asanyarray(n-1.0)
  909. d = (a-b).astype('d')
  910. dm = d.mean(axis)
  911. v = d.var(axis=axis, ddof=1)
  912. denom = ma.sqrt(v / n)
  913. with np.errstate(divide='ignore', invalid='ignore'):
  914. t = dm / denom
  915. probs = special.betainc(0.5*df, 0.5, df/(df + t*t)).reshape(t.shape).squeeze()
  916. return Ttest_relResult(t, probs)
  917. MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
  918. 'pvalue'))
  919. def mannwhitneyu(x,y, use_continuity=True):
  920. """
  921. Computes the Mann-Whitney statistic
  922. Missing values in `x` and/or `y` are discarded.
  923. Parameters
  924. ----------
  925. x : sequence
  926. Input
  927. y : sequence
  928. Input
  929. use_continuity : {True, False}, optional
  930. Whether a continuity correction (1/2.) should be taken into account.
  931. Returns
  932. -------
  933. statistic : float
  934. The Mann-Whitney statistics
  935. pvalue : float
  936. Approximate p-value assuming a normal distribution.
  937. """
  938. x = ma.asarray(x).compressed().view(ndarray)
  939. y = ma.asarray(y).compressed().view(ndarray)
  940. ranks = rankdata(np.concatenate([x,y]))
  941. (nx, ny) = (len(x), len(y))
  942. nt = nx + ny
  943. U = ranks[:nx].sum() - nx*(nx+1)/2.
  944. U = max(U, nx*ny - U)
  945. u = nx*ny - U
  946. mu = (nx*ny)/2.
  947. sigsq = (nt**3 - nt)/12.
  948. ties = count_tied_groups(ranks)
  949. sigsq -= sum(v*(k**3-k) for (k,v) in iteritems(ties))/12.
  950. sigsq *= nx*ny/float(nt*(nt-1))
  951. if use_continuity:
  952. z = (U - 1/2. - mu) / ma.sqrt(sigsq)
  953. else:
  954. z = (U - mu) / ma.sqrt(sigsq)
  955. prob = special.erfc(abs(z)/np.sqrt(2))
  956. return MannwhitneyuResult(u, prob)
  957. KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
  958. def kruskal(*args):
  959. """
  960. Compute the Kruskal-Wallis H-test for independent samples
  961. Parameters
  962. ----------
  963. sample1, sample2, ... : array_like
  964. Two or more arrays with the sample measurements can be given as
  965. arguments.
  966. Returns
  967. -------
  968. statistic : float
  969. The Kruskal-Wallis H statistic, corrected for ties
  970. pvalue : float
  971. The p-value for the test using the assumption that H has a chi
  972. square distribution
  973. Notes
  974. -----
  975. For more details on `kruskal`, see `stats.kruskal`.
  976. Examples
  977. --------
  978. >>> from scipy.stats.mstats import kruskal
  979. Random samples from three different brands of batteries were tested
  980. to see how long the charge lasted. Results were as follows:
  981. >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
  982. >>> b = [6.9, 7.0, 6.1, 7.9]
  983. >>> c = [7.2, 6.9, 6.1, 6.5]
  984. Test the hypotesis that the distribution functions for all of the brands'
  985. durations are identical. Use 5% level of significance.
  986. >>> kruskal(a, b, c)
  987. KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)
  988. The null hypothesis is rejected at the 5% level of significance
  989. because the returned p-value is less than the critical value of 5%.
  990. """
  991. output = argstoarray(*args)
  992. ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
  993. sumrk = ranks.sum(-1)
  994. ngrp = ranks.count(-1)
  995. ntot = ranks.count()
  996. H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
  997. # Tie correction
  998. ties = count_tied_groups(ranks)
  999. T = 1. - sum(v*(k**3-k) for (k,v) in iteritems(ties))/float(ntot**3-ntot)
  1000. if T == 0:
  1001. raise ValueError('All numbers are identical in kruskal')
  1002. H /= T
  1003. df = len(output) - 1
  1004. prob = distributions.chi2.sf(H, df)
  1005. return KruskalResult(H, prob)
  1006. kruskalwallis = kruskal
  1007. def ks_twosamp(data1, data2, alternative="two-sided"):
  1008. """
  1009. Computes the Kolmogorov-Smirnov test on two samples.
  1010. Missing values are discarded.
  1011. Parameters
  1012. ----------
  1013. data1 : array_like
  1014. First data set
  1015. data2 : array_like
  1016. Second data set
  1017. alternative : {'two-sided', 'less', 'greater'}, optional
  1018. Indicates the alternative hypothesis. Default is 'two-sided'.
  1019. Returns
  1020. -------
  1021. d : float
  1022. Value of the Kolmogorov Smirnov test
  1023. p : float
  1024. Corresponding p-value.
  1025. """
  1026. (data1, data2) = (ma.asarray(data1), ma.asarray(data2))
  1027. (n1, n2) = (data1.count(), data2.count())
  1028. n = (n1*n2/float(n1+n2))
  1029. mix = ma.concatenate((data1.compressed(), data2.compressed()))
  1030. mixsort = mix.argsort(kind='mergesort')
  1031. csum = np.where(mixsort < n1, 1./n1, -1./n2).cumsum()
  1032. # Check for ties
  1033. if len(np.unique(mix)) < (n1+n2):
  1034. csum = csum[np.r_[np.diff(mix[mixsort]).nonzero()[0],-1]]
  1035. alternative = str(alternative).lower()[0]
  1036. if alternative == 't':
  1037. d = ma.abs(csum).max()
  1038. prob = special.kolmogorov(np.sqrt(n)*d)
  1039. elif alternative == 'l':
  1040. d = -csum.min()
  1041. prob = np.exp(-2*n*d**2)
  1042. elif alternative == 'g':
  1043. d = csum.max()
  1044. prob = np.exp(-2*n*d**2)
  1045. else:
  1046. raise ValueError("Invalid value for the alternative hypothesis: "
  1047. "should be in 'two-sided', 'less' or 'greater'")
  1048. return (d, prob)
  1049. ks_2samp = ks_twosamp
  1050. def trima(a, limits=None, inclusive=(True,True)):
  1051. """
  1052. Trims an array by masking the data outside some given limits.
  1053. Returns a masked version of the input array.
  1054. Parameters
  1055. ----------
  1056. a : array_like
  1057. Input array.
  1058. limits : {None, tuple}, optional
  1059. Tuple of (lower limit, upper limit) in absolute values.
  1060. Values of the input array lower (greater) than the lower (upper) limit
  1061. will be masked. A limit is None indicates an open interval.
  1062. inclusive : (bool, bool) tuple, optional
  1063. Tuple of (lower flag, upper flag), indicating whether values exactly
  1064. equal to the lower (upper) limit are allowed.
  1065. """
  1066. a = ma.asarray(a)
  1067. a.unshare_mask()
  1068. if (limits is None) or (limits == (None, None)):
  1069. return a
  1070. (lower_lim, upper_lim) = limits
  1071. (lower_in, upper_in) = inclusive
  1072. condition = False
  1073. if lower_lim is not None:
  1074. if lower_in:
  1075. condition |= (a < lower_lim)
  1076. else:
  1077. condition |= (a <= lower_lim)
  1078. if upper_lim is not None:
  1079. if upper_in:
  1080. condition |= (a > upper_lim)
  1081. else:
  1082. condition |= (a >= upper_lim)
  1083. a[condition.filled(True)] = masked
  1084. return a
  1085. def trimr(a, limits=None, inclusive=(True, True), axis=None):
  1086. """
  1087. Trims an array by masking some proportion of the data on each end.
  1088. Returns a masked version of the input array.
  1089. Parameters
  1090. ----------
  1091. a : sequence
  1092. Input array.
  1093. limits : {None, tuple}, optional
  1094. Tuple of the percentages to cut on each side of the array, with respect
  1095. to the number of unmasked data, as floats between 0. and 1.
  1096. Noting n the number of unmasked data before trimming, the
  1097. (n*limits[0])th smallest data and the (n*limits[1])th largest data are
  1098. masked, and the total number of unmasked data after trimming is
  1099. n*(1.-sum(limits)). The value of one limit can be set to None to
  1100. indicate an open interval.
  1101. inclusive : {(True,True) tuple}, optional
  1102. Tuple of flags indicating whether the number of data being masked on
  1103. the left (right) end should be truncated (True) or rounded (False) to
  1104. integers.
  1105. axis : {None,int}, optional
  1106. Axis along which to trim. If None, the whole array is trimmed, but its
  1107. shape is maintained.
  1108. """
  1109. def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
  1110. n = a.count()
  1111. idx = a.argsort()
  1112. if low_limit:
  1113. if low_inclusive:
  1114. lowidx = int(low_limit*n)
  1115. else:
  1116. lowidx = np.round(low_limit*n)
  1117. a[idx[:lowidx]] = masked
  1118. if up_limit is not None:
  1119. if up_inclusive:
  1120. upidx = n - int(n*up_limit)
  1121. else:
  1122. upidx = n - np.round(n*up_limit)
  1123. a[idx[upidx:]] = masked
  1124. return a
  1125. a = ma.asarray(a)
  1126. a.unshare_mask()
  1127. if limits is None:
  1128. return a
  1129. # Check the limits
  1130. (lolim, uplim) = limits
  1131. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1132. if lolim is not None:
  1133. if lolim > 1. or lolim < 0:
  1134. raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
  1135. if uplim is not None:
  1136. if uplim > 1. or uplim < 0:
  1137. raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
  1138. (loinc, upinc) = inclusive
  1139. if axis is None:
  1140. shp = a.shape
  1141. return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
  1142. else:
  1143. return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
  1144. trimdoc = """
  1145. Parameters
  1146. ----------
  1147. a : sequence
  1148. Input array
  1149. limits : {None, tuple}, optional
  1150. If `relative` is False, tuple (lower limit, upper limit) in absolute values.
  1151. Values of the input array lower (greater) than the lower (upper) limit are
  1152. masked.
  1153. If `relative` is True, tuple (lower percentage, upper percentage) to cut
  1154. on each side of the array, with respect to the number of unmasked data.
  1155. Noting n the number of unmasked data before trimming, the (n*limits[0])th
  1156. smallest data and the (n*limits[1])th largest data are masked, and the
  1157. total number of unmasked data after trimming is n*(1.-sum(limits))
  1158. In each case, the value of one limit can be set to None to indicate an
  1159. open interval.
  1160. If limits is None, no trimming is performed
  1161. inclusive : {(bool, bool) tuple}, optional
  1162. If `relative` is False, tuple indicating whether values exactly equal
  1163. to the absolute limits are allowed.
  1164. If `relative` is True, tuple indicating whether the number of data
  1165. being masked on each side should be rounded (True) or truncated
  1166. (False).
  1167. relative : bool, optional
  1168. Whether to consider the limits as absolute values (False) or proportions
  1169. to cut (True).
  1170. axis : int, optional
  1171. Axis along which to trim.
  1172. """
  1173. def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
  1174. """
  1175. Trims an array by masking the data outside some given limits.
  1176. Returns a masked version of the input array.
  1177. %s
  1178. Examples
  1179. --------
  1180. >>> from scipy.stats.mstats import trim
  1181. >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
  1182. >>> print(trim(z,(3,8)))
  1183. [-- -- 3 4 5 6 7 8 -- --]
  1184. >>> print(trim(z,(0.1,0.2),relative=True))
  1185. [-- 2 3 4 5 6 7 8 -- --]
  1186. """
  1187. if relative:
  1188. return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
  1189. else:
  1190. return trima(a, limits=limits, inclusive=inclusive)
  1191. if trim.__doc__ is not None:
  1192. trim.__doc__ = trim.__doc__ % trimdoc
  1193. def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
  1194. """
  1195. Trims the smallest and largest data values.
  1196. Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
  1197. ``int(proportiontocut * n)`` largest values of data along the given axis,
  1198. where n is the number of unmasked values before trimming.
  1199. Parameters
  1200. ----------
  1201. data : ndarray
  1202. Data to trim.
  1203. proportiontocut : float, optional
  1204. Percentage of trimming (as a float between 0 and 1).
  1205. If n is the number of unmasked values before trimming, the number of
  1206. values after trimming is ``(1 - 2*proportiontocut) * n``.
  1207. Default is 0.2.
  1208. inclusive : {(bool, bool) tuple}, optional
  1209. Tuple indicating whether the number of data being masked on each side
  1210. should be rounded (True) or truncated (False).
  1211. axis : int, optional
  1212. Axis along which to perform the trimming.
  1213. If None, the input array is first flattened.
  1214. """
  1215. return trimr(data, limits=(proportiontocut,proportiontocut),
  1216. inclusive=inclusive, axis=axis)
  1217. def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
  1218. axis=None):
  1219. """
  1220. Trims the data by masking values from one tail.
  1221. Parameters
  1222. ----------
  1223. data : array_like
  1224. Data to trim.
  1225. proportiontocut : float, optional
  1226. Percentage of trimming. If n is the number of unmasked values
  1227. before trimming, the number of values after trimming is
  1228. ``(1 - proportiontocut) * n``. Default is 0.2.
  1229. tail : {'left','right'}, optional
  1230. If 'left' the `proportiontocut` lowest values will be masked.
  1231. If 'right' the `proportiontocut` highest values will be masked.
  1232. Default is 'left'.
  1233. inclusive : {(bool, bool) tuple}, optional
  1234. Tuple indicating whether the number of data being masked on each side
  1235. should be rounded (True) or truncated (False). Default is
  1236. (True, True).
  1237. axis : int, optional
  1238. Axis along which to perform the trimming.
  1239. If None, the input array is first flattened. Default is None.
  1240. Returns
  1241. -------
  1242. trimtail : ndarray
  1243. Returned array of same shape as `data` with masked tail values.
  1244. """
  1245. tail = str(tail).lower()[0]
  1246. if tail == 'l':
  1247. limits = (proportiontocut,None)
  1248. elif tail == 'r':
  1249. limits = (None, proportiontocut)
  1250. else:
  1251. raise TypeError("The tail argument should be in ('left','right')")
  1252. return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
  1253. trim1 = trimtail
  1254. def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1255. axis=None):
  1256. """Returns the trimmed mean of the data along the given axis.
  1257. %s
  1258. """ % trimdoc
  1259. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1260. limits = (limits, limits)
  1261. if relative:
  1262. return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
  1263. else:
  1264. return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
  1265. def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1266. axis=None, ddof=0):
  1267. """Returns the trimmed variance of the data along the given axis.
  1268. %s
  1269. ddof : {0,integer}, optional
  1270. Means Delta Degrees of Freedom. The denominator used during computations
  1271. is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
  1272. biased estimate of the variance.
  1273. """ % trimdoc
  1274. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1275. limits = (limits, limits)
  1276. if relative:
  1277. out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
  1278. else:
  1279. out = trima(a,limits=limits,inclusive=inclusive)
  1280. return out.var(axis=axis, ddof=ddof)
  1281. def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1282. axis=None, ddof=0):
  1283. """Returns the trimmed standard deviation of the data along the given axis.
  1284. %s
  1285. ddof : {0,integer}, optional
  1286. Means Delta Degrees of Freedom. The denominator used during computations
  1287. is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
  1288. biased estimate of the variance.
  1289. """ % trimdoc
  1290. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1291. limits = (limits, limits)
  1292. if relative:
  1293. out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
  1294. else:
  1295. out = trima(a,limits=limits,inclusive=inclusive)
  1296. return out.std(axis=axis,ddof=ddof)
  1297. def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
  1298. """
  1299. Returns the standard error of the trimmed mean along the given axis.
  1300. Parameters
  1301. ----------
  1302. a : sequence
  1303. Input array
  1304. limits : {(0.1,0.1), tuple of float}, optional
  1305. tuple (lower percentage, upper percentage) to cut on each side of the
  1306. array, with respect to the number of unmasked data.
  1307. If n is the number of unmasked data before trimming, the values
  1308. smaller than ``n * limits[0]`` and the values larger than
  1309. ``n * `limits[1]`` are masked, and the total number of unmasked
  1310. data after trimming is ``n * (1.-sum(limits))``. In each case,
  1311. the value of one limit can be set to None to indicate an open interval.
  1312. If `limits` is None, no trimming is performed.
  1313. inclusive : {(bool, bool) tuple} optional
  1314. Tuple indicating whether the number of data being masked on each side
  1315. should be rounded (True) or truncated (False).
  1316. axis : int, optional
  1317. Axis along which to trim.
  1318. Returns
  1319. -------
  1320. trimmed_stde : scalar or ndarray
  1321. """
  1322. def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
  1323. "Returns the standard error of the trimmed mean for a 1D input data."
  1324. n = a.count()
  1325. idx = a.argsort()
  1326. if low_limit:
  1327. if low_inclusive:
  1328. lowidx = int(low_limit*n)
  1329. else:
  1330. lowidx = np.round(low_limit*n)
  1331. a[idx[:lowidx]] = masked
  1332. if up_limit is not None:
  1333. if up_inclusive:
  1334. upidx = n - int(n*up_limit)
  1335. else:
  1336. upidx = n - np.round(n*up_limit)
  1337. a[idx[upidx:]] = masked
  1338. a[idx[:lowidx]] = a[idx[lowidx]]
  1339. a[idx[upidx:]] = a[idx[upidx-1]]
  1340. winstd = a.std(ddof=1)
  1341. return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
  1342. a = ma.array(a, copy=True, subok=True)
  1343. a.unshare_mask()
  1344. if limits is None:
  1345. return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
  1346. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1347. limits = (limits, limits)
  1348. # Check the limits
  1349. (lolim, uplim) = limits
  1350. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1351. if lolim is not None:
  1352. if lolim > 1. or lolim < 0:
  1353. raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
  1354. if uplim is not None:
  1355. if uplim > 1. or uplim < 0:
  1356. raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
  1357. (loinc, upinc) = inclusive
  1358. if (axis is None):
  1359. return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
  1360. else:
  1361. if a.ndim > 2:
  1362. raise ValueError("Array 'a' must be at most two dimensional, but got a.ndim = %d" % a.ndim)
  1363. return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
  1364. lolim,uplim,loinc,upinc)
  1365. def _mask_to_limits(a, limits, inclusive):
  1366. """Mask an array for values outside of given limits.
  1367. This is primarily a utility function.
  1368. Parameters
  1369. ----------
  1370. a : array
  1371. limits : (float or None, float or None)
  1372. A tuple consisting of the (lower limit, upper limit). Values in the
  1373. input array less than the lower limit or greater than the upper limit
  1374. will be masked out. None implies no limit.
  1375. inclusive : (bool, bool)
  1376. A tuple consisting of the (lower flag, upper flag). These flags
  1377. determine whether values exactly equal to lower or upper are allowed.
  1378. Returns
  1379. -------
  1380. A MaskedArray.
  1381. Raises
  1382. ------
  1383. A ValueError if there are no values within the given limits.
  1384. """
  1385. lower_limit, upper_limit = limits
  1386. lower_include, upper_include = inclusive
  1387. am = ma.MaskedArray(a)
  1388. if lower_limit is not None:
  1389. if lower_include:
  1390. am = ma.masked_less(am, lower_limit)
  1391. else:
  1392. am = ma.masked_less_equal(am, lower_limit)
  1393. if upper_limit is not None:
  1394. if upper_include:
  1395. am = ma.masked_greater(am, upper_limit)
  1396. else:
  1397. am = ma.masked_greater_equal(am, upper_limit)
  1398. if am.count() == 0:
  1399. raise ValueError("No array values within given limits")
  1400. return am
  1401. def tmean(a, limits=None, inclusive=(True, True), axis=None):
  1402. """
  1403. Compute the trimmed mean.
  1404. Parameters
  1405. ----------
  1406. a : array_like
  1407. Array of values.
  1408. limits : None or (lower limit, upper limit), optional
  1409. Values in the input array less than the lower limit or greater than the
  1410. upper limit will be ignored. When limits is None (default), then all
  1411. values are used. Either of the limit values in the tuple can also be
  1412. None representing a half-open interval.
  1413. inclusive : (bool, bool), optional
  1414. A tuple consisting of the (lower flag, upper flag). These flags
  1415. determine whether values exactly equal to the lower or upper limits
  1416. are included. The default value is (True, True).
  1417. axis : int or None, optional
  1418. Axis along which to operate. If None, compute over the
  1419. whole array. Default is None.
  1420. Returns
  1421. -------
  1422. tmean : float
  1423. Notes
  1424. -----
  1425. For more details on `tmean`, see `stats.tmean`.
  1426. Examples
  1427. --------
  1428. >>> from scipy.stats import mstats
  1429. >>> a = np.array([[6, 8, 3, 0],
  1430. ... [3, 9, 1, 2],
  1431. ... [8, 7, 8, 2],
  1432. ... [5, 6, 0, 2],
  1433. ... [4, 5, 5, 2]])
  1434. ...
  1435. ...
  1436. >>> mstats.tmean(a, (2,5))
  1437. 3.3
  1438. >>> mstats.tmean(a, (2,5), axis=0)
  1439. masked_array(data=[4.0, 5.0, 4.0, 2.0],
  1440. mask=[False, False, False, False],
  1441. fill_value=1e+20)
  1442. """
  1443. return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
  1444. def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
  1445. """
  1446. Compute the trimmed variance
  1447. This function computes the sample variance of an array of values,
  1448. while ignoring values which are outside of given `limits`.
  1449. Parameters
  1450. ----------
  1451. a : array_like
  1452. Array of values.
  1453. limits : None or (lower limit, upper limit), optional
  1454. Values in the input array less than the lower limit or greater than the
  1455. upper limit will be ignored. When limits is None, then all values are
  1456. used. Either of the limit values in the tuple can also be None
  1457. representing a half-open interval. The default value is None.
  1458. inclusive : (bool, bool), optional
  1459. A tuple consisting of the (lower flag, upper flag). These flags
  1460. determine whether values exactly equal to the lower or upper limits
  1461. are included. The default value is (True, True).
  1462. axis : int or None, optional
  1463. Axis along which to operate. If None, compute over the
  1464. whole array. Default is zero.
  1465. ddof : int, optional
  1466. Delta degrees of freedom. Default is 1.
  1467. Returns
  1468. -------
  1469. tvar : float
  1470. Trimmed variance.
  1471. Notes
  1472. -----
  1473. For more details on `tvar`, see `stats.tvar`.
  1474. """
  1475. a = a.astype(float).ravel()
  1476. if limits is None:
  1477. n = (~a.mask).sum() # todo: better way to do that?
  1478. return np.ma.var(a) * n/(n-1.)
  1479. am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
  1480. return np.ma.var(am, axis=axis, ddof=ddof)
  1481. def tmin(a, lowerlimit=None, axis=0, inclusive=True):
  1482. """
  1483. Compute the trimmed minimum
  1484. Parameters
  1485. ----------
  1486. a : array_like
  1487. array of values
  1488. lowerlimit : None or float, optional
  1489. Values in the input array less than the given limit will be ignored.
  1490. When lowerlimit is None, then all values are used. The default value
  1491. is None.
  1492. axis : int or None, optional
  1493. Axis along which to operate. Default is 0. If None, compute over the
  1494. whole array `a`.
  1495. inclusive : {True, False}, optional
  1496. This flag determines whether values exactly equal to the lower limit
  1497. are included. The default value is True.
  1498. Returns
  1499. -------
  1500. tmin : float, int or ndarray
  1501. Notes
  1502. -----
  1503. For more details on `tmin`, see `stats.tmin`.
  1504. Examples
  1505. --------
  1506. >>> from scipy.stats import mstats
  1507. >>> a = np.array([[6, 8, 3, 0],
  1508. ... [3, 2, 1, 2],
  1509. ... [8, 1, 8, 2],
  1510. ... [5, 3, 0, 2],
  1511. ... [4, 7, 5, 2]])
  1512. ...
  1513. >>> mstats.tmin(a, 5)
  1514. masked_array(data=[5, 7, 5, --],
  1515. mask=[False, False, False, True],
  1516. fill_value=999999)
  1517. """
  1518. a, axis = _chk_asarray(a, axis)
  1519. am = trima(a, (lowerlimit, None), (inclusive, False))
  1520. return ma.minimum.reduce(am, axis)
  1521. def tmax(a, upperlimit=None, axis=0, inclusive=True):
  1522. """
  1523. Compute the trimmed maximum
  1524. This function computes the maximum value of an array along a given axis,
  1525. while ignoring values larger than a specified upper limit.
  1526. Parameters
  1527. ----------
  1528. a : array_like
  1529. array of values
  1530. upperlimit : None or float, optional
  1531. Values in the input array greater than the given limit will be ignored.
  1532. When upperlimit is None, then all values are used. The default value
  1533. is None.
  1534. axis : int or None, optional
  1535. Axis along which to operate. Default is 0. If None, compute over the
  1536. whole array `a`.
  1537. inclusive : {True, False}, optional
  1538. This flag determines whether values exactly equal to the upper limit
  1539. are included. The default value is True.
  1540. Returns
  1541. -------
  1542. tmax : float, int or ndarray
  1543. Notes
  1544. -----
  1545. For more details on `tmax`, see `stats.tmax`.
  1546. Examples
  1547. --------
  1548. >>> from scipy.stats import mstats
  1549. >>> a = np.array([[6, 8, 3, 0],
  1550. ... [3, 9, 1, 2],
  1551. ... [8, 7, 8, 2],
  1552. ... [5, 6, 0, 2],
  1553. ... [4, 5, 5, 2]])
  1554. ...
  1555. ...
  1556. >>> mstats.tmax(a, 4)
  1557. masked_array(data=[4, --, 3, 2],
  1558. mask=[False, True, False, False],
  1559. fill_value=999999)
  1560. """
  1561. a, axis = _chk_asarray(a, axis)
  1562. am = trima(a, (None, upperlimit), (False, inclusive))
  1563. return ma.maximum.reduce(am, axis)
  1564. def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
  1565. """
  1566. Compute the trimmed standard error of the mean.
  1567. This function finds the standard error of the mean for given
  1568. values, ignoring values outside the given `limits`.
  1569. Parameters
  1570. ----------
  1571. a : array_like
  1572. array of values
  1573. limits : None or (lower limit, upper limit), optional
  1574. Values in the input array less than the lower limit or greater than the
  1575. upper limit will be ignored. When limits is None, then all values are
  1576. used. Either of the limit values in the tuple can also be None
  1577. representing a half-open interval. The default value is None.
  1578. inclusive : (bool, bool), optional
  1579. A tuple consisting of the (lower flag, upper flag). These flags
  1580. determine whether values exactly equal to the lower or upper limits
  1581. are included. The default value is (True, True).
  1582. axis : int or None, optional
  1583. Axis along which to operate. If None, compute over the
  1584. whole array. Default is zero.
  1585. ddof : int, optional
  1586. Delta degrees of freedom. Default is 1.
  1587. Returns
  1588. -------
  1589. tsem : float
  1590. Notes
  1591. -----
  1592. For more details on `tsem`, see `stats.tsem`.
  1593. """
  1594. a = ma.asarray(a).ravel()
  1595. if limits is None:
  1596. n = float(a.count())
  1597. return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
  1598. am = trima(a.ravel(), limits, inclusive)
  1599. sd = np.sqrt(am.var(axis=axis, ddof=ddof))
  1600. return sd / np.sqrt(am.count())
  1601. def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
  1602. axis=None):
  1603. """Returns a Winsorized version of the input array.
  1604. The (limits[0])th lowest values are set to the (limits[0])th percentile,
  1605. and the (limits[1])th highest values are set to the (1 - limits[1])th
  1606. percentile.
  1607. Masked values are skipped.
  1608. Parameters
  1609. ----------
  1610. a : sequence
  1611. Input array.
  1612. limits : {None, tuple of float}, optional
  1613. Tuple of the percentages to cut on each side of the array, with respect
  1614. to the number of unmasked data, as floats between 0. and 1.
  1615. Noting n the number of unmasked data before trimming, the
  1616. (n*limits[0])th smallest data and the (n*limits[1])th largest data are
  1617. masked, and the total number of unmasked data after trimming
  1618. is n*(1.-sum(limits)) The value of one limit can be set to None to
  1619. indicate an open interval.
  1620. inclusive : {(True, True) tuple}, optional
  1621. Tuple indicating whether the number of data being masked on each side
  1622. should be truncated (True) or rounded (False).
  1623. inplace : {False, True}, optional
  1624. Whether to winsorize in place (True) or to use a copy (False)
  1625. axis : {None, int}, optional
  1626. Axis along which to trim. If None, the whole array is trimmed, but its
  1627. shape is maintained.
  1628. Notes
  1629. -----
  1630. This function is applied to reduce the effect of possibly spurious outliers
  1631. by limiting the extreme values.
  1632. """
  1633. def _winsorize1D(a, low_limit, up_limit, low_include, up_include):
  1634. n = a.count()
  1635. idx = a.argsort()
  1636. if low_limit:
  1637. if low_include:
  1638. lowidx = int(low_limit * n)
  1639. else:
  1640. lowidx = np.round(low_limit * n).astype(int)
  1641. a[idx[:lowidx]] = a[idx[lowidx]]
  1642. if up_limit is not None:
  1643. if up_include:
  1644. upidx = n - int(n * up_limit)
  1645. else:
  1646. upidx = n - np.round(n * up_limit).astype(int)
  1647. a[idx[upidx:]] = a[idx[upidx - 1]]
  1648. return a
  1649. # We are going to modify a: better make a copy
  1650. a = ma.array(a, copy=np.logical_not(inplace))
  1651. if limits is None:
  1652. return a
  1653. if (not isinstance(limits, tuple)) and isinstance(limits, float):
  1654. limits = (limits, limits)
  1655. # Check the limits
  1656. (lolim, uplim) = limits
  1657. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1658. if lolim is not None:
  1659. if lolim > 1. or lolim < 0:
  1660. raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
  1661. if uplim is not None:
  1662. if uplim > 1. or uplim < 0:
  1663. raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
  1664. (loinc, upinc) = inclusive
  1665. if axis is None:
  1666. shp = a.shape
  1667. return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc).reshape(shp)
  1668. else:
  1669. return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
  1670. upinc)
  1671. def moment(a, moment=1, axis=0):
  1672. """
  1673. Calculates the nth moment about the mean for a sample.
  1674. Parameters
  1675. ----------
  1676. a : array_like
  1677. data
  1678. moment : int, optional
  1679. order of central moment that is returned
  1680. axis : int or None, optional
  1681. Axis along which the central moment is computed. Default is 0.
  1682. If None, compute over the whole array `a`.
  1683. Returns
  1684. -------
  1685. n-th central moment : ndarray or float
  1686. The appropriate moment along the given axis or over all values if axis
  1687. is None. The denominator for the moment calculation is the number of
  1688. observations, no degrees of freedom correction is done.
  1689. Notes
  1690. -----
  1691. For more details about `moment`, see `stats.moment`.
  1692. """
  1693. a, axis = _chk_asarray(a, axis)
  1694. if moment == 1:
  1695. # By definition the first moment about the mean is 0.
  1696. shape = list(a.shape)
  1697. del shape[axis]
  1698. if shape:
  1699. # return an actual array of the appropriate shape
  1700. return np.zeros(shape, dtype=float)
  1701. else:
  1702. # the input was 1D, so return a scalar instead of a rank-0 array
  1703. return np.float64(0.0)
  1704. else:
  1705. # Exponentiation by squares: form exponent sequence
  1706. n_list = [moment]
  1707. current_n = moment
  1708. while current_n > 2:
  1709. if current_n % 2:
  1710. current_n = (current_n-1)/2
  1711. else:
  1712. current_n /= 2
  1713. n_list.append(current_n)
  1714. # Starting point for exponentiation by squares
  1715. a_zero_mean = a - ma.expand_dims(a.mean(axis), axis)
  1716. if n_list[-1] == 1:
  1717. s = a_zero_mean.copy()
  1718. else:
  1719. s = a_zero_mean**2
  1720. # Perform multiplications
  1721. for n in n_list[-2::-1]:
  1722. s = s**2
  1723. if n % 2:
  1724. s *= a_zero_mean
  1725. return s.mean(axis)
  1726. def variation(a, axis=0):
  1727. """
  1728. Computes the coefficient of variation, the ratio of the biased standard
  1729. deviation to the mean.
  1730. Parameters
  1731. ----------
  1732. a : array_like
  1733. Input array.
  1734. axis : int or None, optional
  1735. Axis along which to calculate the coefficient of variation. Default
  1736. is 0. If None, compute over the whole array `a`.
  1737. Returns
  1738. -------
  1739. variation : ndarray
  1740. The calculated variation along the requested axis.
  1741. Notes
  1742. -----
  1743. For more details about `variation`, see `stats.variation`.
  1744. """
  1745. a, axis = _chk_asarray(a, axis)
  1746. return a.std(axis)/a.mean(axis)
  1747. def skew(a, axis=0, bias=True):
  1748. """
  1749. Computes the skewness of a data set.
  1750. Parameters
  1751. ----------
  1752. a : ndarray
  1753. data
  1754. axis : int or None, optional
  1755. Axis along which skewness is calculated. Default is 0.
  1756. If None, compute over the whole array `a`.
  1757. bias : bool, optional
  1758. If False, then the calculations are corrected for statistical bias.
  1759. Returns
  1760. -------
  1761. skewness : ndarray
  1762. The skewness of values along an axis, returning 0 where all values are
  1763. equal.
  1764. Notes
  1765. -----
  1766. For more details about `skew`, see `stats.skew`.
  1767. """
  1768. a, axis = _chk_asarray(a,axis)
  1769. n = a.count(axis)
  1770. m2 = moment(a, 2, axis)
  1771. m3 = moment(a, 3, axis)
  1772. olderr = np.seterr(all='ignore')
  1773. try:
  1774. vals = ma.where(m2 == 0, 0, m3 / m2**1.5)
  1775. finally:
  1776. np.seterr(**olderr)
  1777. if not bias:
  1778. can_correct = (n > 2) & (m2 > 0)
  1779. if can_correct.any():
  1780. m2 = np.extract(can_correct, m2)
  1781. m3 = np.extract(can_correct, m3)
  1782. nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
  1783. np.place(vals, can_correct, nval)
  1784. return vals
  1785. def kurtosis(a, axis=0, fisher=True, bias=True):
  1786. """
  1787. Computes the kurtosis (Fisher or Pearson) of a dataset.
  1788. Kurtosis is the fourth central moment divided by the square of the
  1789. variance. If Fisher's definition is used, then 3.0 is subtracted from
  1790. the result to give 0.0 for a normal distribution.
  1791. If bias is False then the kurtosis is calculated using k statistics to
  1792. eliminate bias coming from biased moment estimators
  1793. Use `kurtosistest` to see if result is close enough to normal.
  1794. Parameters
  1795. ----------
  1796. a : array
  1797. data for which the kurtosis is calculated
  1798. axis : int or None, optional
  1799. Axis along which the kurtosis is calculated. Default is 0.
  1800. If None, compute over the whole array `a`.
  1801. fisher : bool, optional
  1802. If True, Fisher's definition is used (normal ==> 0.0). If False,
  1803. Pearson's definition is used (normal ==> 3.0).
  1804. bias : bool, optional
  1805. If False, then the calculations are corrected for statistical bias.
  1806. Returns
  1807. -------
  1808. kurtosis : array
  1809. The kurtosis of values along an axis. If all values are equal,
  1810. return -3 for Fisher's definition and 0 for Pearson's definition.
  1811. Notes
  1812. -----
  1813. For more details about `kurtosis`, see `stats.kurtosis`.
  1814. """
  1815. a, axis = _chk_asarray(a, axis)
  1816. m2 = moment(a, 2, axis)
  1817. m4 = moment(a, 4, axis)
  1818. olderr = np.seterr(all='ignore')
  1819. try:
  1820. vals = ma.where(m2 == 0, 0, m4 / m2**2.0)
  1821. finally:
  1822. np.seterr(**olderr)
  1823. if not bias:
  1824. n = a.count(axis)
  1825. can_correct = (n > 3) & (m2 is not ma.masked and m2 > 0)
  1826. if can_correct.any():
  1827. n = np.extract(can_correct, n)
  1828. m2 = np.extract(can_correct, m2)
  1829. m4 = np.extract(can_correct, m4)
  1830. nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
  1831. np.place(vals, can_correct, nval+3.0)
  1832. if fisher:
  1833. return vals - 3
  1834. else:
  1835. return vals
  1836. DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
  1837. 'variance', 'skewness',
  1838. 'kurtosis'))
  1839. def describe(a, axis=0, ddof=0, bias=True):
  1840. """
  1841. Computes several descriptive statistics of the passed array.
  1842. Parameters
  1843. ----------
  1844. a : array_like
  1845. Data array
  1846. axis : int or None, optional
  1847. Axis along which to calculate statistics. Default 0. If None,
  1848. compute over the whole array `a`.
  1849. ddof : int, optional
  1850. degree of freedom (default 0); note that default ddof is different
  1851. from the same routine in stats.describe
  1852. bias : bool, optional
  1853. If False, then the skewness and kurtosis calculations are corrected for
  1854. statistical bias.
  1855. Returns
  1856. -------
  1857. nobs : int
  1858. (size of the data (discarding missing values)
  1859. minmax : (int, int)
  1860. min, max
  1861. mean : float
  1862. arithmetic mean
  1863. variance : float
  1864. unbiased variance
  1865. skewness : float
  1866. biased skewness
  1867. kurtosis : float
  1868. biased kurtosis
  1869. Examples
  1870. --------
  1871. >>> from scipy.stats.mstats import describe
  1872. >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
  1873. >>> describe(ma)
  1874. DescribeResult(nobs=3, minmax=(masked_array(data=0,
  1875. mask=False,
  1876. fill_value=999999), masked_array(data=2,
  1877. mask=False,
  1878. fill_value=999999)), mean=1.0, variance=0.6666666666666666,
  1879. skewness=masked_array(data=0., mask=False, fill_value=1e+20),
  1880. kurtosis=-1.5)
  1881. """
  1882. a, axis = _chk_asarray(a, axis)
  1883. n = a.count(axis)
  1884. mm = (ma.minimum.reduce(a), ma.maximum.reduce(a))
  1885. m = a.mean(axis)
  1886. v = a.var(axis, ddof=ddof)
  1887. sk = skew(a, axis, bias=bias)
  1888. kurt = kurtosis(a, axis, bias=bias)
  1889. return DescribeResult(n, mm, m, v, sk, kurt)
  1890. def stde_median(data, axis=None):
  1891. """Returns the McKean-Schrader estimate of the standard error of the sample
  1892. median along the given axis. masked values are discarded.
  1893. Parameters
  1894. ----------
  1895. data : ndarray
  1896. Data to trim.
  1897. axis : {None,int}, optional
  1898. Axis along which to perform the trimming.
  1899. If None, the input array is first flattened.
  1900. """
  1901. def _stdemed_1D(data):
  1902. data = np.sort(data.compressed())
  1903. n = len(data)
  1904. z = 2.5758293035489004
  1905. k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
  1906. return ((data[n-k] - data[k-1])/(2.*z))
  1907. data = ma.array(data, copy=False, subok=True)
  1908. if (axis is None):
  1909. return _stdemed_1D(data)
  1910. else:
  1911. if data.ndim > 2:
  1912. raise ValueError("Array 'data' must be at most two dimensional, "
  1913. "but got data.ndim = %d" % data.ndim)
  1914. return ma.apply_along_axis(_stdemed_1D, axis, data)
  1915. SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
  1916. def skewtest(a, axis=0):
  1917. """
  1918. Tests whether the skew is different from the normal distribution.
  1919. Parameters
  1920. ----------
  1921. a : array
  1922. The data to be tested
  1923. axis : int or None, optional
  1924. Axis along which statistics are calculated. Default is 0.
  1925. If None, compute over the whole array `a`.
  1926. Returns
  1927. -------
  1928. statistic : float
  1929. The computed z-score for this test.
  1930. pvalue : float
  1931. a 2-sided p-value for the hypothesis test
  1932. Notes
  1933. -----
  1934. For more details about `skewtest`, see `stats.skewtest`.
  1935. """
  1936. a, axis = _chk_asarray(a, axis)
  1937. if axis is None:
  1938. a = a.ravel()
  1939. axis = 0
  1940. b2 = skew(a,axis)
  1941. n = a.count(axis)
  1942. if np.min(n) < 8:
  1943. raise ValueError(
  1944. "skewtest is not valid with less than 8 samples; %i samples"
  1945. " were given." % np.min(n))
  1946. y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
  1947. beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
  1948. W2 = -1 + ma.sqrt(2*(beta2-1))
  1949. delta = 1/ma.sqrt(0.5*ma.log(W2))
  1950. alpha = ma.sqrt(2.0/(W2-1))
  1951. y = ma.where(y == 0, 1, y)
  1952. Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
  1953. return SkewtestResult(Z, 2 * distributions.norm.sf(np.abs(Z)))
  1954. KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
  1955. def kurtosistest(a, axis=0):
  1956. """
  1957. Tests whether a dataset has normal kurtosis
  1958. Parameters
  1959. ----------
  1960. a : array
  1961. array of the sample data
  1962. axis : int or None, optional
  1963. Axis along which to compute test. Default is 0. If None,
  1964. compute over the whole array `a`.
  1965. Returns
  1966. -------
  1967. statistic : float
  1968. The computed z-score for this test.
  1969. pvalue : float
  1970. The 2-sided p-value for the hypothesis test
  1971. Notes
  1972. -----
  1973. For more details about `kurtosistest`, see `stats.kurtosistest`.
  1974. """
  1975. a, axis = _chk_asarray(a, axis)
  1976. n = a.count(axis=axis)
  1977. if np.min(n) < 5:
  1978. raise ValueError(
  1979. "kurtosistest requires at least 5 observations; %i observations"
  1980. " were given." % np.min(n))
  1981. if np.min(n) < 20:
  1982. warnings.warn(
  1983. "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
  1984. np.min(n))
  1985. b2 = kurtosis(a, axis, fisher=False)
  1986. E = 3.0*(n-1) / (n+1)
  1987. varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
  1988. x = (b2-E)/ma.sqrt(varb2)
  1989. sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
  1990. (n*(n-2)*(n-3)))
  1991. A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
  1992. term1 = 1 - 2./(9.0*A)
  1993. denom = 1 + x*ma.sqrt(2/(A-4.0))
  1994. if np.ma.isMaskedArray(denom):
  1995. # For multi-dimensional array input
  1996. denom[denom == 0.0] = masked
  1997. elif denom == 0.0:
  1998. denom = masked
  1999. term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
  2000. -ma.power(-(1-2.0/A)/denom, 1/3.0))
  2001. Z = (term1 - term2) / np.sqrt(2/(9.0*A))
  2002. return KurtosistestResult(Z, 2 * distributions.norm.sf(np.abs(Z)))
  2003. NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
  2004. def normaltest(a, axis=0):
  2005. """
  2006. Tests whether a sample differs from a normal distribution.
  2007. Parameters
  2008. ----------
  2009. a : array_like
  2010. The array containing the data to be tested.
  2011. axis : int or None, optional
  2012. Axis along which to compute test. Default is 0. If None,
  2013. compute over the whole array `a`.
  2014. Returns
  2015. -------
  2016. statistic : float or array
  2017. ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
  2018. ``k`` is the z-score returned by `kurtosistest`.
  2019. pvalue : float or array
  2020. A 2-sided chi squared probability for the hypothesis test.
  2021. Notes
  2022. -----
  2023. For more details about `normaltest`, see `stats.normaltest`.
  2024. """
  2025. a, axis = _chk_asarray(a, axis)
  2026. s, _ = skewtest(a, axis)
  2027. k, _ = kurtosistest(a, axis)
  2028. k2 = s*s + k*k
  2029. return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
  2030. def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
  2031. limit=()):
  2032. """
  2033. Computes empirical quantiles for a data array.
  2034. Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
  2035. where ``x[j]`` is the j-th order statistic, and gamma is a function of
  2036. ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
  2037. ``g = n*p + m - j``.
  2038. Reinterpreting the above equations to compare to **R** lead to the
  2039. equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
  2040. Typical values of (alphap,betap) are:
  2041. - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf
  2042. (**R** type 4)
  2043. - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
  2044. (**R** type 5)
  2045. - (0,0) : ``p(k) = k/(n+1)`` :
  2046. (**R** type 6)
  2047. - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
  2048. (**R** type 7, **R** default)
  2049. - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
  2050. The resulting quantile estimates are approximately median-unbiased
  2051. regardless of the distribution of x.
  2052. (**R** type 8)
  2053. - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
  2054. The resulting quantile estimates are approximately unbiased
  2055. if x is normally distributed
  2056. (**R** type 9)
  2057. - (.4,.4) : approximately quantile unbiased (Cunnane)
  2058. - (.35,.35): APL, used with PWM
  2059. Parameters
  2060. ----------
  2061. a : array_like
  2062. Input data, as a sequence or array of dimension at most 2.
  2063. prob : array_like, optional
  2064. List of quantiles to compute.
  2065. alphap : float, optional
  2066. Plotting positions parameter, default is 0.4.
  2067. betap : float, optional
  2068. Plotting positions parameter, default is 0.4.
  2069. axis : int, optional
  2070. Axis along which to perform the trimming.
  2071. If None (default), the input array is first flattened.
  2072. limit : tuple, optional
  2073. Tuple of (lower, upper) values.
  2074. Values of `a` outside this open interval are ignored.
  2075. Returns
  2076. -------
  2077. mquantiles : MaskedArray
  2078. An array containing the calculated quantiles.
  2079. Notes
  2080. -----
  2081. This formulation is very similar to **R** except the calculation of
  2082. ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
  2083. with each type.
  2084. References
  2085. ----------
  2086. .. [1] *R* statistical software: http://www.r-project.org/
  2087. .. [2] *R* ``quantile`` function:
  2088. http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
  2089. Examples
  2090. --------
  2091. >>> from scipy.stats.mstats import mquantiles
  2092. >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
  2093. >>> mquantiles(a)
  2094. array([ 19.2, 40. , 42.8])
  2095. Using a 2D array, specifying axis and limit.
  2096. >>> data = np.array([[ 6., 7., 1.],
  2097. ... [ 47., 15., 2.],
  2098. ... [ 49., 36., 3.],
  2099. ... [ 15., 39., 4.],
  2100. ... [ 42., 40., -999.],
  2101. ... [ 41., 41., -999.],
  2102. ... [ 7., -999., -999.],
  2103. ... [ 39., -999., -999.],
  2104. ... [ 43., -999., -999.],
  2105. ... [ 40., -999., -999.],
  2106. ... [ 36., -999., -999.]])
  2107. >>> print(mquantiles(data, axis=0, limit=(0, 50)))
  2108. [[19.2 14.6 1.45]
  2109. [40. 37.5 2.5 ]
  2110. [42.8 40.05 3.55]]
  2111. >>> data[:, 2] = -999.
  2112. >>> print(mquantiles(data, axis=0, limit=(0, 50)))
  2113. [[19.200000000000003 14.6 --]
  2114. [40.0 37.5 --]
  2115. [42.800000000000004 40.05 --]]
  2116. """
  2117. def _quantiles1D(data,m,p):
  2118. x = np.sort(data.compressed())
  2119. n = len(x)
  2120. if n == 0:
  2121. return ma.array(np.empty(len(p), dtype=float), mask=True)
  2122. elif n == 1:
  2123. return ma.array(np.resize(x, p.shape), mask=nomask)
  2124. aleph = (n*p + m)
  2125. k = np.floor(aleph.clip(1, n-1)).astype(int)
  2126. gamma = (aleph-k).clip(0,1)
  2127. return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
  2128. data = ma.array(a, copy=False)
  2129. if data.ndim > 2:
  2130. raise TypeError("Array should be 2D at most !")
  2131. if limit:
  2132. condition = (limit[0] < data) & (data < limit[1])
  2133. data[~condition.filled(True)] = masked
  2134. p = np.array(prob, copy=False, ndmin=1)
  2135. m = alphap + p*(1.-alphap-betap)
  2136. # Computes quantiles along axis (or globally)
  2137. if (axis is None):
  2138. return _quantiles1D(data, m, p)
  2139. return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
  2140. def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
  2141. """Calculate the score at the given 'per' percentile of the
  2142. sequence a. For example, the score at per=50 is the median.
  2143. This function is a shortcut to mquantile
  2144. """
  2145. if (per < 0) or (per > 100.):
  2146. raise ValueError("The percentile should be between 0. and 100. !"
  2147. " (got %s)" % per)
  2148. return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
  2149. limit=limit, axis=0).squeeze()
  2150. def plotting_positions(data, alpha=0.4, beta=0.4):
  2151. """
  2152. Returns plotting positions (or empirical percentile points) for the data.
  2153. Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
  2154. - i is the rank order statistics
  2155. - n is the number of unmasked values along the given axis
  2156. - `alpha` and `beta` are two parameters.
  2157. Typical values for `alpha` and `beta` are:
  2158. - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
  2159. - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function
  2160. (R, type 5)
  2161. - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6)
  2162. - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case,
  2163. ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
  2164. - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
  2165. ``p(k) ~ median[F(x[k])]``.
  2166. The resulting quantile estimates are approximately median-unbiased
  2167. regardless of the distribution of x. (R type 8)
  2168. - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
  2169. The resulting quantile estimates are approximately unbiased
  2170. if x is normally distributed (R type 9)
  2171. - (.4,.4) : approximately quantile unbiased (Cunnane)
  2172. - (.35,.35): APL, used with PWM
  2173. - (.3175, .3175): used in scipy.stats.probplot
  2174. Parameters
  2175. ----------
  2176. data : array_like
  2177. Input data, as a sequence or array of dimension at most 2.
  2178. alpha : float, optional
  2179. Plotting positions parameter. Default is 0.4.
  2180. beta : float, optional
  2181. Plotting positions parameter. Default is 0.4.
  2182. Returns
  2183. -------
  2184. positions : MaskedArray
  2185. The calculated plotting positions.
  2186. """
  2187. data = ma.array(data, copy=False).reshape(1,-1)
  2188. n = data.count()
  2189. plpos = np.empty(data.size, dtype=float)
  2190. plpos[n:] = 0
  2191. plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
  2192. (n + 1.0 - alpha - beta))
  2193. return ma.array(plpos, mask=data._mask)
  2194. meppf = plotting_positions
  2195. def obrientransform(*args):
  2196. """
  2197. Computes a transform on input data (any number of columns). Used to
  2198. test for homogeneity of variance prior to running one-way stats. Each
  2199. array in ``*args`` is one level of a factor. If an `f_oneway()` run on
  2200. the transformed data and found significant, variances are unequal. From
  2201. Maxwell and Delaney, p.112.
  2202. Returns: transformed data for use in an ANOVA
  2203. """
  2204. data = argstoarray(*args).T
  2205. v = data.var(axis=0,ddof=1)
  2206. m = data.mean(0)
  2207. n = data.count(0).astype(float)
  2208. # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
  2209. data -= m
  2210. data **= 2
  2211. data *= (n-1.5)*n
  2212. data -= 0.5*v*(n-1)
  2213. data /= (n-1.)*(n-2.)
  2214. if not ma.allclose(v,data.mean(0)):
  2215. raise ValueError("Lack of convergence in obrientransform.")
  2216. return data
  2217. def sem(a, axis=0, ddof=1):
  2218. """
  2219. Calculates the standard error of the mean of the input array.
  2220. Also sometimes called standard error of measurement.
  2221. Parameters
  2222. ----------
  2223. a : array_like
  2224. An array containing the values for which the standard error is
  2225. returned.
  2226. axis : int or None, optional
  2227. If axis is None, ravel `a` first. If axis is an integer, this will be
  2228. the axis over which to operate. Defaults to 0.
  2229. ddof : int, optional
  2230. Delta degrees-of-freedom. How many degrees of freedom to adjust
  2231. for bias in limited samples relative to the population estimate
  2232. of variance. Defaults to 1.
  2233. Returns
  2234. -------
  2235. s : ndarray or float
  2236. The standard error of the mean in the sample(s), along the input axis.
  2237. Notes
  2238. -----
  2239. The default value for `ddof` changed in scipy 0.15.0 to be consistent with
  2240. `stats.sem` as well as with the most common definition used (like in the R
  2241. documentation).
  2242. Examples
  2243. --------
  2244. Find standard error along the first axis:
  2245. >>> from scipy import stats
  2246. >>> a = np.arange(20).reshape(5,4)
  2247. >>> print(stats.mstats.sem(a))
  2248. [2.8284271247461903 2.8284271247461903 2.8284271247461903
  2249. 2.8284271247461903]
  2250. Find standard error across the whole array, using n degrees of freedom:
  2251. >>> print(stats.mstats.sem(a, axis=None, ddof=0))
  2252. 1.2893796958227628
  2253. """
  2254. a, axis = _chk_asarray(a, axis)
  2255. n = a.count(axis=axis)
  2256. s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
  2257. return s
  2258. F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
  2259. def f_oneway(*args):
  2260. """
  2261. Performs a 1-way ANOVA, returning an F-value and probability given
  2262. any number of groups. From Heiman, pp.394-7.
  2263. Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
  2264. one per treatment group.
  2265. Returns
  2266. -------
  2267. statistic : float
  2268. The computed F-value of the test.
  2269. pvalue : float
  2270. The associated p-value from the F-distribution.
  2271. """
  2272. # Construct a single array of arguments: each row is a group
  2273. data = argstoarray(*args)
  2274. ngroups = len(data)
  2275. ntot = data.count()
  2276. sstot = (data**2).sum() - (data.sum())**2/float(ntot)
  2277. ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
  2278. sswg = sstot-ssbg
  2279. dfbg = ngroups-1
  2280. dfwg = ntot - ngroups
  2281. msb = ssbg/float(dfbg)
  2282. msw = sswg/float(dfwg)
  2283. f = msb/msw
  2284. prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf
  2285. return F_onewayResult(f, prob)
  2286. FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
  2287. ('statistic', 'pvalue'))
  2288. def friedmanchisquare(*args):
  2289. """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
  2290. This function calculates the Friedman Chi-square test for repeated measures
  2291. and returns the result, along with the associated probability value.
  2292. Each input is considered a given group. Ideally, the number of treatments
  2293. among each group should be equal. If this is not the case, only the first
  2294. n treatments are taken into account, where n is the number of treatments
  2295. of the smallest group.
  2296. If a group has some missing values, the corresponding treatments are masked
  2297. in the other groups.
  2298. The test statistic is corrected for ties.
  2299. Masked values in one group are propagated to the other groups.
  2300. Returns
  2301. -------
  2302. statistic : float
  2303. the test statistic.
  2304. pvalue : float
  2305. the associated p-value.
  2306. """
  2307. data = argstoarray(*args).astype(float)
  2308. k = len(data)
  2309. if k < 3:
  2310. raise ValueError("Less than 3 groups (%i): " % k +
  2311. "the Friedman test is NOT appropriate.")
  2312. ranked = ma.masked_values(rankdata(data, axis=0), 0)
  2313. if ranked._mask is not nomask:
  2314. ranked = ma.mask_cols(ranked)
  2315. ranked = ranked.compressed().reshape(k,-1).view(ndarray)
  2316. else:
  2317. ranked = ranked._data
  2318. (k,n) = ranked.shape
  2319. # Ties correction
  2320. repeats = [find_repeats(row) for row in ranked.T]
  2321. ties = np.array([y for x, y in repeats if x.size > 0])
  2322. tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
  2323. ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
  2324. chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
  2325. return FriedmanchisquareResult(chisq,
  2326. distributions.chi2.sf(chisq, k-1))
  2327. BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))
  2328. def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
  2329. """
  2330. Computes the Brunner-Munzel test on samples x and y
  2331. Missing values in `x` and/or `y` are discarded.
  2332. Parameters
  2333. ----------
  2334. x, y : array_like
  2335. Array of samples, should be one-dimensional.
  2336. alternative : 'less', 'two-sided', or 'greater', optional
  2337. Whether to get the p-value for the one-sided hypothesis ('less'
  2338. or 'greater') or for the two-sided hypothesis ('two-sided').
  2339. Defaults value is 'two-sided' .
  2340. distribution: 't' or 'normal', optional
  2341. Whether to get the p-value by t-distribution or by standard normal
  2342. distribution.
  2343. Defaults value is 't' .
  2344. Returns
  2345. -------
  2346. statistic : float
  2347. The Brunner-Munzer W statistic.
  2348. pvalue : float
  2349. p-value assuming an t distribution. One-sided or
  2350. two-sided, depending on the choice of `alternative` and `distribution`.
  2351. See Also
  2352. --------
  2353. mannwhitneyu : Mann-Whitney rank test on two samples.
  2354. Notes
  2355. -------
  2356. For more details on `brunnermunzel`, see `stats.brunnermunzel`.
  2357. """
  2358. x = ma.asarray(x).compressed().view(ndarray)
  2359. y = ma.asarray(y).compressed().view(ndarray)
  2360. nx = len(x)
  2361. ny = len(y)
  2362. if nx == 0 or ny == 0:
  2363. return BrunnerMunzelResult(np.nan, np.nan)
  2364. nc = nx + ny
  2365. rankc = rankdata(np.concatenate((x,y)))
  2366. rankcx = rankc[0:nx]
  2367. rankcy = rankc[nx:nx+ny]
  2368. rankcx_mean = np.mean(rankcx)
  2369. rankcy_mean = np.mean(rankcy)
  2370. rankx = rankdata(x)
  2371. ranky = rankdata(y)
  2372. rankx_mean = np.mean(rankx)
  2373. ranky_mean = np.mean(ranky)
  2374. Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
  2375. Sx /= nx - 1
  2376. Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
  2377. Sy /= ny - 1
  2378. sigmax = Sx / np.power(nc - nx, 2.0)
  2379. sigmay = Sx / np.power(nc - ny, 2.0)
  2380. wbfn = nx * ny * (rankcy_mean - rankcx_mean)
  2381. wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
  2382. if distribution == "t":
  2383. df_numer = np.power(nx * Sx + ny * Sy, 2.0)
  2384. df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
  2385. df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
  2386. df = df_numer / df_denom
  2387. p = distributions.t.cdf(wbfn, df)
  2388. elif distribution == "normal":
  2389. p = distributions.norm.cdf(wbfn)
  2390. else:
  2391. raise ValueError(
  2392. "distribution should be 't' or 'normal'")
  2393. if alternative == "greater":
  2394. p = p
  2395. elif alternative == "less":
  2396. p = 1 - p
  2397. elif alternative == "two-sided":
  2398. p = 2 * np.min([p, 1-p])
  2399. else:
  2400. raise ValueError(
  2401. "alternative should be 'less', 'greater' or 'two-sided'")
  2402. return BrunnerMunzelResult(wbfn, p)