from __future__ import division, print_function, absolute_import

import math
import warnings
from collections import namedtuple

import numpy as np
from numpy import (isscalar, r_, log, around, unique, asarray,
                   zeros, arange, sort, amin, amax, any, atleast_1d,
                   sqrt, ceil, floor, array, compress,
                   pi, exp, ravel, count_nonzero, sin, cos, arctan2, hypot)

from scipy._lib.six import string_types
from scipy import optimize
from scipy import special
from . import statlib
from . import stats
from .stats import find_repeats, _contains_nan
from .contingency import chi2_contingency
from . import distributions
from ._distn_infrastructure import rv_generic


__all__ = ['mvsdist',
           'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
           'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
           'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
           'fligner', 'mood', 'wilcoxon', 'median_test',
           'circmean', 'circvar', 'circstd', 'anderson_ksamp',
           'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
           'yeojohnson_normplot'
           ]


Mean = namedtuple('Mean', ('statistic', 'minmax'))
Variance = namedtuple('Variance', ('statistic', 'minmax'))
Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))

def bayes_mvs(data, alpha=0.90):
    r"""
    Bayesian confidence intervals for the mean, var, and std.

    Parameters
    ----------
    data : array_like
        Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`.
        Requires 2 or more data points.
    alpha : float, optional
        Probability that the returned confidence interval contains
        the true parameter.

    Returns
    -------
    mean_cntr, var_cntr, std_cntr : tuple
        The three results are for the mean, variance and standard deviation,
        respectively.  Each result is a tuple of the form::

            (center, (lower, upper))

        with `center` the mean of the conditional pdf of the value given the
        data, and `(lower, upper)` a confidence interval, centered on the
        median, containing the estimate to a probability ``alpha``.

    See Also
    --------
    mvsdist

    Notes
    -----
    Each tuple of mean, variance, and standard deviation estimates represents
    ``(center, (lower, upper))``, with ``center`` the mean of the conditional
    pdf of the value given the data and ``(lower, upper)`` a confidence
    interval, centered on the median, containing the estimate to a probability
    ``alpha``.

    Converts data to 1-D and assumes all data has the same mean and variance.
    Uses Jeffrey's prior for variance and std.

    Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    First a basic example to demonstrate the outputs:

    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.bayes_mvs(data)
    >>> mean
    Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
    >>> var
    Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
    >>> std
    Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))

    Now we generate some normally distributed random data, and get estimates of
    mean and standard deviation with 95% confidence intervals for those
    estimates:

    >>> n_samples = 100000
    >>> data = stats.norm.rvs(size=n_samples)
    >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)

    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
    >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
    >>> ax.axvspan(res_mean.minmax[0], res_mean.minmax[1], facecolor='r',
    ...            alpha=0.2, label=r'Estimated mean (95% limits)')
    >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
    >>> ax.axvspan(res_std.minmax[0], res_std.minmax[1], facecolor='g', alpha=0.2,
    ...            label=r'Estimated scale (95% limits)')

    >>> ax.legend(fontsize=10)
    >>> ax.set_xlim([-4, 4])
    >>> ax.set_ylim([0, 0.5])
    >>> plt.show()

    """
    m, v, s = mvsdist(data)
    if alpha >= 1 or alpha <= 0:
        raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
                         % alpha)

    m_res = Mean(m.mean(), m.interval(alpha))
    v_res = Variance(v.mean(), v.interval(alpha))
    s_res = Std_dev(s.mean(), s.interval(alpha))

    return m_res, v_res, s_res

def mvsdist(data):
    """
    'Frozen' distributions for mean, variance, and standard deviation of data.

    Parameters
    ----------
    data : array_like
        Input array.  Converted to 1-D using ravel.
        Requires 2 or more data points.

    Returns
    -------
    mdist : "frozen" distribution object
        Distribution object representing the mean of the data.
    vdist : "frozen" distribution object
        Distribution object representing the variance of the data.
    sdist : "frozen" distribution object
        Distribution object representing the standard deviation of the data.

    See Also
    --------
    bayes_mvs

    Notes
    -----
    The return values from ``bayes_mvs(data)`` are equivalent to
    ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.

    In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
    on the three distribution objects returned from this function will give
    the same results that are returned from `bayes_mvs`.

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.mvsdist(data)

    We now have frozen distribution objects "mean", "var" and "std" that we can
    examine:

    >>> mean.mean()
    9.0
    >>> mean.interval(0.95)
    (6.6120585482655692, 11.387941451734431)
    >>> mean.std()
    1.1952286093343936

    """
    x = ravel(data)
    n = len(x)
    if n < 2:
        raise ValueError("Need at least 2 data-points.")
    xbar = x.mean()
    C = x.var()
    if n > 1000:  # gaussian approximations for large n
        mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
        sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
        vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
    else:
        nm1 = n - 1
        fac = n * C / 2.
        val = nm1 / 2.
        mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
        sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
        vdist = distributions.invgamma(val, scale=fac)
    return mdist, vdist, sdist
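

# Illustrative sketch (not part of the original module): verifies the
# equivalence stated in the `bayes_mvs` and `mvsdist` docstrings, i.e. that
# ``bayes_mvs(data, alpha)`` returns the same numbers as calling ``mean()``
# and ``interval(alpha)`` on the three distributions from ``mvsdist(data)``.
# The helper name `_demo_bayes_mvs_equivalence` is made up for illustration.
def _demo_bayes_mvs_equivalence(data, alpha=0.90):
    """Return True if bayes_mvs matches the mvsdist-based computation."""
    direct = bayes_mvs(data, alpha)
    via_dists = tuple((d.mean(), d.interval(alpha)) for d in mvsdist(data))
    return all(np.allclose(res.statistic, center) and
               np.allclose(res.minmax, interval)
               for res, (center, interval) in zip(direct, via_dists))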

def kstat(data, n=2):
    r"""
    Return the nth k-statistic (1<=n<=4 so far).

    The nth k-statistic k_n is the unique symmetric unbiased estimator of the
    nth cumulant kappa_n.

    Parameters
    ----------
    data : array_like
        Input array.  Note that n-D input gets flattened.
    n : int, {1, 2, 3, 4}, optional
        Default is equal to 2.

    Returns
    -------
    kstat : float
        The nth k-statistic.

    See Also
    --------
    kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
    moment: Returns the n-th central moment about the mean for a sample.

    Notes
    -----
    For a sample size n, the first few k-statistics are given by:

    .. math::

        k_{1} = \mu
        k_{2} = \frac{n}{n-1} m_{2}
        k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
        k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}

    where :math:`\mu` is the sample mean, :math:`m_2` is the sample
    variance, and :math:`m_i` is the i-th sample central moment.

    References
    ----------
    http://mathworld.wolfram.com/k-Statistic.html

    http://mathworld.wolfram.com/Cumulant.html

    Examples
    --------
    >>> from scipy import stats
    >>> rndm = np.random.RandomState(1234)

    As sample size increases, n-th moment and n-th k-statistic converge to the
    same number (although they aren't identical). In the case of the normal
    distribution, they converge to zero.

    >>> for n in [2, 3, 4, 5, 6, 7]:
    ...     x = rndm.normal(size=10**n)
    ...     m, k = stats.moment(x, 3), stats.kstat(x, 3)
    ...     print("%.3g %.3g %.3g" % (m, k, m-k))
    -0.631 -0.651 0.0194
    0.0282 0.0283 -8.49e-05
    -0.0454 -0.0454 1.36e-05
    7.53e-05 7.53e-05 -2.26e-09
    0.00166 0.00166 -4.99e-09
    -2.88e-06 -2.88e-06 8.63e-13

    """
    if n > 4 or n < 1:
        raise ValueError("k-statistics only supported for 1<=n<=4")
    n = int(n)
    S = np.zeros(n + 1, np.float64)
    data = ravel(data)
    N = data.size

    # raise ValueError on empty input
    if N == 0:
        raise ValueError("Data input must not be empty")

    # on nan input, return nan without warning
    if np.isnan(np.sum(data)):
        return np.nan

    for k in range(1, n + 1):
        S[k] = np.sum(data**k, axis=0)

    if n == 1:
        return S[1] * 1.0/N
    elif n == 2:
        return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
    elif n == 3:
        return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
    elif n == 4:
        return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
                 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
                (N*(N-1.0)*(N-2.0)*(N-3.0)))
    else:
        raise ValueError("Should not be here.")
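

# Illustrative sketch (not part of the original module): computes k_2 and k_3
# directly from the central-moment formulas in the `kstat` Notes section,
# k_2 = n/(n-1) * m_2 and k_3 = n^2/((n-1)(n-2)) * m_3, as a cross-check
# against `kstat`. The helper name `_demo_kstat_from_moments` is made up.
def _demo_kstat_from_moments(data):
    """Return (k2, k3) computed from the sample central moments."""
    x = ravel(data)
    n = x.size
    m2 = np.mean((x - x.mean())**2)  # biased sample variance, m_2
    m3 = np.mean((x - x.mean())**3)  # third sample central moment, m_3
    k2 = n / (n - 1.0) * m2
    k3 = n**2 / ((n - 1.0) * (n - 2.0)) * m3
    return k2, k3
# e.g. np.allclose(_demo_kstat_from_moments(x), (kstat(x, 2), kstat(x, 3)))
# should hold for any 1-D sample x with at least three points.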

def kstatvar(data, n=2):
    r"""
    Returns an unbiased estimator of the variance of the k-statistic.

    See `kstat` for more details of the k-statistic.

    Parameters
    ----------
    data : array_like
        Input array.  Note that n-D input gets flattened.
    n : int, {1, 2}, optional
        Default is equal to 2.

    Returns
    -------
    kstatvar : float
        The nth k-statistic variance.

    See Also
    --------
    kstat: Returns the n-th k-statistic.
    moment: Returns the n-th central moment about the mean for a sample.

    Notes
    -----
    The variances of the first few k-statistics are given by:

    .. math::

        var(k_{1}) = \frac{\kappa^2}{n}
        var(k_{2}) = \frac{\kappa^4}{n} + \frac{2\kappa^2_{2}}{n - 1}
        var(k_{3}) = \frac{\kappa^6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
                     \frac{9 \kappa^2_{3}}{n - 1} +
                     \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
        var(k_{4}) = \frac{\kappa^8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
                     \frac{48 \kappa_{3} \kappa_5}{n - 1} +
                     \frac{34 \kappa^2_{4}}{n-1} +
                     \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
                     \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
                     \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}

    """
    data = ravel(data)
    N = len(data)
    if n == 1:
        return kstat(data, n=2) * 1.0/N
    elif n == 2:
        k2 = kstat(data, n=2)
        k4 = kstat(data, n=4)
        return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
    else:
        raise ValueError("Only n=1 or n=2 supported.")

def _calc_uniform_order_statistic_medians(n):
    """
    Approximations of uniform order statistic medians.

    Parameters
    ----------
    n : int
        Sample size.

    Returns
    -------
    v : 1d float array
        Approximations of the order statistic medians.

    References
    ----------
    .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
           Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.

    Examples
    --------
    Order statistics of the uniform distribution on the unit interval
    are marginally distributed according to beta distributions.
    The expectations of these order statistics are evenly spaced across
    the interval, but the distributions are skewed in a way that
    pushes the medians slightly towards the endpoints of the unit interval:

    >>> n = 4
    >>> k = np.arange(1, n+1)
    >>> from scipy.stats import beta
    >>> a = k
    >>> b = n-k+1
    >>> beta.mean(a, b)
    array([ 0.2, 0.4, 0.6, 0.8])
    >>> beta.median(a, b)
    array([ 0.15910358, 0.38572757, 0.61427243, 0.84089642])

    The Filliben approximation uses the exact medians of the smallest
    and greatest order statistics, and the remaining medians are approximated
    by points spread evenly across a sub-interval of the unit interval:

    >>> from scipy.stats.morestats import _calc_uniform_order_statistic_medians
    >>> _calc_uniform_order_statistic_medians(n)
    array([ 0.15910358, 0.38545246, 0.61454754, 0.84089642])

    This plot shows the skewed distributions of the order statistics
    of a sample of size four from a uniform distribution on the unit interval:

    >>> import matplotlib.pyplot as plt
    >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
    >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
    >>> plt.figure()
    >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])

    """
    v = np.zeros(n, dtype=np.float64)
    v[-1] = 0.5**(1.0 / n)
    v[0] = 1 - v[-1]
    i = np.arange(2, n)
    v[1:-1] = (i - 0.3175) / (n + 0.365)
    return v
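

# Illustrative sketch (not part of the original module): compares the
# Filliben approximation above with the exact medians of the uniform order
# statistics, which are Beta(k, n - k + 1) medians, as in the docstring
# example. The helper name `_demo_filliben_vs_exact` is made up.
def _demo_filliben_vs_exact(n):
    """Return (approximate, exact) uniform order statistic medians."""
    k = np.arange(1, n + 1)
    exact = distributions.beta.median(k, n - k + 1)
    approx = _calc_uniform_order_statistic_medians(n)
    return approx, exact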

def _parse_dist_kw(dist, enforce_subclass=True):
    """Parse `dist` keyword.

    Parameters
    ----------
    dist : str or stats.distributions instance.
        Several functions take `dist` as a keyword, hence this utility
        function.
    enforce_subclass : bool, optional
        If True (default), `dist` needs to be a
        `_distn_infrastructure.rv_generic` instance.
        It can sometimes be useful to set this keyword to False, if a function
        wants to accept objects that just look somewhat like such an instance
        (for example, they have a ``ppf`` method).

    """
    if isinstance(dist, rv_generic):
        pass
    elif isinstance(dist, string_types):
        try:
            dist = getattr(distributions, dist)
        except AttributeError:
            raise ValueError("%s is not a valid distribution name" % dist)
    elif enforce_subclass:
        msg = ("`dist` should be a stats.distributions instance or a string "
               "with the name of such a distribution.")
        raise ValueError(msg)

    return dist


def _add_axis_labels_title(plot, xlabel, ylabel, title):
    """Helper function to add axes labels and a title to stats plots"""
    try:
        if hasattr(plot, 'set_title'):
            # Matplotlib Axes instance or something that looks like it
            plot.set_title(title)
            plot.set_xlabel(xlabel)
            plot.set_ylabel(ylabel)
        else:
            # matplotlib.pyplot module
            plot.title(title)
            plot.xlabel(xlabel)
            plot.ylabel(ylabel)
    except Exception:
        # Not an MPL object or something that looks (enough) like it.
        # Don't crash on adding labels or title
        pass

def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
    """
    Calculate quantiles for a probability plot, and optionally show the plot.

    Generates a probability plot of sample data against the quantiles of a
    specified theoretical distribution (the normal distribution by default).
    `probplot` optionally calculates a best-fit line for the data and plots
    the results using Matplotlib or a given plot function.

    Parameters
    ----------
    x : array_like
        Sample/response data from which `probplot` creates the plot.
    sparams : tuple, optional
        Distribution-specific shape parameters (shape parameters plus location
        and scale).
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name.  The default is 'norm' for
        a normal probability plot.  Objects that look enough like a
        stats.distributions instance (i.e. they have a ``ppf`` method) are
        also accepted.
    fit : bool, optional
        Fit a least-squares regression (best-fit) line to the sample data if
        True (default).
    plot : object, optional
        If given, plots the quantiles and least squares fit.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.

    Returns
    -------
    (osm, osr) : tuple of ndarrays
        Tuple of theoretical quantiles (osm, or order statistic medians) and
        ordered responses (osr).  `osr` is simply sorted input `x`.
        For details on how `osm` is calculated see the Notes section.
    (slope, intercept, r) : tuple of floats, optional
        Tuple containing the result of the least-squares fit, if that is
        performed by `probplot`.  `r` is the square root of the coefficient of
        determination.  If ``fit=False`` and ``plot=None``, this tuple is not
        returned.

    Notes
    -----
    Even if `plot` is given, the figure is not shown or saved by `probplot`;
    ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
    calling `probplot`.

    `probplot` generates a probability plot, which should not be confused with
    a Q-Q or a P-P plot.  Statsmodels has more extensive functionality of this
    type, see ``statsmodels.api.ProbPlot``.

    The formula used for the theoretical quantiles (horizontal axis of the
    probability plot) is Filliben's estimate::

        quantiles = dist.ppf(val), for

                0.5**(1/n),                  for i = n
          val = (i - 0.3175) / (n + 0.365),  for i = 2, ..., n-1
                1 - 0.5**(1/n),              for i = 1

    where ``i`` indicates the i-th ordered value and ``n`` is the total number
    of values.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> nsample = 100
    >>> np.random.seed(7654321)

    A t distribution with small degrees of freedom:

    >>> ax1 = plt.subplot(221)
    >>> x = stats.t.rvs(3, size=nsample)
    >>> res = stats.probplot(x, plot=plt)

    A t distribution with larger degrees of freedom:

    >>> ax2 = plt.subplot(222)
    >>> x = stats.t.rvs(25, size=nsample)
    >>> res = stats.probplot(x, plot=plt)

    A mixture of two normal distributions with broadcasting:

    >>> ax3 = plt.subplot(223)
    >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
    ...                    size=(nsample//2,2)).ravel()
    >>> res = stats.probplot(x, plot=plt)

    A standard normal distribution:

    >>> ax4 = plt.subplot(224)
    >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample)
    >>> res = stats.probplot(x, plot=plt)

    Produce a new figure with a loggamma distribution, using the ``dist`` and
    ``sparams`` keywords:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> x = stats.loggamma.rvs(c=2.5, size=500)
    >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
    >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")

    Show the results with Matplotlib:

    >>> plt.show()

    """
    x = np.asarray(x)
    _perform_fit = fit or (plot is not None)
    if x.size == 0:
        if _perform_fit:
            return (x, x), (np.nan, np.nan, 0.0)
        else:
            return x, x

    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
    dist = _parse_dist_kw(dist, enforce_subclass=False)
    if sparams is None:
        sparams = ()
    if isscalar(sparams):
        sparams = (sparams,)
    if not isinstance(sparams, tuple):
        sparams = tuple(sparams)

    osm = dist.ppf(osm_uniform, *sparams)
    osr = sort(x)
    if _perform_fit:
        # perform a linear least squares fit.
        slope, intercept, r, prob, sterrest = stats.linregress(osm, osr)

    if plot is not None:
        plot.plot(osm, osr, 'bo', osm, slope*osm + intercept, 'r-')
        _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
                               ylabel='Ordered Values',
                               title='Probability Plot')

        # Add R^2 value to the plot as text
        if rvalue:
            xmin = amin(osm)
            xmax = amax(osm)
            ymin = amin(x)
            ymax = amax(x)
            posx = xmin + 0.70 * (xmax - xmin)
            posy = ymin + 0.01 * (ymax - ymin)
            plot.text(posx, posy, "$R^2=%1.4f$" % r**2)

    if fit:
        return (osm, osr), (slope, intercept, r)
    else:
        return osm, osr
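

# Illustrative sketch (not part of the original module): reproduces the
# theoretical quantiles returned by `probplot` by applying ``dist.ppf`` to
# Filliben's uniform order statistic medians, as described in the Notes of
# `probplot`.  It assumes a distribution that needs no shape parameters
# (e.g. 'norm').  The helper name `_demo_probplot_quantiles` is made up.
def _demo_probplot_quantiles(x, dist='norm'):
    """Return True if manual Filliben quantiles match probplot's osm/osr."""
    (osm, osr), _ = probplot(x, dist=dist, fit=True)
    d = _parse_dist_kw(dist, enforce_subclass=False)
    manual = d.ppf(_calc_uniform_order_statistic_medians(len(x)))
    return np.allclose(osm, manual) and np.allclose(osr, np.sort(x))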

def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
    """
    Calculate the shape parameter that maximizes the PPCC.

    The probability plot correlation coefficient (PPCC) plot can be used to
    determine the optimal shape parameter for a one-parameter family of
    distributions.  ppcc_max returns the shape parameter that would maximize
    the probability plot correlation coefficient for the given data to a
    one-parameter family of distributions.

    Parameters
    ----------
    x : array_like
        Input array.
    brack : tuple, optional
        Triple (a,b,c) where (a<b<c).  If bracket consists of two numbers
        (a, c) then they are assumed to be a starting interval for a downhill
        bracket search (see `scipy.optimize.brent`).
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name.  Objects that look enough
        like a stats.distributions instance (i.e. they have a ``ppf`` method)
        are also accepted.  The default is ``'tukeylambda'``.

    Returns
    -------
    shape_value : float
        The shape parameter at which the probability plot correlation
        coefficient reaches its max value.

    See also
    --------
    ppcc_plot, probplot, boxcox

    Notes
    -----
    The brack keyword serves as a starting point which is useful in corner
    cases.  One can use a plot to obtain a rough visual estimate of the
    location for the maximum to start the search near it.

    References
    ----------
    .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
           for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
    .. [2] https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm

    Examples
    --------
    First we generate some random data from a Tukey-Lambda distribution,
    with shape parameter -0.7:

    >>> from scipy import stats
    >>> x = stats.tukeylambda.rvs(-0.7, loc=2, scale=0.5, size=10000,
    ...                           random_state=1234567) + 1e4

    Now we explore this data with a PPCC plot as well as the related
    probability plot and Box-Cox normplot.  A red line is drawn where we
    expect the PPCC value to be maximal (at the shape parameter -0.7 used
    above):

    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure(figsize=(8, 6))
    >>> ax = fig.add_subplot(111)
    >>> res = stats.ppcc_plot(x, -5, 5, plot=ax)

    We calculate the value where the shape should reach its maximum and a red
    line is drawn there.  The line should coincide with the highest point in
    the ppcc_plot.

    >>> max = stats.ppcc_max(x)
    >>> ax.vlines(max, 0, 1, colors='r', label='Expected shape value')

    >>> plt.show()

    """
    dist = _parse_dist_kw(dist)
    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
    osr = sort(x)

    # this function computes the x-axis values of the probability plot
    # and computes a linear regression (including the correlation)
    # and returns 1-r so that a minimization function maximizes the
    # correlation
    def tempfunc(shape, mi, yvals, func):
        xvals = func(mi, shape)
        r, prob = stats.pearsonr(xvals, yvals)
        return 1 - r

    return optimize.brent(tempfunc, brack=brack, args=(osm_uniform, osr, dist.ppf))
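

# Illustrative sketch (not part of the original module): evaluates the
# probability plot correlation coefficient at the shape value returned by
# `ppcc_max`, using `probplot` with ``fit=True``; the returned ``r`` should
# be (near-)maximal over the bracketed shape range.  The helper name
# `_demo_ppcc_at_max` is made up.
def _demo_ppcc_at_max(x, dist='tukeylambda'):
    """Return (best_shape, ppcc_at_best_shape) for a one-parameter family."""
    best = ppcc_max(x, dist=dist)
    _, (_, _, r) = probplot(x, sparams=(best,), dist=dist, fit=True)
    return best, r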

def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
    """
    Calculate and optionally plot probability plot correlation coefficient.

    The probability plot correlation coefficient (PPCC) plot can be used to
    determine the optimal shape parameter for a one-parameter family of
    distributions.  It cannot be used for distributions without shape
    parameters (like the normal distribution) or with multiple shape
    parameters.

    By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used.  A
    Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
    distributions via an approximately normal one, and is therefore
    particularly useful in practice.

    Parameters
    ----------
    x : array_like
        Input array.
    a, b : scalar
        Lower and upper bounds of the shape parameter to use.
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name.  Objects that look enough
        like a stats.distributions instance (i.e. they have a ``ppf`` method)
        are also accepted.  The default is ``'tukeylambda'``.
    plot : object, optional
        If given, plots PPCC against the shape parameter.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `a` to `b`).

    Returns
    -------
    svals : ndarray
        The shape values for which `ppcc` was calculated.
    ppcc : ndarray
        The calculated probability plot correlation coefficient values.

    See also
    --------
    ppcc_max, probplot, boxcox_normplot, tukeylambda

    References
    ----------
    J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
    Normality", Technometrics, Vol. 17, pp. 111-117, 1975.

    Examples
    --------
    First we generate some random data from a Tukey-Lambda distribution,
    with shape parameter -0.7:

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> np.random.seed(1234567)
    >>> x = stats.tukeylambda.rvs(-0.7, loc=2, scale=0.5, size=10000) + 1e4

    Now we explore this data with a PPCC plot as well as the related
    probability plot and Box-Cox normplot.  A red line is drawn where we
    expect the PPCC value to be maximal (at the shape parameter -0.7 used
    above):

    >>> fig = plt.figure(figsize=(12, 4))
    >>> ax1 = fig.add_subplot(131)
    >>> ax2 = fig.add_subplot(132)
    >>> ax3 = fig.add_subplot(133)
    >>> res = stats.probplot(x, plot=ax1)
    >>> res = stats.boxcox_normplot(x, -5, 5, plot=ax2)
    >>> res = stats.ppcc_plot(x, -5, 5, plot=ax3)
    >>> ax3.vlines(-0.7, 0, 1, colors='r', label='Expected shape value')
    >>> plt.show()

    """
    if b <= a:
        raise ValueError("`b` has to be larger than `a`.")

    svals = np.linspace(a, b, num=N)
    ppcc = np.empty_like(svals)
    for k, sval in enumerate(svals):
        _, r2 = probplot(x, sval, dist=dist, fit=True)
        ppcc[k] = r2[-1]

    if plot is not None:
        plot.plot(svals, ppcc, 'x')
        _add_axis_labels_title(plot, xlabel='Shape Values',
                               ylabel='Prob Plot Corr. Coef.',
                               title='(%s) PPCC Plot' % dist)

    return svals, ppcc

def boxcox_llf(lmb, data):
    r"""The boxcox log-likelihood function.

    Parameters
    ----------
    lmb : scalar
        Parameter for Box-Cox transformation.  See `boxcox` for details.
    data : array_like
        Data to calculate Box-Cox log-likelihood for.  If `data` is
        multi-dimensional, the log-likelihood is calculated along the first
        axis.

    Returns
    -------
    llf : float or ndarray
        Box-Cox log-likelihood of `data` given `lmb`.  A float for 1-D `data`,
        an array otherwise.

    See Also
    --------
    boxcox, probplot, boxcox_normplot, boxcox_normmax

    Notes
    -----
    The Box-Cox log-likelihood function is defined here as

    .. math::

        llf = (\lambda - 1) \sum_i(\log(x_i)) -
              N/2 \log(\sum_i (y_i - \bar{y})^2 / N),

    where ``y`` is the Box-Cox transformed input data ``x``.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
    >>> np.random.seed(1245)

    Generate some random variates and calculate Box-Cox log-likelihood values
    for them for a range of ``lmbda`` values:

    >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
    >>> lmbdas = np.linspace(-2, 10)
    >>> llf = np.zeros(lmbdas.shape, dtype=float)
    >>> for ii, lmbda in enumerate(lmbdas):
    ...     llf[ii] = stats.boxcox_llf(lmbda, x)

    Also find the optimal lmbda value with `boxcox`:

    >>> x_most_normal, lmbda_optimal = stats.boxcox(x)

    Plot the log-likelihood as function of lmbda.  Add the optimal lmbda as a
    horizontal line to check that that's really the optimum:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(lmbdas, llf, 'b.-')
    >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
    >>> ax.set_xlabel('lmbda parameter')
    >>> ax.set_ylabel('Box-Cox log-likelihood')

    Now add some probability plots to show that where the log-likelihood is
    maximized the data transformed with `boxcox` looks closest to normal:

    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    ...     xt = stats.boxcox(x, lmbda=lmbda)
    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
    ...     ax_inset.set_xticklabels([])
    ...     ax_inset.set_yticklabels([])
    ...     ax_inset.set_title('$\lambda=%1.2f$' % lmbda)

    >>> plt.show()

    """
    data = np.asarray(data)
    N = data.shape[0]
    if N == 0:
        return np.nan

    y = boxcox(data, lmb)
    y_mean = np.mean(y, axis=0)
    llf = (lmb - 1) * np.sum(np.log(data), axis=0)
    llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0))
    return llf
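

# Illustrative sketch (not part of the original module): evaluates the
# log-likelihood formula from the `boxcox_llf` Notes directly for 1-D data,
# llf = (lmbda - 1) * sum(log(x)) - N/2 * log(sum((y - mean(y))**2) / N)
# with ``y`` the Box-Cox transform of ``x``, as a cross-check against
# `boxcox_llf`.  The helper name `_demo_boxcox_llf_direct` is made up.
def _demo_boxcox_llf_direct(lmbda, x):
    """Return the Box-Cox log-likelihood computed from its definition."""
    x = np.asarray(x)
    y = special.boxcox(x, lmbda)
    n = x.shape[0]
    return ((lmbda - 1) * np.sum(np.log(x)) -
            n / 2.0 * np.log(np.sum((y - y.mean())**2) / n))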

def _boxcox_conf_interval(x, lmax, alpha):
    # Need to find the lambda for which
    #  f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
    fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
    target = boxcox_llf(lmax, x) - fac

    def rootfunc(lmbda, data, target):
        return boxcox_llf(lmbda, data) - target

    # Find positive endpoint of interval in which answer is to be found
    newlm = lmax + 0.5
    N = 0
    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
        newlm += 0.1
        N += 1

    if N == 500:
        raise RuntimeError("Could not find endpoint.")

    lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))

    # Now find negative interval in the same way
    newlm = lmax - 0.5
    N = 0
    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
        newlm -= 0.1
        N += 1

    if N == 500:
        raise RuntimeError("Could not find endpoint.")

    lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))

    return lmminus, lmplus

def boxcox(x, lmbda=None, alpha=None):
    r"""
    Return a positive dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Should be 1-dimensional.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.
        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant
    to `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the
    interval where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
    function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `boxcox` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, _ = stats.boxcox(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Box-Cox transformation')

    >>> plt.show()

    """
    x = np.asarray(x)
    if x.size == 0:
        return x

    if any(x <= 0):
        raise ValueError("Data must be positive.")

    if lmbda is not None:  # single transformation
        return special.boxcox(x, lmbda)

    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = boxcox_normmax(x, method='mle')
    y = boxcox(x, lmax)

    if alpha is None:
        return y, lmax
    else:
        # Find confidence interval
        interval = _boxcox_conf_interval(x, lmax, alpha)
        return y, lmax, interval
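

# Illustrative sketch (not part of the original module): applies the Box-Cox
# transform from the `boxcox` Notes elementwise, ``(x**lmbda - 1) / lmbda``
# for ``lmbda != 0`` and ``log(x)`` for ``lmbda == 0``; for positive data this
# should agree with ``special.boxcox(x, lmbda)`` / ``boxcox(x, lmbda)``.
# The helper name `_demo_boxcox_by_formula` is made up.
def _demo_boxcox_by_formula(x, lmbda):
    """Return the Box-Cox transform computed directly from its definition."""
    x = np.asarray(x, dtype=float)
    if lmbda == 0:
        return np.log(x)
    return (x**lmbda - 1) / lmbda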

def boxcox_normmax(x, brack=(-2.0, 2.0), method='pearsonr'):
    """Compute optimal Box-Cox transform parameter for input data.

    Parameters
    ----------
    x : array_like
        Input array.
    brack : 2-tuple, optional
        The starting interval for a downhill bracket search with
        `optimize.brent`.  Note that this is in most cases not critical; the
        final result is allowed to be outside this bracket.
    method : str, optional
        The method to determine the optimal transform parameter (`boxcox`
        ``lmbda`` parameter).  Options are:

        'pearsonr'  (default)
            Maximizes the Pearson correlation coefficient between
            ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
            normally-distributed.

        'mle'
            Maximizes the log-likelihood `boxcox_llf`.  This is the method used
            in `boxcox`.

        'all'
            Use all optimization methods available, and return all results.
            Useful to compare different methods.

    Returns
    -------
    maxlog : float or ndarray
        The optimal transform parameter found.  An array instead of a scalar
        for ``method='all'``.

    See Also
    --------
    boxcox, boxcox_llf, boxcox_normplot

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> np.random.seed(1234)  # make this example reproducible

    Generate some data and determine optimal ``lmbda`` in various ways:

    >>> x = stats.loggamma.rvs(5, size=30) + 5
    >>> y, lmax_mle = stats.boxcox(x)
    >>> lmax_pearsonr = stats.boxcox_normmax(x)

    >>> lmax_mle
    7.177...
    >>> lmax_pearsonr
    7.916...
    >>> stats.boxcox_normmax(x, method='all')
    array([ 7.91667384, 7.17718692])

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
    >>> ax.axvline(lmax_mle, color='r')
    >>> ax.axvline(lmax_pearsonr, color='g', ls='--')
    >>> plt.show()

    """
    def _pearsonr(x, brack):
        osm_uniform = _calc_uniform_order_statistic_medians(len(x))
        xvals = distributions.norm.ppf(osm_uniform)

        def _eval_pearsonr(lmbda, xvals, samps):
            # This function computes the x-axis values of the probability plot
            # and computes a linear regression (including the correlation) and
            # returns ``1 - r`` so that a minimization function maximizes the
            # correlation.
            y = boxcox(samps, lmbda)
            yvals = np.sort(y)
            r, prob = stats.pearsonr(xvals, yvals)
            return 1 - r

        return optimize.brent(_eval_pearsonr, brack=brack, args=(xvals, x))

    def _mle(x, brack):
        def _eval_mle(lmb, data):
            # function to minimize
            return -boxcox_llf(lmb, data)

        return optimize.brent(_eval_mle, brack=brack, args=(x,))

    def _all(x, brack):
        maxlog = np.zeros(2, dtype=float)
        maxlog[0] = _pearsonr(x, brack)
        maxlog[1] = _mle(x, brack)
        return maxlog

    methods = {'pearsonr': _pearsonr,
               'mle': _mle,
               'all': _all}
    if method not in methods.keys():
        raise ValueError("Method %s not recognized." % method)

    optimfunc = methods[method]
    return optimfunc(x, brack)

def _normplot(method, x, la, lb, plot=None, N=80):
    """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
    optionally show it.  See `boxcox_normplot` or `yeojohnson_normplot` for
    details."""
    if method == 'boxcox':
        title = 'Box-Cox Normality Plot'
        transform_func = boxcox
    else:
        title = 'Yeo-Johnson Normality Plot'
        transform_func = yeojohnson

    x = np.asarray(x)
    if x.size == 0:
        return x

    if lb <= la:
        raise ValueError("`lb` has to be larger than `la`.")

    lmbdas = np.linspace(la, lb, num=N)
    ppcc = lmbdas * 0.0
    for i, val in enumerate(lmbdas):
        # Determine for each lmbda the correlation coefficient of the
        # transformed x (the square root of the coefficient of determination
        # returned by `probplot`).
        z = transform_func(x, lmbda=val)
        _, (_, _, r) = probplot(z, dist='norm', fit=True)
        ppcc[i] = r

    if plot is not None:
        plot.plot(lmbdas, ppcc, 'x')
        _add_axis_labels_title(plot, xlabel='$\\lambda$',
                               ylabel='Prob Plot Corr. Coef.',
                               title=title)

    return lmbdas, ppcc

def boxcox_normplot(x, la, lb, plot=None, N=80):
    """Compute parameters for a Box-Cox normality plot, optionally show it.

    A Box-Cox normality plot shows graphically what the best transformation
    parameter is to use in `boxcox` to obtain a distribution that is close
    to normal.

    Parameters
    ----------
    x : array_like
        Input array.
    la, lb : scalar
        The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
        for Box-Cox transformations.  These are also the limits of the
        horizontal axis of the plot if that is generated.
    plot : object, optional
        If given, plots the quantiles and least squares fit.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `la` to `lb`).

    Returns
    -------
    lmbdas : ndarray
        The ``lmbda`` values for which a Box-Cox transform was done.
    ppcc : ndarray
        Probability Plot Correlation Coefficient, as obtained from `probplot`
        when fitting the Box-Cox transformed input `x` against a normal
        distribution.

    See Also
    --------
    probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max

    Notes
    -----
    Even if `plot` is given, the figure is not shown or saved by
    `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
    should be used after calling `probplot`.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    Generate some non-normally distributed data, and create a Box-Cox plot:

    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)

    Determine the optimal ``lmbda`` to transform ``x`` and plot it in the
    same plot:

    >>> _, maxlog = stats.boxcox(x)
    >>> ax.axvline(maxlog, color='r')

    >>> plt.show()

    """
    return _normplot('boxcox', x, la, lb, plot, N)

def yeojohnson(x, lmbda=None):
    r"""
    Return a dataset transformed by a Yeo-Johnson power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Should be 1-dimensional.
    lmbda : float, optional
        If ``lmbda`` is ``None``, find the lambda that maximizes the
        log-likelihood function and return it as the second output argument.
        Otherwise the transformation is done for the given value.

    Returns
    -------
    yeojohnson : ndarray
        Yeo-Johnson power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.

    See Also
    --------
    probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox

    Notes
    -----
    The Yeo-Johnson transform is given by::

        y = ((x + 1)**lmbda - 1) / lmbda,                for x >= 0, lmbda != 0
            log(x + 1),                                  for x >= 0, lmbda = 0
            -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda),  for x < 0, lmbda != 2
            -log(-x + 1),                                for x < 0, lmbda = 2

    Unlike `boxcox`, `yeojohnson` does not require the input data to be
    positive.

    .. versionadded:: 1.2.0

    References
    ----------
    I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
    Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `yeojohnson` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, lmbda = stats.yeojohnson(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Yeo-Johnson transformation')

    >>> plt.show()

    """
    x = np.asarray(x)
    if x.size == 0:
        return x

    if lmbda is not None:
        return _yeojohnson_transform(x, lmbda)

    # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = yeojohnson_normmax(x)
    y = _yeojohnson_transform(x, lmax)

    return y, lmax
  1071. def _yeojohnson_transform(x, lmbda):
  1072. """Return x transformed by the Yeo-Johnson power transform with given
  1073. parameter lmbda."""
  1074. out = np.zeros_like(x)
  1075. pos = x >= 0 # binary mask
  1076. # when x >= 0
  1077. if abs(lmbda) < np.spacing(1.):
  1078. out[pos] = np.log1p(x[pos])
  1079. else: # lmbda != 0
  1080. out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
  1081. # when x < 0
  1082. if abs(lmbda - 2) > np.spacing(1.):
  1083. out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
  1084. else: # lmbda == 2
  1085. out[~pos] = -np.log1p(-x[~pos])
  1086. return out
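# A minimal sketch (illustrative only, not part of the library): checks the
# piecewise Yeo-Johnson formula quoted in the `yeojohnson` Notes against the
# helper defined above.  The underscore-prefixed names are hypothetical
# throwaways chosen for this example.
import numpy as np

_yj_x = np.array([-2.0, -0.5, 0.0, 1.5, 3.0])
_yj_lmb = 0.5
_yj_manual = np.empty_like(_yj_x)
_yj_pos = _yj_x >= 0
# x >= 0, lmbda != 0
_yj_manual[_yj_pos] = ((_yj_x[_yj_pos] + 1) ** _yj_lmb - 1) / _yj_lmb
# x < 0, lmbda != 2
_yj_manual[~_yj_pos] = -((-_yj_x[~_yj_pos] + 1) ** (2 - _yj_lmb) - 1) / (2 - _yj_lmb)
assert np.allclose(_yj_manual, _yeojohnson_transform(_yj_x, _yj_lmb))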
  1087. def yeojohnson_llf(lmb, data):
  1088. r"""The yeojohnson log-likelihood function.
  1089. Parameters
  1090. ----------
  1091. lmb : scalar
  1092. Parameter for Yeo-Johnson transformation. See `yeojohnson` for
  1093. details.
  1094. data : array_like
  1095. Data to calculate Yeo-Johnson log-likelihood for. If `data` is
  1096. multi-dimensional, the log-likelihood is calculated along the first
  1097. axis.
  1098. Returns
  1099. -------
  1100. llf : float
  1101. Yeo-Johnson log-likelihood of `data` given `lmb`.
  1102. See Also
  1103. --------
  1104. yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
  1105. Notes
  1106. -----
  1107. The Yeo-Johnson log-likelihood function is defined here as
  1108. .. math::
1109. llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
  1110. \sum_i \text{ sign }(x_i)\log(|x_i| + 1)
1111. where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson
  1112. transformed input data ``x``.
  1113. .. versionadded:: 1.2.0
  1114. Examples
  1115. --------
  1116. >>> from scipy import stats
  1117. >>> import matplotlib.pyplot as plt
  1118. >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
  1119. >>> np.random.seed(1245)
  1120. Generate some random variates and calculate Yeo-Johnson log-likelihood
  1121. values for them for a range of ``lmbda`` values:
  1122. >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
  1123. >>> lmbdas = np.linspace(-2, 10)
  1124. >>> llf = np.zeros(lmbdas.shape, dtype=float)
  1125. >>> for ii, lmbda in enumerate(lmbdas):
  1126. ... llf[ii] = stats.yeojohnson_llf(lmbda, x)
  1127. Also find the optimal lmbda value with `yeojohnson`:
  1128. >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)
1129. Plot the log-likelihood as a function of lmbda. Add the optimal lmbda as a
  1130. horizontal line to check that that's really the optimum:
  1131. >>> fig = plt.figure()
  1132. >>> ax = fig.add_subplot(111)
  1133. >>> ax.plot(lmbdas, llf, 'b.-')
  1134. >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
  1135. >>> ax.set_xlabel('lmbda parameter')
  1136. >>> ax.set_ylabel('Yeo-Johnson log-likelihood')
  1137. Now add some probability plots to show that where the log-likelihood is
  1138. maximized the data transformed with `yeojohnson` looks closest to normal:
  1139. >>> locs = [3, 10, 4] # 'lower left', 'center', 'lower right'
  1140. >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
  1141. ... xt = stats.yeojohnson(x, lmbda=lmbda)
  1142. ... (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
  1143. ... ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
  1144. ... ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
  1145. ... ax_inset.set_xticklabels([])
  1146. ... ax_inset.set_yticklabels([])
1147. ... ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)
  1148. >>> plt.show()
  1149. """
  1150. data = np.asarray(data)
  1151. n_samples = data.shape[0]
  1152. if n_samples == 0:
  1153. return np.nan
  1154. trans = _yeojohnson_transform(data, lmb)
  1155. loglike = -n_samples / 2 * np.log(trans.var(axis=0))
  1156. loglike += (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0)
  1157. return loglike
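# A minimal sketch (illustrative only): verifies the log-likelihood formula in
# the `yeojohnson_llf` Notes numerically, i.e. -N/2*log(var(transformed)) plus
# the Jacobian term, using the population variance (ddof=0) as the code above
# does.  Data and names here are hypothetical.
import numpy as np

_llf_rng = np.random.RandomState(1245)
_llf_x = _llf_rng.gamma(shape=2.0, scale=3.0, size=500) - 4.0  # mix of signs
_llf_lmb = 1.3
_llf_xt = _yeojohnson_transform(_llf_x, _llf_lmb)
_llf_manual = (-_llf_x.size / 2 * np.log(_llf_xt.var()) +
               (_llf_lmb - 1) * (np.sign(_llf_x) * np.log1p(np.abs(_llf_x))).sum())
assert np.allclose(_llf_manual, yeojohnson_llf(_llf_lmb, _llf_x))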
  1158. def yeojohnson_normmax(x, brack=(-2, 2)):
  1159. """Compute optimal Yeo-Johnson transform parameter for input data, using
  1160. maximum likelihood estimation.
  1161. Parameters
  1162. ----------
  1163. x : array_like
  1164. Input array.
  1165. brack : 2-tuple, optional
  1166. The starting interval for a downhill bracket search with
  1167. `optimize.brent`. Note that this is in most cases not critical; the
  1168. final result is allowed to be outside this bracket.
  1169. Returns
  1170. -------
  1171. maxlog : float
  1172. The optimal transform parameter found.
  1173. Notes
  1174. -----
  1175. .. versionadded:: 1.2.0
  1176. See Also
  1177. --------
  1178. yeojohnson, yeojohnson_llf, yeojohnson_normplot
  1179. Examples
  1180. --------
  1181. >>> from scipy import stats
  1182. >>> import matplotlib.pyplot as plt
  1183. >>> np.random.seed(1234) # make this example reproducible
1184. Generate some data and determine the optimal ``lmbda``:
  1185. >>> x = stats.loggamma.rvs(5, size=30) + 5
  1186. >>> lmax = stats.yeojohnson_normmax(x)
  1187. >>> fig = plt.figure()
  1188. >>> ax = fig.add_subplot(111)
  1189. >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
  1190. >>> ax.axvline(lmax, color='r')
  1191. >>> plt.show()
  1192. """
  1193. def _neg_llf(lmbda, data):
  1194. return -yeojohnson_llf(lmbda, data)
  1195. return optimize.brent(_neg_llf, brack=brack, args=(x,))
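# A minimal sketch (illustrative only): the Brent optimum returned by
# `yeojohnson_normmax` should lie close to a brute-force grid maximum of
# `yeojohnson_llf`.  Note that Brent may converge outside the initial bracket.
# The data below are hypothetical.
import numpy as np

_nm_rng = np.random.RandomState(1234)
_nm_x = _nm_rng.lognormal(mean=1.0, sigma=0.6, size=400)  # skewed data
_nm_grid = np.linspace(-10, 10, 2001)
_nm_llf = np.array([yeojohnson_llf(l, _nm_x) for l in _nm_grid])
print(_nm_grid[_nm_llf.argmax()], yeojohnson_normmax(_nm_x))  # should be close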
  1196. def yeojohnson_normplot(x, la, lb, plot=None, N=80):
  1197. """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
  1198. A Yeo-Johnson normality plot shows graphically what the best
  1199. transformation parameter is to use in `yeojohnson` to obtain a
  1200. distribution that is close to normal.
  1201. Parameters
  1202. ----------
  1203. x : array_like
  1204. Input array.
  1205. la, lb : scalar
  1206. The lower and upper bounds for the ``lmbda`` values to pass to
  1207. `yeojohnson` for Yeo-Johnson transformations. These are also the
  1208. limits of the horizontal axis of the plot if that is generated.
  1209. plot : object, optional
  1210. If given, plots the quantiles and least squares fit.
  1211. `plot` is an object that has to have methods "plot" and "text".
  1212. The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
  1213. or a custom object with the same methods.
  1214. Default is None, which means that no plot is created.
  1215. N : int, optional
  1216. Number of points on the horizontal axis (equally distributed from
  1217. `la` to `lb`).
  1218. Returns
  1219. -------
  1220. lmbdas : ndarray
  1221. The ``lmbda`` values for which a Yeo-Johnson transform was done.
  1222. ppcc : ndarray
1223. Probability Plot Correlation Coefficient, as obtained from `probplot`
1224. when fitting the Yeo-Johnson transformed input `x` against a normal
  1225. distribution.
  1226. See Also
  1227. --------
  1228. probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
  1229. Notes
  1230. -----
  1231. Even if `plot` is given, the figure is not shown or saved by
1232. `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
1233. should be used after calling `yeojohnson_normplot`.
  1234. .. versionadded:: 1.2.0
  1235. Examples
  1236. --------
  1237. >>> from scipy import stats
  1238. >>> import matplotlib.pyplot as plt
  1239. Generate some non-normally distributed data, and create a Yeo-Johnson plot:
  1240. >>> x = stats.loggamma.rvs(5, size=500) + 5
  1241. >>> fig = plt.figure()
  1242. >>> ax = fig.add_subplot(111)
  1243. >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
1244. Determine the optimal ``lmbda`` to transform ``x`` and plot it in
  1245. the same plot:
  1246. >>> _, maxlog = stats.yeojohnson(x)
  1247. >>> ax.axvline(maxlog, color='r')
  1248. >>> plt.show()
  1249. """
  1250. return _normplot('yeojohnson', x, la, lb, plot, N)
  1251. def shapiro(x):
  1252. """
  1253. Perform the Shapiro-Wilk test for normality.
  1254. The Shapiro-Wilk test tests the null hypothesis that the
  1255. data was drawn from a normal distribution.
  1256. Parameters
  1257. ----------
  1258. x : array_like
  1259. Array of sample data.
  1260. Returns
  1261. -------
  1262. W : float
  1263. The test statistic.
  1264. p-value : float
  1265. The p-value for the hypothesis test.
  1266. See Also
  1267. --------
  1268. anderson : The Anderson-Darling test for normality
  1269. kstest : The Kolmogorov-Smirnov test for goodness of fit.
  1270. Notes
  1271. -----
  1272. The algorithm used is described in [4]_ but censoring parameters as
  1273. described are not implemented. For N > 5000 the W test statistic is accurate
  1274. but the p-value may not be.
  1275. The chance of rejecting the null hypothesis when it is true is close to 5%
  1276. regardless of sample size.
  1277. References
  1278. ----------
  1279. .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
  1280. .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for
  1281. normality (complete samples), Biometrika, Vol. 52, pp. 591-611.
  1282. .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk,
  1283. Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of
  1284. Statistical Modeling and Analytics, Vol. 2, pp. 21-33.
  1285. .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4.
  1286. Examples
  1287. --------
  1288. >>> from scipy import stats
  1289. >>> np.random.seed(12345678)
  1290. >>> x = stats.norm.rvs(loc=5, scale=3, size=100)
  1291. >>> stats.shapiro(x)
  1292. (0.9772805571556091, 0.08144091814756393)
  1293. """
  1294. x = np.ravel(x)
  1295. N = len(x)
  1296. if N < 3:
  1297. raise ValueError("Data must be at least length 3.")
  1298. a = zeros(N, 'f')
  1299. init = 0
  1300. y = sort(x)
  1301. a, w, pw, ifault = statlib.swilk(y, a[:N//2], init)
  1302. if ifault not in [0, 2]:
  1303. warnings.warn("Input data for shapiro has range zero. The results "
  1304. "may not be accurate.")
  1305. if N > 5000:
  1306. warnings.warn("p-value may not be accurate for N > 5000.")
  1307. return w, pw
  1308. # Values from Stephens, M A, "EDF Statistics for Goodness of Fit and
1309. # Some Comparisons", Journal of the American Statistical
  1310. # Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737
  1311. _Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092])
  1312. _Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957])
  1313. # From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
  1314. # Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
  1315. _Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
  1316. # From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
  1317. # on the Empirical Distribution Function.", Biometrika,
  1318. # Vol. 66, Issue 3, Dec. 1979, pp 591-595.
  1319. _Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
  1320. AndersonResult = namedtuple('AndersonResult', ('statistic',
  1321. 'critical_values',
  1322. 'significance_level'))
  1323. def anderson(x, dist='norm'):
  1324. """
  1325. Anderson-Darling test for data coming from a particular distribution
1326. The Anderson-Darling test tests the null hypothesis that a sample is
  1327. drawn from a population that follows a particular distribution.
  1328. For the Anderson-Darling test, the critical values depend on
  1329. which distribution is being tested against. This function works
  1330. for normal, exponential, logistic, or Gumbel (Extreme Value
  1331. Type I) distributions.
  1332. Parameters
  1333. ----------
  1334. x : array_like
  1335. array of sample data
1336. dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r',
1337. 'extreme1'}, optional
1338. The type of distribution to test against. The default is 'norm';
1339. 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the same distribution.
  1340. Returns
  1341. -------
  1342. statistic : float
  1343. The Anderson-Darling test statistic
  1344. critical_values : list
  1345. The critical values for this distribution
  1346. significance_level : list
  1347. The significance levels for the corresponding critical values
  1348. in percents. The function returns critical values for a
  1349. differing set of significance levels depending on the
  1350. distribution that is being tested against.
  1351. See Also
  1352. --------
  1353. kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
  1354. Notes
  1355. -----
  1356. Critical values provided are for the following significance levels:
1357. normal/exponential
  1358. 15%, 10%, 5%, 2.5%, 1%
  1359. logistic
  1360. 25%, 10%, 5%, 2.5%, 1%, 0.5%
  1361. Gumbel
  1362. 25%, 10%, 5%, 2.5%, 1%
  1363. If the returned statistic is larger than these critical values then
  1364. for the corresponding significance level, the null hypothesis that
  1365. the data come from the chosen distribution can be rejected.
  1366. The returned statistic is referred to as 'A2' in the references.
  1367. References
  1368. ----------
  1369. .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
  1370. .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
  1371. Some Comparisons, Journal of the American Statistical Association,
  1372. Vol. 69, pp. 730-737.
  1373. .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
  1374. Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
  1375. pp. 357-369.
  1376. .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
  1377. Distribution, Biometrika, Vol. 64, pp. 583-588.
  1378. .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
1379. to Tests for Exponentiality, Technical Report No. 262,
  1380. Department of Statistics, Stanford University, Stanford, CA.
  1381. .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
  1382. Based on the Empirical Distribution Function, Biometrika, Vol. 66,
  1383. pp. 591-595.
  1384. """
  1385. if dist not in ['norm', 'expon', 'gumbel', 'gumbel_l',
  1386. 'gumbel_r', 'extreme1', 'logistic']:
  1387. raise ValueError("Invalid distribution; dist must be 'norm', "
  1388. "'expon', 'gumbel', 'extreme1' or 'logistic'.")
  1389. y = sort(x)
  1390. xbar = np.mean(x, axis=0)
  1391. N = len(y)
  1392. if dist == 'norm':
  1393. s = np.std(x, ddof=1, axis=0)
  1394. w = (y - xbar) / s
  1395. logcdf = distributions.norm.logcdf(w)
  1396. logsf = distributions.norm.logsf(w)
  1397. sig = array([15, 10, 5, 2.5, 1])
  1398. critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
  1399. elif dist == 'expon':
  1400. w = y / xbar
  1401. logcdf = distributions.expon.logcdf(w)
  1402. logsf = distributions.expon.logsf(w)
  1403. sig = array([15, 10, 5, 2.5, 1])
  1404. critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
  1405. elif dist == 'logistic':
  1406. def rootfunc(ab, xj, N):
  1407. a, b = ab
  1408. tmp = (xj - a) / b
  1409. tmp2 = exp(tmp)
  1410. val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
  1411. np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
  1412. return array(val)
  1413. sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
  1414. sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
  1415. w = (y - sol[0]) / sol[1]
  1416. logcdf = distributions.logistic.logcdf(w)
  1417. logsf = distributions.logistic.logsf(w)
  1418. sig = array([25, 10, 5, 2.5, 1, 0.5])
  1419. critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
  1420. elif dist == 'gumbel_r':
  1421. xbar, s = distributions.gumbel_r.fit(x)
  1422. w = (y - xbar) / s
  1423. logcdf = distributions.gumbel_r.logcdf(w)
  1424. logsf = distributions.gumbel_r.logsf(w)
  1425. sig = array([25, 10, 5, 2.5, 1])
  1426. critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
  1427. else: # (dist == 'gumbel') or (dist == 'gumbel_l') or (dist == 'extreme1')
  1428. xbar, s = distributions.gumbel_l.fit(x)
  1429. w = (y - xbar) / s
  1430. logcdf = distributions.gumbel_l.logcdf(w)
  1431. logsf = distributions.gumbel_l.logsf(w)
  1432. sig = array([25, 10, 5, 2.5, 1])
  1433. critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
  1434. i = arange(1, N + 1)
  1435. A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
  1436. return AndersonResult(A2, critical, sig)
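# A minimal usage sketch (illustrative only): comparing the Anderson-Darling
# statistic against the tabulated critical values, as described in the Notes
# above.  Sample data and names are hypothetical.
import numpy as np

_ad_rng = np.random.RandomState(12345)
_ad_sample = _ad_rng.normal(loc=5.0, scale=3.0, size=100)
_ad_res = anderson(_ad_sample, dist='norm')
for _crit, _sig in zip(_ad_res.critical_values, _ad_res.significance_level):
    _decision = "reject" if _ad_res.statistic > _crit else "do not reject"
    print("%5.1f%%: A2=%.3f vs %.3f -> %s"
          % (_sig, _ad_res.statistic, _crit, _decision))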
  1437. def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
  1438. """
  1439. Compute A2akN equation 7 of Scholz and Stephens.
  1440. Parameters
  1441. ----------
  1442. samples : sequence of 1-D array_like
  1443. Array of sample arrays.
  1444. Z : array_like
  1445. Sorted array of all observations.
  1446. Zstar : array_like
  1447. Sorted array of unique observations.
  1448. k : int
  1449. Number of samples.
  1450. n : array_like
  1451. Number of observations in each sample.
  1452. N : int
  1453. Total number of observations.
  1454. Returns
  1455. -------
  1456. A2aKN : float
  1457. The A2aKN statistics of Scholz and Stephens 1987.
  1458. """
  1459. A2akN = 0.
  1460. Z_ssorted_left = Z.searchsorted(Zstar, 'left')
  1461. if N == Zstar.size:
  1462. lj = 1.
  1463. else:
  1464. lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
  1465. Bj = Z_ssorted_left + lj / 2.
  1466. for i in arange(0, k):
  1467. s = np.sort(samples[i])
  1468. s_ssorted_right = s.searchsorted(Zstar, side='right')
  1469. Mij = s_ssorted_right.astype(float)
  1470. fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
  1471. Mij -= fij / 2.
  1472. inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
  1473. A2akN += inner.sum() / n[i]
  1474. A2akN *= (N - 1.) / N
  1475. return A2akN
  1476. def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
  1477. """
  1478. Compute A2akN equation 6 of Scholz & Stephens.
  1479. Parameters
  1480. ----------
  1481. samples : sequence of 1-D array_like
  1482. Array of sample arrays.
  1483. Z : array_like
  1484. Sorted array of all observations.
  1485. Zstar : array_like
  1486. Sorted array of unique observations.
  1487. k : int
  1488. Number of samples.
  1489. n : array_like
  1490. Number of observations in each sample.
  1491. N : int
  1492. Total number of observations.
  1493. Returns
  1494. -------
  1495. A2KN : float
  1496. The A2KN statistics of Scholz and Stephens 1987.
  1497. """
  1498. A2kN = 0.
  1499. lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1],
  1500. 'left')
  1501. Bj = lj.cumsum()
  1502. for i in arange(0, k):
  1503. s = np.sort(samples[i])
  1504. Mij = s.searchsorted(Zstar[:-1], side='right')
  1505. inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
  1506. A2kN += inner.sum() / n[i]
  1507. return A2kN
  1508. Anderson_ksampResult = namedtuple('Anderson_ksampResult',
  1509. ('statistic', 'critical_values',
  1510. 'significance_level'))
  1511. def anderson_ksamp(samples, midrank=True):
  1512. """The Anderson-Darling test for k-samples.
  1513. The k-sample Anderson-Darling test is a modification of the
  1514. one-sample Anderson-Darling test. It tests the null hypothesis
  1515. that k-samples are drawn from the same population without having
  1516. to specify the distribution function of that population. The
  1517. critical values depend on the number of samples.
  1518. Parameters
  1519. ----------
  1520. samples : sequence of 1-D array_like
  1521. Array of sample data in arrays.
  1522. midrank : bool, optional
  1523. Type of Anderson-Darling test which is computed. Default
  1524. (True) is the midrank test applicable to continuous and
  1525. discrete populations. If False, the right side empirical
  1526. distribution is used.
  1527. Returns
  1528. -------
  1529. statistic : float
  1530. Normalized k-sample Anderson-Darling test statistic.
  1531. critical_values : array
  1532. The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%.
  1533. significance_level : float
  1534. An approximate significance level at which the null hypothesis for the
  1535. provided samples can be rejected. The value is floored / capped at
1536. 0.1% / 25%.
  1537. Raises
  1538. ------
  1539. ValueError
  1540. If less than 2 samples are provided, a sample is empty, or no
  1541. distinct observations are in the samples.
  1542. See Also
  1543. --------
  1544. ks_2samp : 2 sample Kolmogorov-Smirnov test
  1545. anderson : 1 sample Anderson-Darling test
  1546. Notes
  1547. -----
  1548. [1]_ defines three versions of the k-sample Anderson-Darling test:
  1549. one for continuous distributions and two for discrete
  1550. distributions, in which ties between samples may occur. The
  1551. default of this routine is to compute the version based on the
  1552. midrank empirical distribution function. This test is applicable
  1553. to continuous and discrete data. If midrank is set to False, the
  1554. right side empirical distribution is used for a test for discrete
  1555. data. According to [1]_, the two discrete test statistics differ
  1556. only slightly if a few collisions due to round-off errors occur in
  1557. the test not adjusted for ties between samples.
  1558. The critical values corresponding to the significance levels from 0.01
  1559. to 0.25 are taken from [1]_. p-values are floored / capped
  1560. at 0.1% / 25%. Since the range of critical values might be extended in
  1561. future releases, it is recommended not to test ``p == 0.25``, but rather
  1562. ``p >= 0.25`` (analogously for the lower bound).
  1563. .. versionadded:: 0.14.0
  1564. References
  1565. ----------
  1566. .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
  1567. Anderson-Darling Tests, Journal of the American Statistical
  1568. Association, Vol. 82, pp. 918-924.
  1569. Examples
  1570. --------
  1571. >>> from scipy import stats
  1572. >>> np.random.seed(314159)
  1573. The null hypothesis that the two random samples come from the same
  1574. distribution can be rejected at the 5% level because the returned
  1575. test value is greater than the critical value for 5% (1.961) but
  1576. not at the 2.5% level. The interpolation gives an approximate
  1577. significance level of 3.2%:
  1578. >>> stats.anderson_ksamp([np.random.normal(size=50),
  1579. ... np.random.normal(loc=0.5, size=30)])
  1580. (2.4615796189876105,
  1581. array([ 0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546]),
  1582. 0.03176687568842282)
  1583. The null hypothesis cannot be rejected for three samples from an
  1584. identical distribution. The reported p-value (25%) has been capped and
  1585. may not be very accurate (since it corresponds to the value 0.449
  1586. whereas the statistic is -0.731):
  1587. >>> stats.anderson_ksamp([np.random.normal(size=50),
  1588. ... np.random.normal(size=30), np.random.normal(size=20)])
  1589. (-0.73091722665244196,
  1590. array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856,
  1591. 4.07210043, 5.56419101]),
  1592. 0.25)
  1593. """
  1594. k = len(samples)
  1595. if (k < 2):
  1596. raise ValueError("anderson_ksamp needs at least two samples")
  1597. samples = list(map(np.asarray, samples))
  1598. Z = np.sort(np.hstack(samples))
  1599. N = Z.size
  1600. Zstar = np.unique(Z)
  1601. if Zstar.size < 2:
  1602. raise ValueError("anderson_ksamp needs more than one distinct "
  1603. "observation")
  1604. n = np.array([sample.size for sample in samples])
  1605. if any(n == 0):
  1606. raise ValueError("anderson_ksamp encountered sample without "
  1607. "observations")
  1608. if midrank:
  1609. A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
  1610. else:
  1611. A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
  1612. H = (1. / n).sum()
  1613. hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
  1614. h = hs_cs[-1] + 1
  1615. g = (hs_cs / arange(2, N)).sum()
  1616. a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
  1617. b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
  1618. c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
  1619. d = (2*h + 6)*k**2 - 4*h*k
  1620. sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
  1621. m = k - 1
  1622. A2 = (A2kN - m) / math.sqrt(sigmasq)
  1623. # The b_i values are the interpolation coefficients from Table 2
  1624. # of Scholz and Stephens 1987
  1625. b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
  1626. b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
  1627. b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
  1628. critical = b0 + b1 / math.sqrt(m) + b2 / m
  1629. sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
  1630. if A2 < critical.min():
  1631. p = sig.max()
  1632. warnings.warn("p-value capped: true value larger than {}".format(p),
  1633. stacklevel=2)
  1634. elif A2 > critical.max():
  1635. p = sig.min()
  1636. warnings.warn("p-value floored: true value smaller than {}".format(p),
  1637. stacklevel=2)
  1638. else:
  1639. # interpolation of probit of significance level
  1640. pf = np.polyfit(critical, log(sig), 2)
  1641. p = math.exp(np.polyval(pf, A2))
  1642. return Anderson_ksampResult(A2, critical, p)
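# A minimal usage sketch (illustrative only): interpreting anderson_ksamp.
# Because the returned significance level is floored/capped (see the Notes),
# compare it with inequalities rather than testing for equality.  The samples
# below are hypothetical.
import numpy as np

_ak_rng = np.random.RandomState(314159)
_ak_res = anderson_ksamp([_ak_rng.normal(size=50),
                          _ak_rng.normal(loc=0.5, size=30)])
_ak_reject_5pct = _ak_res.statistic > _ak_res.critical_values[2]  # index 2 -> 5%
print(_ak_reject_5pct, _ak_res.significance_level)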
  1643. AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))
  1644. def ansari(x, y):
  1645. """
  1646. Perform the Ansari-Bradley test for equal scale parameters
  1647. The Ansari-Bradley test is a non-parametric test for the equality
  1648. of the scale parameter of the distributions from which two
  1649. samples were drawn.
  1650. Parameters
  1651. ----------
  1652. x, y : array_like
  1653. arrays of sample data
  1654. Returns
  1655. -------
  1656. statistic : float
  1657. The Ansari-Bradley test statistic
  1658. pvalue : float
  1659. The p-value of the hypothesis test
  1660. See Also
  1661. --------
  1662. fligner : A non-parametric test for the equality of k variances
  1663. mood : A non-parametric test for the equality of two scale parameters
  1664. Notes
  1665. -----
  1666. The p-value given is exact when the sample sizes are both less than
  1667. 55 and there are no ties, otherwise a normal approximation for the
  1668. p-value is used.
  1669. References
  1670. ----------
  1671. .. [1] Sprent, Peter and N.C. Smeeton. Applied nonparametric statistical
  1672. methods. 3rd ed. Chapman and Hall/CRC. 2001. Section 5.8.2.
  1673. """
  1674. x, y = asarray(x), asarray(y)
  1675. n = len(x)
  1676. m = len(y)
  1677. if m < 1:
  1678. raise ValueError("Not enough other observations.")
  1679. if n < 1:
  1680. raise ValueError("Not enough test observations.")
  1681. N = m + n
  1682. xy = r_[x, y] # combine
  1683. rank = stats.rankdata(xy)
  1684. symrank = amin(array((rank, N - rank + 1)), 0)
  1685. AB = np.sum(symrank[:n], axis=0)
  1686. uxy = unique(xy)
  1687. repeats = (len(uxy) != len(xy))
  1688. exact = ((m < 55) and (n < 55) and not repeats)
  1689. if repeats and (m < 55 or n < 55):
  1690. warnings.warn("Ties preclude use of exact statistic.")
  1691. if exact:
  1692. astart, a1, ifault = statlib.gscale(n, m)
  1693. ind = AB - astart
  1694. total = np.sum(a1, axis=0)
  1695. if ind < len(a1)/2.0:
  1696. cind = int(ceil(ind))
  1697. if ind == cind:
  1698. pval = 2.0 * np.sum(a1[:cind+1], axis=0) / total
  1699. else:
  1700. pval = 2.0 * np.sum(a1[:cind], axis=0) / total
  1701. else:
  1702. find = int(floor(ind))
  1703. if ind == floor(ind):
  1704. pval = 2.0 * np.sum(a1[find:], axis=0) / total
  1705. else:
  1706. pval = 2.0 * np.sum(a1[find+1:], axis=0) / total
  1707. return AnsariResult(AB, min(1.0, pval))
  1708. # otherwise compute normal approximation
  1709. if N % 2: # N odd
  1710. mnAB = n * (N+1.0)**2 / 4.0 / N
  1711. varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2)
  1712. else:
  1713. mnAB = n * (N+2.0) / 4.0
  1714. varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0)
  1715. if repeats: # adjust variance estimates
  1716. # compute np.sum(tj * rj**2,axis=0)
  1717. fac = np.sum(symrank**2, axis=0)
  1718. if N % 2: # N odd
  1719. varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
  1720. else: # N even
  1721. varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
  1722. z = (AB - mnAB) / sqrt(varAB)
  1723. pval = distributions.norm.sf(abs(z)) * 2.0
  1724. return AnsariResult(AB, pval)
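# A minimal usage sketch (illustrative only): Ansari-Bradley test on two
# samples that differ in scale.  With both sizes below 55 and no ties, the
# exact-p-value branch above is taken.  Data and names are hypothetical.
import numpy as np

_ab_rng = np.random.RandomState(1234)
_ab_x = _ab_rng.normal(scale=1.0, size=40)
_ab_y = _ab_rng.normal(scale=3.0, size=45)
_ab_stat, _ab_p = ansari(_ab_x, _ab_y)  # a small p-value suggests unequal scales
print(_ab_stat, _ab_p)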
  1725. BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))
  1726. def bartlett(*args):
  1727. """
  1728. Perform Bartlett's test for equal variances
  1729. Bartlett's test tests the null hypothesis that all input samples
  1730. are from populations with equal variances. For samples
  1731. from significantly non-normal populations, Levene's test
  1732. `levene` is more robust.
  1733. Parameters
  1734. ----------
  1735. sample1, sample2,... : array_like
1736. arrays of sample data. Only 1d arrays are accepted; they may have
  1737. different lengths.
  1738. Returns
  1739. -------
  1740. statistic : float
  1741. The test statistic.
  1742. pvalue : float
  1743. The p-value of the test.
  1744. See Also
  1745. --------
  1746. fligner : A non-parametric test for the equality of k variances
  1747. levene : A robust parametric test for equality of k variances
  1748. Notes
  1749. -----
  1750. Conover et al. (1981) examine many of the existing parametric and
  1751. nonparametric tests by extensive simulations and they conclude that the
  1752. tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
1753. superior in terms of robustness to departures from normality and power
  1754. ([3]_).
  1755. References
  1756. ----------
  1757. .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
  1758. .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical
  1759. Methods, Eighth Edition, Iowa State University Press.
  1760. .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
  1761. Hypothesis Testing based on Quadratic Inference Function. Technical
  1762. Report #99-03, Center for Likelihood Studies, Pennsylvania State
  1763. University.
  1764. .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
  1765. Tests. Proceedings of the Royal Society of London. Series A,
  1766. Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
  1767. """
  1768. # Handle empty input and input that is not 1d
  1769. for a in args:
  1770. if np.asanyarray(a).size == 0:
  1771. return BartlettResult(np.nan, np.nan)
  1772. if np.asanyarray(a).ndim > 1:
  1773. raise ValueError('Samples must be one-dimensional.')
  1774. k = len(args)
  1775. if k < 2:
  1776. raise ValueError("Must enter at least two input sample vectors.")
  1777. Ni = zeros(k)
  1778. ssq = zeros(k, 'd')
  1779. for j in range(k):
  1780. Ni[j] = len(args[j])
  1781. ssq[j] = np.var(args[j], ddof=1)
  1782. Ntot = np.sum(Ni, axis=0)
  1783. spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k))
  1784. numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  1785. denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) -
  1786. 1.0/(Ntot - k))
  1787. T = numer / denom
  1788. pval = distributions.chi2.sf(T, k - 1) # 1 - cdf
  1789. return BartlettResult(T, pval)
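# A minimal usage sketch (illustrative only): Bartlett's test on three normal
# samples, one of which has a larger variance, so a small p-value is expected.
# Data and names are hypothetical.
import numpy as np

_bt_rng = np.random.RandomState(42)
_bt_a = _bt_rng.normal(scale=1.0, size=50)
_bt_b = _bt_rng.normal(scale=1.0, size=60)
_bt_c = _bt_rng.normal(scale=2.5, size=55)
_bt_stat, _bt_p = bartlett(_bt_a, _bt_b, _bt_c)
print(_bt_stat, _bt_p)  # small p -> at least one variance differs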
  1790. LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))
  1791. def levene(*args, **kwds):
  1792. """
  1793. Perform Levene test for equal variances.
  1794. The Levene test tests the null hypothesis that all input samples
  1795. are from populations with equal variances. Levene's test is an
  1796. alternative to Bartlett's test `bartlett` in the case where
  1797. there are significant deviations from normality.
  1798. Parameters
  1799. ----------
  1800. sample1, sample2, ... : array_like
  1801. The sample data, possibly with different lengths. Only one-dimensional
  1802. samples are accepted.
  1803. center : {'mean', 'median', 'trimmed'}, optional
  1804. Which function of the data to use in the test. The default
  1805. is 'median'.
  1806. proportiontocut : float, optional
  1807. When `center` is 'trimmed', this gives the proportion of data points
  1808. to cut from each end. (See `scipy.stats.trim_mean`.)
  1809. Default is 0.05.
  1810. Returns
  1811. -------
  1812. statistic : float
  1813. The test statistic.
  1814. pvalue : float
  1815. The p-value for the test.
  1816. Notes
  1817. -----
  1818. Three variations of Levene's test are possible. The possibilities
  1819. and their recommended usages are:
1820. * 'median' : Recommended for skewed (non-normal) distributions.
  1821. * 'mean' : Recommended for symmetric, moderate-tailed distributions.
  1822. * 'trimmed' : Recommended for heavy-tailed distributions.
  1823. The test version using the mean was proposed in the original article
  1824. of Levene ([2]_) while the median and trimmed mean have been studied by
  1825. Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
  1826. test.
  1827. References
  1828. ----------
  1829. .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
  1830. .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
  1831. Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
  1832. Stanford University Press, pp. 278-292.
  1833. .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
  1834. Statistical Association, 69, 364-367
  1835. """
  1836. # Handle keyword arguments.
  1837. center = 'median'
  1838. proportiontocut = 0.05
  1839. for kw, value in kwds.items():
  1840. if kw not in ['center', 'proportiontocut']:
  1841. raise TypeError("levene() got an unexpected keyword "
  1842. "argument '%s'" % kw)
  1843. if kw == 'center':
  1844. center = value
  1845. else:
  1846. proportiontocut = value
  1847. k = len(args)
  1848. if k < 2:
  1849. raise ValueError("Must enter at least two input sample vectors.")
  1850. # check for 1d input
  1851. for j in range(k):
  1852. if np.asanyarray(args[j]).ndim > 1:
  1853. raise ValueError('Samples must be one-dimensional.')
  1854. Ni = zeros(k)
  1855. Yci = zeros(k, 'd')
  1856. if center not in ['mean', 'median', 'trimmed']:
  1857. raise ValueError("Keyword argument <center> must be 'mean', 'median'"
  1858. " or 'trimmed'.")
  1859. if center == 'median':
  1860. func = lambda x: np.median(x, axis=0)
  1861. elif center == 'mean':
  1862. func = lambda x: np.mean(x, axis=0)
  1863. else: # center == 'trimmed'
  1864. args = tuple(stats.trimboth(np.sort(arg), proportiontocut)
  1865. for arg in args)
  1866. func = lambda x: np.mean(x, axis=0)
  1867. for j in range(k):
  1868. Ni[j] = len(args[j])
  1869. Yci[j] = func(args[j])
  1870. Ntot = np.sum(Ni, axis=0)
  1871. # compute Zij's
  1872. Zij = [None] * k
  1873. for i in range(k):
  1874. Zij[i] = abs(asarray(args[i]) - Yci[i])
  1875. # compute Zbari
  1876. Zbari = zeros(k, 'd')
  1877. Zbar = 0.0
  1878. for i in range(k):
  1879. Zbari[i] = np.mean(Zij[i], axis=0)
  1880. Zbar += Zbari[i] * Ni[i]
  1881. Zbar /= Ntot
  1882. numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0)
  1883. # compute denom_variance
  1884. dvar = 0.0
  1885. for i in range(k):
  1886. dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0)
  1887. denom = (k - 1.0) * dvar
  1888. W = numer / denom
  1889. pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf
  1890. return LeveneResult(W, pval)
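# A minimal usage sketch (illustrative only): Levene's test with the three
# `center` choices discussed in the Notes, on heavy-tailed data where the
# median/trimmed variants are recommended.  Data and names are hypothetical.
import numpy as np

_lv_rng = np.random.RandomState(7)
_lv_a = _lv_rng.standard_t(df=3, size=100)
_lv_b = _lv_rng.standard_t(df=3, size=100) * 2.0
print(levene(_lv_a, _lv_b, center='median'))
print(levene(_lv_a, _lv_b, center='mean'))
print(levene(_lv_a, _lv_b, center='trimmed', proportiontocut=0.1))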
  1891. def binom_test(x, n=None, p=0.5, alternative='two-sided'):
  1892. """
  1893. Perform a test that the probability of success is p.
  1894. This is an exact, two-sided test of the null hypothesis
  1895. that the probability of success in a Bernoulli experiment
  1896. is `p`.
  1897. Parameters
  1898. ----------
  1899. x : integer or array_like
  1900. the number of successes, or if x has length 2, it is the
  1901. number of successes and the number of failures.
  1902. n : integer
  1903. the number of trials. This is ignored if x gives both the
  1904. number of successes and failures
  1905. p : float, optional
  1906. The hypothesized probability of success. 0 <= p <= 1. The
  1907. default value is p = 0.5
  1908. alternative : {'two-sided', 'greater', 'less'}, optional
  1909. Indicates the alternative hypothesis. The default value is
  1910. 'two-sided'.
  1911. Returns
  1912. -------
  1913. p-value : float
  1914. The p-value of the hypothesis test
  1915. References
  1916. ----------
  1917. .. [1] https://en.wikipedia.org/wiki/Binomial_test
  1918. Examples
  1919. --------
  1920. >>> from scipy import stats
  1921. A car manufacturer claims that no more than 10% of their cars are unsafe.
  1922. 15 cars are inspected for safety, 3 were found to be unsafe. Test the
  1923. manufacturer's claim:
  1924. >>> stats.binom_test(3, n=15, p=0.1, alternative='greater')
  1925. 0.18406106910639114
  1926. The null hypothesis cannot be rejected at the 5% level of significance
1927. because the returned p-value is greater than the significance level of 5%.
  1928. """
  1929. x = atleast_1d(x).astype(np.integer)
  1930. if len(x) == 2:
  1931. n = x[1] + x[0]
  1932. x = x[0]
  1933. elif len(x) == 1:
  1934. x = x[0]
  1935. if n is None or n < x:
  1936. raise ValueError("n must be >= x")
  1937. n = np.int_(n)
  1938. else:
  1939. raise ValueError("Incorrect length for x.")
  1940. if (p > 1.0) or (p < 0.0):
  1941. raise ValueError("p must be in range [0,1]")
  1942. if alternative not in ('two-sided', 'less', 'greater'):
  1943. raise ValueError("alternative not recognized\n"
  1944. "should be 'two-sided', 'less' or 'greater'")
  1945. if alternative == 'less':
  1946. pval = distributions.binom.cdf(x, n, p)
  1947. return pval
  1948. if alternative == 'greater':
  1949. pval = distributions.binom.sf(x-1, n, p)
  1950. return pval
  1951. # if alternative was neither 'less' nor 'greater', then it's 'two-sided'
  1952. d = distributions.binom.pmf(x, n, p)
  1953. rerr = 1 + 1e-7
  1954. if x == p * n:
  1955. # special case as shortcut, would also be handled by `else` below
  1956. pval = 1.
  1957. elif x < p * n:
  1958. i = np.arange(np.ceil(p * n), n+1)
  1959. y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
  1960. pval = (distributions.binom.cdf(x, n, p) +
  1961. distributions.binom.sf(n - y, n, p))
  1962. else:
  1963. i = np.arange(np.floor(p*n) + 1)
  1964. y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
  1965. pval = (distributions.binom.cdf(y-1, n, p) +
  1966. distributions.binom.sf(x-1, n, p))
  1967. return min(1.0, pval)
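# A minimal sketch (illustrative only) cross-checking the two-sided p-value:
# for small n it equals the sum of binom.pmf(k, n, p) over every k whose pmf
# does not exceed pmf(x), up to the small relative tolerance used above.  The
# specific x, n, p values are hypothetical.
import numpy as np

_bn_x, _bn_n, _bn_p = 3, 15, 0.1
_bn_d = distributions.binom.pmf(_bn_x, _bn_n, _bn_p)
_bn_pmf = distributions.binom.pmf(np.arange(_bn_n + 1), _bn_n, _bn_p)
_bn_manual = _bn_pmf[_bn_pmf <= _bn_d * (1 + 1e-7)].sum()
print(_bn_manual, binom_test(_bn_x, _bn_n, _bn_p))  # should agree closely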
  1968. def _apply_func(x, g, func):
  1969. # g is list of indices into x
  1970. # separating x into different groups
  1971. # func should be applied over the groups
  1972. g = unique(r_[0, g, len(x)])
  1973. output = []
  1974. for k in range(len(g) - 1):
  1975. output.append(func(x[g[k]:g[k+1]]))
  1976. return asarray(output)
  1977. FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))
  1978. def fligner(*args, **kwds):
  1979. """
  1980. Perform Fligner-Killeen test for equality of variance.
  1981. Fligner's test tests the null hypothesis that all input samples
  1982. are from populations with equal variances. Fligner-Killeen's test is
  1983. distribution free when populations are identical [2]_.
  1984. Parameters
  1985. ----------
  1986. sample1, sample2, ... : array_like
  1987. Arrays of sample data. Need not be the same length.
  1988. center : {'mean', 'median', 'trimmed'}, optional
  1989. Keyword argument controlling which function of the data is used in
  1990. computing the test statistic. The default is 'median'.
  1991. proportiontocut : float, optional
  1992. When `center` is 'trimmed', this gives the proportion of data points
  1993. to cut from each end. (See `scipy.stats.trim_mean`.)
  1994. Default is 0.05.
  1995. Returns
  1996. -------
  1997. statistic : float
  1998. The test statistic.
  1999. pvalue : float
  2000. The p-value for the hypothesis test.
  2001. See Also
  2002. --------
  2003. bartlett : A parametric test for equality of k variances in normal samples
  2004. levene : A robust parametric test for equality of k variances
  2005. Notes
  2006. -----
  2007. As with Levene's test there are three variants of Fligner's test that
  2008. differ by the measure of central tendency used in the test. See `levene`
  2009. for more information.
  2010. Conover et al. (1981) examine many of the existing parametric and
  2011. nonparametric tests by extensive simulations and they conclude that the
  2012. tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be
2013. superior in terms of robustness to departures from normality and power [3]_.
  2014. References
  2015. ----------
  2016. .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
  2017. Hypothesis Testing based on Quadratic Inference Function. Technical
  2018. Report #99-03, Center for Likelihood Studies, Pennsylvania State
  2019. University.
  2020. https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf
  2021. .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
  2022. tests for scale. 'Journal of the American Statistical Association.'
  2023. 71(353), 210-213.
  2024. .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
  2025. Hypothesis Testing based on Quadratic Inference Function. Technical
  2026. Report #99-03, Center for Likelihood Studies, Pennsylvania State
  2027. University.
  2028. .. [4] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
  2029. comparative study of tests for homogeneity of variances, with
2030. applications to the outer continental shelf bidding data.
  2031. Technometrics, 23(4), 351-361.
  2032. """
  2033. # Handle empty input
  2034. for a in args:
  2035. if np.asanyarray(a).size == 0:
  2036. return FlignerResult(np.nan, np.nan)
  2037. # Handle keyword arguments.
  2038. center = 'median'
  2039. proportiontocut = 0.05
  2040. for kw, value in kwds.items():
  2041. if kw not in ['center', 'proportiontocut']:
  2042. raise TypeError("fligner() got an unexpected keyword "
  2043. "argument '%s'" % kw)
  2044. if kw == 'center':
  2045. center = value
  2046. else:
  2047. proportiontocut = value
  2048. k = len(args)
  2049. if k < 2:
  2050. raise ValueError("Must enter at least two input sample vectors.")
  2051. if center not in ['mean', 'median', 'trimmed']:
  2052. raise ValueError("Keyword argument <center> must be 'mean', 'median'"
  2053. " or 'trimmed'.")
  2054. if center == 'median':
  2055. func = lambda x: np.median(x, axis=0)
  2056. elif center == 'mean':
  2057. func = lambda x: np.mean(x, axis=0)
  2058. else: # center == 'trimmed'
  2059. args = tuple(stats.trimboth(arg, proportiontocut) for arg in args)
  2060. func = lambda x: np.mean(x, axis=0)
  2061. Ni = asarray([len(args[j]) for j in range(k)])
  2062. Yci = asarray([func(args[j]) for j in range(k)])
  2063. Ntot = np.sum(Ni, axis=0)
  2064. # compute Zij's
  2065. Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)]
  2066. allZij = []
  2067. g = [0]
  2068. for i in range(k):
  2069. allZij.extend(list(Zij[i]))
  2070. g.append(len(allZij))
  2071. ranks = stats.rankdata(allZij)
  2072. a = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5)
  2073. # compute Aibar
  2074. Aibar = _apply_func(a, g, np.sum) / Ni
  2075. anbar = np.mean(a, axis=0)
  2076. varsq = np.var(a, axis=0, ddof=1)
  2077. Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq
  2078. pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf
  2079. return FlignerResult(Xsq, pval)
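# A minimal usage sketch (illustrative only): Fligner-Killeen alongside
# Bartlett on the same heavy-tailed samples; the rank-based Fligner test is
# the distribution-free alternative described above.  Data and names are
# hypothetical.
import numpy as np

_fk_rng = np.random.RandomState(99)
_fk_a = _fk_rng.standard_t(df=4, size=80)
_fk_b = _fk_rng.standard_t(df=4, size=80) * 1.5
print(fligner(_fk_a, _fk_b))
print(bartlett(_fk_a, _fk_b))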
  2080. def mood(x, y, axis=0):
  2081. """
  2082. Perform Mood's test for equal scale parameters.
  2083. Mood's two-sample test for scale parameters is a non-parametric
  2084. test for the null hypothesis that two samples are drawn from the
  2085. same distribution with the same scale parameter.
  2086. Parameters
  2087. ----------
  2088. x, y : array_like
  2089. Arrays of sample data.
  2090. axis : int, optional
  2091. The axis along which the samples are tested. `x` and `y` can be of
  2092. different length along `axis`.
  2093. If `axis` is None, `x` and `y` are flattened and the test is done on
  2094. all values in the flattened arrays.
  2095. Returns
  2096. -------
  2097. z : scalar or ndarray
  2098. The z-score for the hypothesis test. For 1-D inputs a scalar is
  2099. returned.
2100. p-value : scalar or ndarray
  2101. The p-value for the hypothesis test.
  2102. See Also
  2103. --------
  2104. fligner : A non-parametric test for the equality of k variances
  2105. ansari : A non-parametric test for the equality of 2 variances
  2106. bartlett : A parametric test for equality of k variances in normal samples
  2107. levene : A parametric test for equality of k variances
  2108. Notes
  2109. -----
  2110. The data are assumed to be drawn from probability distributions ``f(x)``
  2111. and ``f(x/s) / s`` respectively, for some probability density function f.
  2112. The null hypothesis is that ``s == 1``.
  2113. For multi-dimensional arrays, if the inputs are of shapes
  2114. ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
  2115. resulting z and p values will have shape ``(n0, n2, n3)``. Note that
  2116. ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
  2117. Examples
  2118. --------
  2119. >>> from scipy import stats
  2120. >>> np.random.seed(1234)
  2121. >>> x2 = np.random.randn(2, 45, 6, 7)
  2122. >>> x1 = np.random.randn(2, 30, 6, 7)
  2123. >>> z, p = stats.mood(x1, x2, axis=1)
  2124. >>> p.shape
  2125. (2, 6, 7)
  2126. Find the number of points where the difference in scale is not significant:
  2127. >>> (p > 0.1).sum()
  2128. 74
  2129. Perform the test with different scales:
  2130. >>> x1 = np.random.randn(2, 30)
  2131. >>> x2 = np.random.randn(2, 35) * 10.0
  2132. >>> stats.mood(x1, x2, axis=1)
  2133. (array([-5.7178125 , -5.25342163]), array([ 1.07904114e-08, 1.49299218e-07]))
  2134. """
  2135. x = np.asarray(x, dtype=float)
  2136. y = np.asarray(y, dtype=float)
  2137. if axis is None:
  2138. x = x.flatten()
  2139. y = y.flatten()
  2140. axis = 0
  2141. # Determine shape of the result arrays
  2142. res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis])
  2143. if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if
  2144. ax != axis])):
  2145. raise ValueError("Dimensions of x and y on all axes except `axis` "
  2146. "should match")
  2147. n = x.shape[axis]
  2148. m = y.shape[axis]
  2149. N = m + n
  2150. if N < 3:
  2151. raise ValueError("Not enough observations.")
  2152. xy = np.concatenate((x, y), axis=axis)
  2153. if axis != 0:
  2154. xy = np.rollaxis(xy, axis)
  2155. xy = xy.reshape(xy.shape[0], -1)
  2156. # Generalized to the n-dimensional case by adding the axis argument, and
  2157. # using for loops, since rankdata is not vectorized. For improving
  2158. # performance consider vectorizing rankdata function.
  2159. all_ranks = np.zeros_like(xy)
  2160. for j in range(xy.shape[1]):
  2161. all_ranks[:, j] = stats.rankdata(xy[:, j])
  2162. Ri = all_ranks[:n]
  2163. M = np.sum((Ri - (N + 1.0) / 2)**2, axis=0)
  2164. # Approx stat.
  2165. mnM = n * (N * N - 1.0) / 12
  2166. varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
  2167. z = (M - mnM) / sqrt(varM)
  2168. # sf for right tail, cdf for left tail. Factor 2 for two-sidedness
  2169. z_pos = z > 0
  2170. pval = np.zeros_like(z)
  2171. pval[z_pos] = 2 * distributions.norm.sf(z[z_pos])
  2172. pval[~z_pos] = 2 * distributions.norm.cdf(z[~z_pos])
  2173. if res_shape == ():
  2174. # Return scalars, not 0-D arrays
  2175. z = z[0]
  2176. pval = pval[0]
  2177. else:
  2178. z.shape = res_shape
  2179. pval.shape = res_shape
  2180. return z, pval
  2181. WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue'))
  2182. def wilcoxon(x, y=None, zero_method="wilcox", correction=False):
  2183. """
  2184. Calculate the Wilcoxon signed-rank test.
  2185. The Wilcoxon signed-rank test tests the null hypothesis that two
  2186. related paired samples come from the same distribution. In particular,
  2187. it tests whether the distribution of the differences x - y is symmetric
  2188. about zero. It is a non-parametric version of the paired T-test.
  2189. Parameters
  2190. ----------
  2191. x : array_like
  2192. The first set of measurements.
  2193. y : array_like, optional
  2194. The second set of measurements. If `y` is not given, then the `x`
  2195. array is considered to be the differences between the two sets of
  2196. measurements.
  2197. zero_method : string, {"pratt", "wilcox", "zsplit"}, optional
  2198. "pratt":
  2199. Pratt treatment: includes zero-differences in the ranking process
  2200. (more conservative)
  2201. "wilcox":
  2202. Wilcox treatment: discards all zero-differences
  2203. "zsplit":
  2204. Zero rank split: just like Pratt, but splitting the zero rank
  2205. between positive and negative ones
  2206. correction : bool, optional
  2207. If True, apply continuity correction by adjusting the Wilcoxon rank
  2208. statistic by 0.5 towards the mean value when computing the
  2209. z-statistic. Default is False.
  2210. Returns
  2211. -------
  2212. statistic : float
  2213. The sum of the ranks of the differences above or below zero, whichever
  2214. is smaller.
  2215. pvalue : float
  2216. The two-sided p-value for the test.
  2217. Notes
  2218. -----
  2219. Because the normal approximation is used for the calculations, the
  2220. samples used should be large. A typical rule is to require that
  2221. n > 20.
  2222. References
  2223. ----------
  2224. .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
  2225. """
  2226. if zero_method not in ["wilcox", "pratt", "zsplit"]:
  2227. raise ValueError("Zero method should be either 'wilcox' "
  2228. "or 'pratt' or 'zsplit'")
  2229. if y is None:
  2230. d = asarray(x)
  2231. else:
  2232. x, y = map(asarray, (x, y))
  2233. if len(x) != len(y):
  2234. raise ValueError('Unequal N in wilcoxon. Aborting.')
  2235. d = x - y
  2236. if zero_method == "wilcox":
  2237. # Keep all non-zero differences
  2238. d = compress(np.not_equal(d, 0), d, axis=-1)
  2239. count = len(d)
  2240. if count < 10:
  2241. warnings.warn("Warning: sample size too small for normal approximation.")
  2242. r = stats.rankdata(abs(d))
  2243. r_plus = np.sum((d > 0) * r, axis=0)
  2244. r_minus = np.sum((d < 0) * r, axis=0)
  2245. if zero_method == "zsplit":
  2246. r_zero = np.sum((d == 0) * r, axis=0)
  2247. r_plus += r_zero / 2.
  2248. r_minus += r_zero / 2.
  2249. T = min(r_plus, r_minus)
  2250. mn = count * (count + 1.) * 0.25
  2251. se = count * (count + 1.) * (2. * count + 1.)
  2252. if zero_method == "pratt":
  2253. r = r[d != 0]
  2254. replist, repnum = find_repeats(r)
  2255. if repnum.size != 0:
  2256. # Correction for repeated elements.
  2257. se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
  2258. se = sqrt(se / 24)
  2259. correction = 0.5 * int(bool(correction)) * np.sign(T - mn)
  2260. z = (T - mn - correction) / se
  2261. prob = 2. * distributions.norm.sf(abs(z))
  2262. return WilcoxonResult(T, prob)
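# A minimal usage sketch (illustrative only): paired Wilcoxon signed-rank test,
# called once with the paired samples and once with their precomputed
# differences (the two calls are equivalent).  n = 30 satisfies the n > 20 rule
# of thumb above.  Data and names are hypothetical.
import numpy as np

_wx_rng = np.random.RandomState(0)
_wx_before = _wx_rng.normal(loc=10.0, size=30)
_wx_after = _wx_before + _wx_rng.normal(loc=0.5, size=30)
print(wilcoxon(_wx_before, _wx_after))
print(wilcoxon(_wx_before - _wx_after))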
  2263. def median_test(*args, **kwds):
  2264. """
  2265. Mood's median test.
  2266. Test that two or more samples come from populations with the same median.
  2267. Let ``n = len(args)`` be the number of samples. The "grand median" of
  2268. all the data is computed, and a contingency table is formed by
  2269. classifying the values in each sample as being above or below the grand
  2270. median. The contingency table, along with `correction` and `lambda_`,
  2271. are passed to `scipy.stats.chi2_contingency` to compute the test statistic
  2272. and p-value.
  2273. Parameters
  2274. ----------
  2275. sample1, sample2, ... : array_like
  2276. The set of samples. There must be at least two samples.
  2277. Each sample must be a one-dimensional sequence containing at least
  2278. one value. The samples are not required to have the same length.
  2279. ties : str, optional
  2280. Determines how values equal to the grand median are classified in
  2281. the contingency table. The string must be one of::
  2282. "below":
  2283. Values equal to the grand median are counted as "below".
  2284. "above":
  2285. Values equal to the grand median are counted as "above".
  2286. "ignore":
  2287. Values equal to the grand median are not counted.
  2288. The default is "below".
  2289. correction : bool, optional
  2290. If True, *and* there are just two samples, apply Yates' correction
  2291. for continuity when computing the test statistic associated with
  2292. the contingency table. Default is True.
  2293. lambda_ : float or str, optional.
  2294. By default, the statistic computed in this test is Pearson's
  2295. chi-squared statistic. `lambda_` allows a statistic from the
  2296. Cressie-Read power divergence family to be used instead. See
  2297. `power_divergence` for details.
  2298. Default is 1 (Pearson's chi-squared statistic).
  2299. nan_policy : {'propagate', 'raise', 'omit'}, optional
  2300. Defines how to handle when input contains nan. 'propagate' returns nan,
  2301. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  2302. values. Default is 'propagate'.
  2303. Returns
  2304. -------
  2305. stat : float
  2306. The test statistic. The statistic that is returned is determined by
  2307. `lambda_`. The default is Pearson's chi-squared statistic.
  2308. p : float
  2309. The p-value of the test.
  2310. m : float
  2311. The grand median.
  2312. table : ndarray
  2313. The contingency table. The shape of the table is (2, n), where
  2314. n is the number of samples. The first row holds the counts of the
  2315. values above the grand median, and the second row holds the counts
  2316. of the values below the grand median. The table allows further
  2317. analysis with, for example, `scipy.stats.chi2_contingency`, or with
  2318. `scipy.stats.fisher_exact` if there are two samples, without having
  2319. to recompute the table. If ``nan_policy`` is "propagate" and there
  2320. are nans in the input, the return value for ``table`` is ``None``.
  2321. See Also
  2322. --------
  2323. kruskal : Compute the Kruskal-Wallis H-test for independent samples.
  2324. mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y.
  2325. Notes
  2326. -----
  2327. .. versionadded:: 0.15.0
  2328. References
  2329. ----------
  2330. .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
  2331. (1950), pp. 394-399.
  2332. .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
  2333. See Sections 8.12 and 10.15.

    Examples
    --------
    A biologist runs an experiment in which there are three groups of plants.
    Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
    Each plant produces a number of seeds. The seed counts for each group
    are::

        Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
        Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
        Group 3:  0  3  9 22 23 25 25 33 34 34 40 45 46 48 62 67 84

    The following code applies Mood's median test to these samples.

    >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
    >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
    >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
    >>> from scipy.stats import median_test
    >>> stat, p, med, tbl = median_test(g1, g2, g3)

    The median is

    >>> med
    34.0

    and the contingency table is

    >>> tbl
    array([[ 5, 10,  7],
           [11,  5, 10]])

    `p` is too large to conclude that the medians are not the same:

    >>> p
    0.12609082774093244

    The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
    `median_test`.

    >>> g, p, med, tbl = median_test(g1, g2, g3, lambda_="log-likelihood")
    >>> p
    0.12224779737117837

    The median occurs several times in the data, so we'll get a different
    result if, for example, ``ties="above"`` is used:

    >>> stat, p, med, tbl = median_test(g1, g2, g3, ties="above")
    >>> p
    0.063873276069553273
    >>> tbl
    array([[ 5, 11,  9],
           [11,  4,  8]])

    This example demonstrates that if the data set is not large and there
    are values equal to the median, the p-value can be sensitive to the
    choice of `ties`.

    """
    ties = kwds.pop('ties', 'below')
    correction = kwds.pop('correction', True)
    lambda_ = kwds.pop('lambda_', None)
    nan_policy = kwds.pop('nan_policy', 'propagate')

    if len(kwds) > 0:
        # dict views are not indexable in Python 3, so convert to a list
        # before picking out the offending keyword.
        bad_kwd = list(kwds.keys())[0]
        raise TypeError("median_test() got an unexpected keyword "
                        "argument %r" % bad_kwd)

    if len(args) < 2:
        raise ValueError('median_test requires two or more samples.')

    ties_options = ['below', 'above', 'ignore']
    if ties not in ties_options:
        raise ValueError("invalid 'ties' option '%s'; 'ties' must be one "
                         "of: %s" % (ties, str(ties_options)[1:-1]))

    data = [np.asarray(arg) for arg in args]

    # Validate the sizes and shapes of the arguments.
    for k, d in enumerate(data):
        if d.size == 0:
            raise ValueError("Sample %d is empty. All samples must "
                             "contain at least one value." % (k + 1))
        if d.ndim != 1:
            raise ValueError("Sample %d has %d dimensions. All "
                             "samples must be one-dimensional sequences." %
                             (k + 1, d.ndim))

    cdata = np.concatenate(data)
    contains_nan, nan_policy = _contains_nan(cdata, nan_policy)
    if contains_nan and nan_policy == 'propagate':
        return np.nan, np.nan, np.nan, None

    if contains_nan:
        grand_median = np.median(cdata[~np.isnan(cdata)])
    else:
        grand_median = np.median(cdata)
    # When the minimum version of numpy supported by scipy is 1.9.0,
    # the above if/else statement can be replaced by the single line:
    #     grand_median = np.nanmedian(cdata)

    # Create the contingency table.
    table = np.zeros((2, len(data)), dtype=np.int64)
    for k, sample in enumerate(data):
        sample = sample[~np.isnan(sample)]

        nabove = count_nonzero(sample > grand_median)
        nbelow = count_nonzero(sample < grand_median)
        nequal = sample.size - (nabove + nbelow)
        table[0, k] += nabove
        table[1, k] += nbelow
        if ties == "below":
            table[1, k] += nequal
        elif ties == "above":
            table[0, k] += nequal

    # Check that no row or column of the table is all zero.
    # Such a table can not be given to chi2_contingency, because it would have
    # a zero in the table of expected frequencies.
    rowsums = table.sum(axis=1)
    if rowsums[0] == 0:
        raise ValueError("All values are below the grand median (%r)." %
                         grand_median)
    if rowsums[1] == 0:
        raise ValueError("All values are above the grand median (%r)." %
                         grand_median)
    if ties == "ignore":
        # We already checked that each sample has at least one value, but it
        # is possible that all those values equal the grand median. If `ties`
        # is "ignore", that would result in a column of zeros in `table`. We
        # check for that case here.
        zero_cols = np.nonzero((table == 0).all(axis=0))[0]
        if len(zero_cols) > 0:
            msg = ("All values in sample %d are equal to the grand "
                   "median (%r), so they are ignored, resulting in an "
                   "empty sample." % (zero_cols[0] + 1, grand_median))
            raise ValueError(msg)

    stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
                                              correction=correction)
    return stat, p, grand_median, table
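

# Illustrative sketch (not part of the original module): as the docstring
# above notes, the contingency table returned by `median_test` can be reused
# directly for Fisher's exact test when exactly two samples are given.  The
# helper name `_median_test_fisher_demo` is hypothetical, and `fisher_exact`
# is imported locally to keep the sketch self-contained.
def _median_test_fisher_demo(sample1, sample2):
    from scipy.stats import fisher_exact
    # Mood's median test: `tbl` is the 2x2 table of counts above/below the
    # grand median for the two samples.
    stat, p_chi2, med, tbl = median_test(sample1, sample2)
    # Reuse the table for Fisher's exact test instead of recomputing it.
    oddsratio, p_exact = fisher_exact(tbl)
    return p_chi2, p_exact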


def _circfuncs_common(samples, high, low):
    samples = np.asarray(samples)
    if samples.size == 0:
        return np.nan, np.nan

    # Map the samples from the interval [low, high) onto angles in
    # [0, 2*pi) radians; the circular statistics are computed on `ang`.
    ang = (samples - low)*2.*pi / (high - low)
    return samples, ang


def circmean(samples, high=2*pi, low=0, axis=None):
    """
    Compute the circular mean for samples in a range.

    Parameters
    ----------
    samples : array_like
        Input array.
    high : float or int, optional
        High boundary for circular mean range. Default is ``2*pi``.
    low : float or int, optional
        Low boundary for circular mean range. Default is 0.
    axis : int, optional
        Axis along which means are computed. The default is to compute
        the mean of the flattened array.

    Returns
    -------
    circmean : float
        Circular mean.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import circmean
    >>> circmean([0.1, 2*np.pi+0.2, 6*np.pi+0.3])
    0.2

    >>> circmean([0.2, 1.4, 2.6], high=1, low=0)
    0.4

    """
    samples, ang = _circfuncs_common(samples, high, low)
    S = sin(ang).sum(axis=axis)
    C = cos(ang).sum(axis=axis)
    res = arctan2(S, C)
    mask = res < 0
    if mask.ndim > 0:
        res[mask] += 2*pi
    elif mask:
        res += 2*pi
    return res*(high - low)/2.0/pi + low
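

# Illustrative sketch (not part of the original module): a hypothetical helper
# showing why a circular mean is needed for wrap-around data such as hour of
# day, where the arithmetic mean of times around midnight is misleading.
def _circmean_hours_demo():
    import numpy as np
    hours = np.array([23.0, 1.0])    # one hour either side of midnight
    arithmetic = np.mean(hours)      # 12.0, far from both observations
    # The circular mean lands at midnight: approximately 0 hours (which may
    # appear numerically as a value just below 24 because of wrap-around).
    circular = circmean(hours, high=24, low=0)
    return arithmetic, circular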


def circvar(samples, high=2*pi, low=0, axis=None):
    """
    Compute the circular variance for samples assumed to be in a range.

    Parameters
    ----------
    samples : array_like
        Input array.
    low : float or int, optional
        Low boundary for circular variance range. Default is 0.
    high : float or int, optional
        High boundary for circular variance range. Default is ``2*pi``.
    axis : int, optional
        Axis along which variances are computed. The default is to compute
        the variance of the flattened array.

    Returns
    -------
    circvar : float
        Circular variance.

    Notes
    -----
    This uses a definition of circular variance that in the limit of small
    angles returns a number close to the 'linear' variance.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import circvar
    >>> circvar([0, 2*np.pi/3, 5*np.pi/3])
    2.19722457734

    """
    samples, ang = _circfuncs_common(samples, high, low)
    S = sin(ang).mean(axis=axis)
    C = cos(ang).mean(axis=axis)
    R = hypot(S, C)
    return ((high - low)/2.0/pi)**2 * 2 * log(1/R)
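

# Illustrative sketch (not part of the original module): for tightly clustered
# angles the circular variance defined above is close to the ordinary
# ('linear') variance, as stated in the Notes.  The helper name is
# hypothetical.
def _circvar_small_angle_demo():
    import numpy as np
    x = np.array([0.01, 0.02, 0.03])
    # Both values are approximately 6.7e-5 for this sample.
    return circvar(x), np.var(x)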


def circstd(samples, high=2*pi, low=0, axis=None):
    """
    Compute the circular standard deviation for samples assumed to be in the
    range [low to high].

    Parameters
    ----------
    samples : array_like
        Input array.
    low : float or int, optional
        Low boundary for circular standard deviation range. Default is 0.
    high : float or int, optional
        High boundary for circular standard deviation range.
        Default is ``2*pi``.
    axis : int, optional
        Axis along which standard deviations are computed. The default is
        to compute the standard deviation of the flattened array.

    Returns
    -------
    circstd : float
        Circular standard deviation.

    Notes
    -----
    This uses a definition of circular standard deviation that in the limit of
    small angles returns a number close to the 'linear' standard deviation.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import circstd
    >>> circstd([0, 0.1*np.pi/2, 0.001*np.pi, 0.03*np.pi/2])
    0.063564063306

    """
    samples, ang = _circfuncs_common(samples, high, low)
    S = sin(ang).mean(axis=axis)
    C = cos(ang).mean(axis=axis)
    R = hypot(S, C)
    return ((high - low)/2.0/pi) * sqrt(-2*log(R))
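

# Illustrative sketch (not part of the original module): with the definitions
# used above, the circular standard deviation equals the square root of the
# circular variance, since both are built from R = hypot(S, C).  The helper
# name is hypothetical.
def _circstd_vs_circvar_demo(samples):
    import numpy as np
    return np.allclose(circstd(samples), np.sqrt(circvar(samples)))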