# Copyright 2002 Gary Strangman. All rights reserved
# Copyright 2002-2016 The SciPy Developers
#
# The original code from Gary Strangman was heavily adapted for
# use in SciPy by Travis Oliphant. The original code came with the
# following disclaimer:
#
# This software is provided "as-is". There are no expressed or implied
# warranties of any kind, including, but not limited to, the warranties
# of merchantability and fitness for a given application. In no event
# shall Gary Strangman be liable for any direct, indirect, incidental,
# special, exemplary or consequential damages (including, but not limited
# to, loss of use, data or profits, or business interruption) however
# caused and on any theory of liability, whether in contract, strict
# liability or tort (including negligence or otherwise) arising in any way
# out of the use of this software, even if advised of the possibility of
# such damage.
"""
A collection of basic statistical functions for Python. The function
names appear below.

Some scalar functions defined here are also available in the scipy.special
package where they work on arbitrary sized arrays.

Disclaimers: The function list is obviously incomplete and, worse, the
functions are not optimized. All functions have been tested (some more
so than others), but they are far from bulletproof. Thus, as with any
free software, no warranty or guarantee is expressed or implied. :-) A
few extra functions that don't appear in the list below can be found by
interested treasure-hunters. These functions don't necessarily have
both list and array versions but were deemed useful.

Central Tendency
----------------
.. autosummary::
   :toctree: generated/

   gmean
   hmean
   mode

Moments
-------
.. autosummary::
   :toctree: generated/

   moment
   variation
   skew
   kurtosis
   normaltest

Altered Versions
----------------
.. autosummary::
   :toctree: generated/

   tmean
   tvar
   tstd
   tsem
   describe

Frequency Stats
---------------
.. autosummary::
   :toctree: generated/

   itemfreq
   scoreatpercentile
   percentileofscore
   cumfreq
   relfreq

Variability
-----------
.. autosummary::
   :toctree: generated/

   obrientransform
   sem
   zmap
   zscore
   iqr

Trimming Functions
------------------
.. autosummary::
   :toctree: generated/

   trimboth
   trim1

Correlation Functions
---------------------
.. autosummary::
   :toctree: generated/

   pearsonr
   fisher_exact
   spearmanr
   pointbiserialr
   kendalltau
   weightedtau
   linregress
   theilslopes

Inferential Stats
-----------------
.. autosummary::
   :toctree: generated/

   ttest_1samp
   ttest_ind
   ttest_ind_from_stats
   ttest_rel
   chisquare
   power_divergence
   ks_2samp
   mannwhitneyu
   ranksums
   wilcoxon
   kruskal
   friedmanchisquare
   brunnermunzel
   combine_pvalues

Statistical Distances
---------------------
.. autosummary::
   :toctree: generated/

   wasserstein_distance
   energy_distance

ANOVA Functions
---------------
.. autosummary::
   :toctree: generated/

   f_oneway

Support Functions
-----------------
.. autosummary::
   :toctree: generated/

   rankdata
   rvs_ratio_uniforms

References
----------
.. [CRCProbStat2000] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
   Probability and Statistics Tables and Formulae. Chapman & Hall: New
   York. 2000.

"""

from __future__ import division, print_function, absolute_import

import warnings
import math
from collections import namedtuple

import numpy as np
from numpy import array, asarray, ma

from scipy._lib.six import callable, string_types
from scipy._lib._version import NumpyVersion
from scipy._lib._util import _lazywhere
import scipy.special as special
from . import distributions
from . import mstats_basic
from ._stats_mstats_common import (_find_repeats, linregress, theilslopes,
                                   siegelslopes)
from ._stats import _kendall_dis, _toint64, _weightedrankedtau
from ._rvs_sampling import rvs_ratio_uniforms


__all__ = ['find_repeats', 'gmean', 'hmean', 'mode', 'tmean', 'tvar',
           'tmin', 'tmax', 'tstd', 'tsem', 'moment', 'variation',
           'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
           'normaltest', 'jarque_bera', 'itemfreq',
           'scoreatpercentile', 'percentileofscore',
           'cumfreq', 'relfreq', 'obrientransform',
           'sem', 'zmap', 'zscore', 'iqr',
           'sigmaclip', 'trimboth', 'trim1', 'trim_mean', 'f_oneway',
           'pearsonr', 'fisher_exact', 'spearmanr', 'pointbiserialr',
           'kendalltau', 'weightedtau',
           'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
           'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel', 'kstest',
           'chisquare', 'power_divergence', 'ks_2samp', 'mannwhitneyu',
           'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
           'rankdata', 'rvs_ratio_uniforms',
           'combine_pvalues', 'wasserstein_distance', 'energy_distance',
           'brunnermunzel']


def _chk_asarray(a, axis):
    if axis is None:
        a = np.ravel(a)
        outaxis = 0
    else:
        a = np.asarray(a)
        outaxis = axis

    if a.ndim == 0:
        a = np.atleast_1d(a)

    return a, outaxis


def _chk2_asarray(a, b, axis):
    if axis is None:
        a = np.ravel(a)
        b = np.ravel(b)
        outaxis = 0
    else:
        a = np.asarray(a)
        b = np.asarray(b)
        outaxis = axis

    if a.ndim == 0:
        a = np.atleast_1d(a)
    if b.ndim == 0:
        b = np.atleast_1d(b)

    return a, b, outaxis
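

def _demo_chk_asarray():
    # Illustrative sketch only -- a hypothetical helper added for exposition,
    # not part of the upstream module. It shows how _chk_asarray ravels the
    # input when axis is None and promotes 0-d input to 1-d, so downstream
    # reductions always have an axis to work along.
    flat, flat_axis = _chk_asarray([[1, 2], [3, 4]], axis=None)  # shape (4,), axis 0
    scalar, scalar_axis = _chk_asarray(5, axis=0)                # shape (1,), axis 0
    return flat, flat_axis, scalar, scalar_axis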


def _contains_nan(a, nan_policy='propagate'):
    policies = ['propagate', 'raise', 'omit']
    if nan_policy not in policies:
        raise ValueError("nan_policy must be one of {%s}" %
                         ', '.join("'%s'" % s for s in policies))
    try:
        # Calling np.sum to avoid creating a huge array into memory
        # e.g. np.isnan(a).any()
        with np.errstate(invalid='ignore'):
            contains_nan = np.isnan(np.sum(a))
    except TypeError:
        # If the check cannot be properly performed we fallback to omitting
        # nan values and raising a warning. This can happen when attempting to
        # sum things that are not numbers (e.g. as in the function `mode`).
        contains_nan = False
        nan_policy = 'omit'
        warnings.warn("The input array could not be properly checked for nan "
                      "values. nan values will be ignored.", RuntimeWarning)

    if contains_nan and nan_policy == 'raise':
        raise ValueError("The input contains nan values")

    return (contains_nan, nan_policy)
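

def _demo_contains_nan():
    # Illustrative sketch only -- a hypothetical helper added for exposition,
    # not part of the upstream module. _contains_nan reports whether NaNs are
    # present and echoes back the (possibly adjusted) policy; with
    # nan_policy='raise' and a NaN present it raises ValueError instead.
    has_nan, policy = _contains_nan(np.array([1.0, np.nan, 3.0]))        # (True, 'propagate')
    clean, policy = _contains_nan(np.array([1.0, 2.0]), nan_policy='omit')  # (False, 'omit')
    return has_nan, clean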


def gmean(a, axis=0, dtype=None):
    """
    Compute the geometric mean along the specified axis.

    Return the geometric average of the array elements.
    That is:  n-th root of (x1 * x2 * ... * xn)

    Parameters
    ----------
    a : array_like
        Input array or object that can be converted to an array.
    axis : int or None, optional
        Axis along which the geometric mean is computed. Default is 0.
        If None, compute over the whole array `a`.
    dtype : dtype, optional
        Type of the returned array and of the accumulator in which the
        elements are summed. If dtype is not specified, it defaults to the
        dtype of a, unless a has an integer dtype with a precision less than
        that of the default platform integer. In that case, the default
        platform integer is used.

    Returns
    -------
    gmean : ndarray
        see dtype parameter above

    See Also
    --------
    numpy.mean : Arithmetic average
    numpy.average : Weighted average
    hmean : Harmonic mean

    Notes
    -----
    The geometric average is computed over a single dimension of the input
    array, axis=0 by default, or all values in the array if axis=None.
    float64 intermediate and return values are used for integer inputs.

    Use masked arrays to ignore any non-finite values in the input or that
    arise in the calculations such as Not a Number and infinity because masked
    arrays automatically mask any non-finite values.

    Examples
    --------
    >>> from scipy.stats import gmean
    >>> gmean([1, 4])
    2.0
    >>> gmean([1, 2, 3, 4, 5, 6, 7])
    3.3800151591412964

    """
    if not isinstance(a, np.ndarray):
        # if not an ndarray object attempt to convert it
        log_a = np.log(np.array(a, dtype=dtype))
    elif dtype:
        # Must change the default dtype allowing array type
        if isinstance(a, np.ma.MaskedArray):
            log_a = np.log(np.ma.asarray(a, dtype=dtype))
        else:
            log_a = np.log(np.asarray(a, dtype=dtype))
    else:
        log_a = np.log(a)
    return np.exp(log_a.mean(axis=axis))
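

def _demo_gmean_log_identity():
    # Illustrative sketch only -- a hypothetical helper added for exposition,
    # not part of the upstream module. gmean is implemented as the exponential
    # of the mean of the logs, so the two expressions below agree (up to
    # floating point rounding) for strictly positive data.
    x = np.array([1.0, 2.0, 3.0, 4.0])
    direct = np.exp(np.log(x).mean())
    return np.allclose(gmean(x), direct)   # True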


def hmean(a, axis=0, dtype=None):
    """
    Calculate the harmonic mean along the specified axis.

    That is:  n / (1/x1 + 1/x2 + ... + 1/xn)

    Parameters
    ----------
    a : array_like
        Input array, masked array or object that can be converted to an array.
    axis : int or None, optional
        Axis along which the harmonic mean is computed. Default is 0.
        If None, compute over the whole array `a`.
    dtype : dtype, optional
        Type of the returned array and of the accumulator in which the
        elements are summed. If `dtype` is not specified, it defaults to the
        dtype of `a`, unless `a` has an integer `dtype` with a precision less
        than that of the default platform integer. In that case, the default
        platform integer is used.

    Returns
    -------
    hmean : ndarray
        see `dtype` parameter above

    See Also
    --------
    numpy.mean : Arithmetic average
    numpy.average : Weighted average
    gmean : Geometric mean

    Notes
    -----
    The harmonic mean is computed over a single dimension of the input
    array, axis=0 by default, or all values in the array if axis=None.
    float64 intermediate and return values are used for integer inputs.

    Use masked arrays to ignore any non-finite values in the input or that
    arise in the calculations such as Not a Number and infinity.

    Examples
    --------
    >>> from scipy.stats import hmean
    >>> hmean([1, 4])
    1.6000000000000001
    >>> hmean([1, 2, 3, 4, 5, 6, 7])
    2.6997245179063363

    """
    if not isinstance(a, np.ndarray):
        a = np.array(a, dtype=dtype)
    if np.all(a > 0):
        # Harmonic mean only defined if greater than zero
        if isinstance(a, np.ma.MaskedArray):
            size = a.count(axis)
        else:
            if axis is None:
                a = a.ravel()
                size = a.shape[0]
            else:
                size = a.shape[axis]
        return size / np.sum(1.0 / a, axis=axis, dtype=dtype)
    else:
        raise ValueError("Harmonic mean only defined if all elements greater "
                         "than zero")


ModeResult = namedtuple('ModeResult', ('mode', 'count'))


def mode(a, axis=0, nan_policy='propagate'):
    """
    Return an array of the modal (most common) value in the passed array.

    If there is more than one such value, only the smallest is returned.
    The bin-count for the modal bins is also returned.

    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.

    Examples
    --------
    >>> a = np.array([[6, 8, 3, 0],
    ...               [3, 2, 1, 7],
    ...               [8, 1, 8, 4],
    ...               [5, 3, 0, 5],
    ...               [4, 7, 5, 9]])
    >>> from scipy import stats
    >>> stats.mode(a)
    (array([[3, 1, 0, 0]]), array([[1, 1, 1, 1]]))

    To get mode of whole array, specify ``axis=None``:

    >>> stats.mode(a, axis=None)
    (array([3]), array([3]))

    """
    a, axis = _chk_asarray(a, axis)
    if a.size == 0:
        return ModeResult(np.array([]), np.array([]))

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.mode(a, axis)

    if (NumpyVersion(np.__version__) < '1.9.0') or (a.dtype == object and np.nan in set(a)):
        # Fall back to a slower method since np.unique does not work with NaN
        # or for older numpy which does not support return_counts
        scores = set(np.ravel(a))  # get ALL unique values
        testshape = list(a.shape)
        testshape[axis] = 1
        oldmostfreq = np.zeros(testshape, dtype=a.dtype)
        oldcounts = np.zeros(testshape, dtype=int)

        for score in scores:
            template = (a == score)
            counts = np.expand_dims(np.sum(template, axis), axis)
            mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
            oldcounts = np.maximum(counts, oldcounts)
            oldmostfreq = mostfrequent

        return ModeResult(mostfrequent, oldcounts)

    def _mode1D(a):
        vals, cnts = np.unique(a, return_counts=True)
        return vals[cnts.argmax()], cnts.max()

    # np.apply_along_axis will convert the _mode1D tuples to a numpy array,
    # casting types in the process.
    # This recreates the results without that issue
    # View of a, rotated so the requested axis is last
    in_dims = list(range(a.ndim))
    a_view = np.transpose(a, in_dims[:axis] + in_dims[axis+1:] + [axis])

    inds = np.ndindex(a_view.shape[:-1])
    modes = np.empty(a_view.shape[:-1], dtype=a.dtype)
    counts = np.zeros(a_view.shape[:-1], dtype=np.int)
    for ind in inds:
        modes[ind], counts[ind] = _mode1D(a_view[ind])
    newshape = list(a.shape)
    newshape[axis] = 1
    return ModeResult(modes.reshape(newshape), counts.reshape(newshape))
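

def _demo_mode_axis_view():
    # Illustrative sketch only -- a hypothetical helper added for exposition,
    # not part of the upstream module. mode() rotates the requested axis to
    # the last position so each leading index picks out one 1-D slice; for
    # axis=0 on a 2-D array, each slice is a column of the input.
    a = np.array([[6, 8, 3, 0],
                  [3, 2, 1, 7],
                  [8, 1, 8, 4]])
    axis = 0
    in_dims = list(range(a.ndim))
    a_view = np.transpose(a, in_dims[:axis] + in_dims[axis + 1:] + [axis])
    return a_view[0]   # array([6, 3, 8]): the first column, ready for a 1-D mode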


def _mask_to_limits(a, limits, inclusive):
    """Mask an array for values outside of given limits.

    This is primarily a utility function.

    Parameters
    ----------
    a : array
    limits : (float or None, float or None)
        A tuple consisting of the (lower limit, upper limit). Values in the
        input array less than the lower limit or greater than the upper limit
        will be masked out. None implies no limit.
    inclusive : (bool, bool)
        A tuple consisting of the (lower flag, upper flag). These flags
        determine whether values exactly equal to lower or upper are allowed.

    Returns
    -------
    A MaskedArray.

    Raises
    ------
    A ValueError if there are no values within the given limits.

    """
    lower_limit, upper_limit = limits
    lower_include, upper_include = inclusive
    am = ma.MaskedArray(a)
    if lower_limit is not None:
        if lower_include:
            am = ma.masked_less(am, lower_limit)
        else:
            am = ma.masked_less_equal(am, lower_limit)

    if upper_limit is not None:
        if upper_include:
            am = ma.masked_greater(am, upper_limit)
        else:
            am = ma.masked_greater_equal(am, upper_limit)

    if am.count() == 0:
        raise ValueError("No array values within given limits")

    return am
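

def _demo_mask_to_limits():
    # Illustrative sketch only -- a hypothetical helper added for exposition,
    # not part of the upstream module. With limits (2, 7) and inclusive
    # (False, True), values <= 2 and values > 7 are masked out, so only
    # 3, 4, 5, 6, 7 survive and count() reflects that.
    a = np.arange(10)
    am = _mask_to_limits(a, (2, 7), (False, True))
    return am.count()   # 5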


def tmean(a, limits=None, inclusive=(True, True), axis=None):
    """
    Compute the trimmed mean.

    This function finds the arithmetic mean of given values, ignoring values
    outside the given `limits`.

    Parameters
    ----------
    a : array_like
        Array of values.
    limits : None or (lower limit, upper limit), optional
        Values in the input array less than the lower limit or greater than the
        upper limit will be ignored. When limits is None (default), then all
        values are used. Either of the limit values in the tuple can also be
        None representing a half-open interval.
    inclusive : (bool, bool), optional
        A tuple consisting of the (lower flag, upper flag). These flags
        determine whether values exactly equal to the lower or upper limits
        are included. The default value is (True, True).
    axis : int or None, optional
        Axis along which to compute test. Default is None.

    Returns
    -------
    tmean : float

    See also
    --------
    trim_mean : returns mean after trimming a proportion from both tails.

    Examples
    --------
    >>> from scipy import stats
    >>> x = np.arange(20)
    >>> stats.tmean(x)
    9.5
    >>> stats.tmean(x, (3,17))
    10.0

    """
    a = asarray(a)
    if limits is None:
        return np.mean(a, None)

    am = _mask_to_limits(a.ravel(), limits, inclusive)
    return am.mean(axis=axis)


def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
    """
    Compute the trimmed variance.

    This function computes the sample variance of an array of values,
    while ignoring values which are outside of given `limits`.

    Parameters
    ----------
    a : array_like
        Array of values.
    limits : None or (lower limit, upper limit), optional
        Values in the input array less than the lower limit or greater than the
        upper limit will be ignored. When limits is None, then all values are
        used. Either of the limit values in the tuple can also be None
        representing a half-open interval. The default value is None.
    inclusive : (bool, bool), optional
        A tuple consisting of the (lower flag, upper flag). These flags
        determine whether values exactly equal to the lower or upper limits
        are included. The default value is (True, True).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over the
        whole array `a`.
    ddof : int, optional
        Delta degrees of freedom. Default is 1.

    Returns
    -------
    tvar : float
        Trimmed variance.

    Notes
    -----
    `tvar` computes the unbiased sample variance, i.e. it uses a correction
    factor ``n / (n - 1)``.

    Examples
    --------
    >>> from scipy import stats
    >>> x = np.arange(20)
    >>> stats.tvar(x)
    35.0
    >>> stats.tvar(x, (3,17))
    20.0

    """
    a = asarray(a)
    a = a.astype(float).ravel()
    if limits is None:
        n = len(a)
        return a.var() * n / (n - 1.)
    am = _mask_to_limits(a, limits, inclusive)
    return np.ma.var(am, ddof=ddof, axis=axis)


def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
    """
    Compute the trimmed minimum.

    This function finds the minimum value of an array `a` along the
    specified axis, but only considering values greater than a specified
    lower limit.

    Parameters
    ----------
    a : array_like
        array of values
    lowerlimit : None or float, optional
        Values in the input array less than the given limit will be ignored.
        When lowerlimit is None, then all values are used. The default value
        is None.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over the
        whole array `a`.
    inclusive : {True, False}, optional
        This flag determines whether values exactly equal to the lower limit
        are included. The default value is True.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    tmin : float, int or ndarray

    Examples
    --------
    >>> from scipy import stats
    >>> x = np.arange(20)
    >>> stats.tmin(x)
    0
    >>> stats.tmin(x, 13)
    13
    >>> stats.tmin(x, 13, inclusive=False)
    14

    """
    a, axis = _chk_asarray(a, axis)
    am = _mask_to_limits(a, (lowerlimit, None), (inclusive, False))

    contains_nan, nan_policy = _contains_nan(am, nan_policy)

    if contains_nan and nan_policy == 'omit':
        am = ma.masked_invalid(am)

    res = ma.minimum.reduce(am, axis).data
    if res.ndim == 0:
        return res[()]
    return res


def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
    """
    Compute the trimmed maximum.

    This function computes the maximum value of an array along a given axis,
    while ignoring values larger than a specified upper limit.

    Parameters
    ----------
    a : array_like
        array of values
    upperlimit : None or float, optional
        Values in the input array greater than the given limit will be ignored.
        When upperlimit is None, then all values are used. The default value
        is None.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over the
        whole array `a`.
    inclusive : {True, False}, optional
        This flag determines whether values exactly equal to the upper limit
        are included. The default value is True.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    tmax : float, int or ndarray

    Examples
    --------
    >>> from scipy import stats
    >>> x = np.arange(20)
    >>> stats.tmax(x)
    19
    >>> stats.tmax(x, 13)
    13
    >>> stats.tmax(x, 13, inclusive=False)
    12

    """
    a, axis = _chk_asarray(a, axis)
    am = _mask_to_limits(a, (None, upperlimit), (False, inclusive))

    contains_nan, nan_policy = _contains_nan(am, nan_policy)

    if contains_nan and nan_policy == 'omit':
        am = ma.masked_invalid(am)

    res = ma.maximum.reduce(am, axis).data
    if res.ndim == 0:
        return res[()]
    return res


def tstd(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
    """
    Compute the trimmed sample standard deviation.

    This function finds the sample standard deviation of given values,
    ignoring values outside the given `limits`.

    Parameters
    ----------
    a : array_like
        array of values
    limits : None or (lower limit, upper limit), optional
        Values in the input array less than the lower limit or greater than the
        upper limit will be ignored. When limits is None, then all values are
        used. Either of the limit values in the tuple can also be None
        representing a half-open interval. The default value is None.
    inclusive : (bool, bool), optional
        A tuple consisting of the (lower flag, upper flag). These flags
        determine whether values exactly equal to the lower or upper limits
        are included. The default value is (True, True).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over the
        whole array `a`.
    ddof : int, optional
        Delta degrees of freedom. Default is 1.

    Returns
    -------
    tstd : float

    Notes
    -----
    `tstd` computes the unbiased sample standard deviation, i.e. it uses a
    correction factor ``n / (n - 1)``.

    Examples
    --------
    >>> from scipy import stats
    >>> x = np.arange(20)
    >>> stats.tstd(x)
    5.9160797830996161
    >>> stats.tstd(x, (3,17))
    4.4721359549995796

    """
    return np.sqrt(tvar(a, limits, inclusive, axis, ddof))


def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
    """
    Compute the trimmed standard error of the mean.

    This function finds the standard error of the mean for given
    values, ignoring values outside the given `limits`.

    Parameters
    ----------
    a : array_like
        array of values
    limits : None or (lower limit, upper limit), optional
        Values in the input array less than the lower limit or greater than the
        upper limit will be ignored. When limits is None, then all values are
        used. Either of the limit values in the tuple can also be None
        representing a half-open interval. The default value is None.
    inclusive : (bool, bool), optional
        A tuple consisting of the (lower flag, upper flag). These flags
        determine whether values exactly equal to the lower or upper limits
        are included. The default value is (True, True).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over the
        whole array `a`.
    ddof : int, optional
        Delta degrees of freedom. Default is 1.

    Returns
    -------
    tsem : float

    Notes
    -----
    `tsem` uses unbiased sample standard deviation, i.e. it uses a
    correction factor ``n / (n - 1)``.

    Examples
    --------
    >>> from scipy import stats
    >>> x = np.arange(20)
    >>> stats.tsem(x)
    1.3228756555322954
    >>> stats.tsem(x, (3,17))
    1.1547005383792515

    """
    a = np.asarray(a).ravel()
    if limits is None:
        return a.std(ddof=ddof) / np.sqrt(a.size)

    am = _mask_to_limits(a, limits, inclusive)
    sd = np.sqrt(np.ma.var(am, ddof=ddof, axis=axis))
    return sd / np.sqrt(am.count())


#####################################
#              MOMENTS              #
#####################################


def moment(a, moment=1, axis=0, nan_policy='propagate'):
    r"""
    Calculate the nth moment about the mean for a sample.

    A moment is a specific quantitative measure of the shape of a set of
    points. It is often used to calculate coefficients of skewness and kurtosis
    due to its close relationship with them.

    Parameters
    ----------
    a : array_like
        data
    moment : int or array_like of ints, optional
        order of central moment that is returned. Default is 1.
    axis : int or None, optional
        Axis along which the central moment is computed. Default is 0.
        If None, compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    n-th central moment : ndarray or float
        The appropriate moment along the given axis or over all values if axis
        is None. The denominator for the moment calculation is the number of
        observations, no degrees of freedom correction is done.

    See also
    --------
    kurtosis, skew, describe

    Notes
    -----
    The k-th central moment of a data sample is:

    .. math::

        m_k = \frac{1}{n} \sum_{i = 1}^n (x_i - \bar{x})^k

    Where n is the number of samples and x-bar is the mean. This function uses
    exponentiation by squares [1]_ for efficiency.

    References
    ----------
    .. [1] https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms

    Examples
    --------
    >>> from scipy.stats import moment
    >>> moment([1, 2, 3, 4, 5], moment=1)
    0.0
    >>> moment([1, 2, 3, 4, 5], moment=2)
    2.0

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.moment(a, moment, axis)

    if a.size == 0:
        # empty array, return nan(s) with shape matching `moment`
        if np.isscalar(moment):
            return np.nan
        else:
            return np.full(np.asarray(moment).shape, np.nan, dtype=np.float64)

    # for array_like moment input, return a value for each.
    if not np.isscalar(moment):
        mmnt = [_moment(a, i, axis) for i in moment]
        return np.array(mmnt)
    else:
        return _moment(a, moment, axis)


def _moment(a, moment, axis):
    if np.abs(moment - np.round(moment)) > 0:
        raise ValueError("All moment parameters must be integers")

    if moment == 0:
        # When moment equals 0, the result is 1, by definition.
        shape = list(a.shape)
        del shape[axis]
        if shape:
            # return an actual array of the appropriate shape
            return np.ones(shape, dtype=float)
        else:
            # the input was 1D, so return a scalar instead of a rank-0 array
            return 1.0

    elif moment == 1:
        # By definition the first moment about the mean is 0.
        shape = list(a.shape)
        del shape[axis]
        if shape:
            # return an actual array of the appropriate shape
            return np.zeros(shape, dtype=float)
        else:
            # the input was 1D, so return a scalar instead of a rank-0 array
            return np.float64(0.0)
    else:
        # Exponentiation by squares: form exponent sequence
        n_list = [moment]
        current_n = moment
        while current_n > 2:
            if current_n % 2:
                current_n = (current_n - 1) / 2
            else:
                current_n /= 2
            n_list.append(current_n)

        # Starting point for exponentiation by squares
        a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)
        if n_list[-1] == 1:
            s = a_zero_mean.copy()
        else:
            s = a_zero_mean**2

        # Perform multiplications
        for n in n_list[-2::-1]:
            s = s**2
            if n % 2:
                s *= a_zero_mean
        return np.mean(s, axis)
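

def _demo_moment_by_squaring():
    # Illustrative sketch only -- a hypothetical helper added for exposition,
    # not part of the upstream module. The central moment computed via
    # _moment's exponentiation-by-squares matches a direct evaluation of
    # mean((x - mean(x))**k).
    x = np.array([2.0, 8.0, 0.0, 4.0, 1.0, 9.0, 9.0, 0.0])
    k = 4
    direct = np.mean((x - x.mean())**k)
    return np.allclose(_moment(x, k, axis=0), direct)   # True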


def variation(a, axis=0, nan_policy='propagate'):
    """
    Compute the coefficient of variation, the ratio of the biased standard
    deviation to the mean.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate the coefficient of variation. Default
        is 0. If None, compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    variation : ndarray
        The calculated variation along the requested axis.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.

    Examples
    --------
    >>> from scipy.stats import variation
    >>> variation([1, 2, 3, 4, 5])
    0.47140452079103173

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.variation(a, axis)

    return a.std(axis) / a.mean(axis)


def skew(a, axis=0, bias=True, nan_policy='propagate'):
    """
    Compute the skewness of a data set.

    For normally distributed data, the skewness should be about 0. For
    unimodal continuous distributions, a skewness value > 0 means that
    there is more weight in the right tail of the distribution. The
    function `skewtest` can be used to determine if the skewness value
    is close enough to 0, statistically speaking.

    Parameters
    ----------
    a : ndarray
        data
    axis : int or None, optional
        Axis along which skewness is calculated. Default is 0.
        If None, compute over the whole array `a`.
    bias : bool, optional
        If False, then the calculations are corrected for statistical bias.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    skewness : ndarray
        The skewness of values along an axis, returning 0 where all values are
        equal.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.
       Section 2.2.24.1

    Examples
    --------
    >>> from scipy.stats import skew
    >>> skew([1, 2, 3, 4, 5])
    0.0
    >>> skew([2, 8, 0, 4, 1, 9, 9, 0])
    0.2650554122698573

    """
    a, axis = _chk_asarray(a, axis)
    n = a.shape[axis]

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.skew(a, axis, bias)

    m2 = moment(a, 2, axis)
    m3 = moment(a, 3, axis)
    zero = (m2 == 0)
    vals = _lazywhere(~zero, (m2, m3),
                      lambda m2, m3: m3 / m2**1.5,
                      0.)
    if not bias:
        can_correct = (n > 2) & (m2 > 0)
        if can_correct.any():
            m2 = np.extract(can_correct, m2)
            m3 = np.extract(can_correct, m3)
            nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2**1.5
            np.place(vals, can_correct, nval)

    if vals.ndim == 0:
        return vals.item()

    return vals
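

# Illustrative sketch (hypothetical helper, not part of the original source):
# with ``bias=False`` the function rescales the biased estimator
# ``g1 = m3 / m2**1.5`` by ``sqrt(n*(n-1)) / (n-2)``, which is exactly what
# the `nval` expression above computes for a 1-D sample.
def _skew_unbiased_by_hand(a):
    a = np.asarray(a, dtype=float)
    n = a.size
    m2 = ((a - a.mean())**2).mean()
    m3 = ((a - a.mean())**3).mean()
    g1 = m3 / m2**1.5
    return np.sqrt(n * (n - 1.0)) / (n - 2.0) * g1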


def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
    """
    Compute the kurtosis (Fisher or Pearson) of a dataset.

    Kurtosis is the fourth central moment divided by the square of the
    variance. If Fisher's definition is used, then 3.0 is subtracted from
    the result to give 0.0 for a normal distribution.

    If bias is False then the kurtosis is calculated using k statistics to
    eliminate bias coming from biased moment estimators.

    Use `kurtosistest` to see if result is close enough to normal.

    Parameters
    ----------
    a : array
        data for which the kurtosis is calculated
    axis : int or None, optional
        Axis along which the kurtosis is calculated. Default is 0.
        If None, compute over the whole array `a`.
    fisher : bool, optional
        If True, Fisher's definition is used (normal ==> 0.0). If False,
        Pearson's definition is used (normal ==> 3.0).
    bias : bool, optional
        If False, then the calculations are corrected for statistical bias.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    kurtosis : array
        The kurtosis of values along an axis. If all values are equal,
        return -3 for Fisher's definition and 0 for Pearson's definition.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.

    Examples
    --------
    >>> from scipy.stats import kurtosis
    >>> kurtosis([1, 2, 3, 4, 5])
    -1.3

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.kurtosis(a, axis, fisher, bias)

    n = a.shape[axis]
    m2 = moment(a, 2, axis)
    m4 = moment(a, 4, axis)
    zero = (m2 == 0)
    olderr = np.seterr(all='ignore')
    try:
        vals = np.where(zero, 0, m4 / m2**2.0)
    finally:
        np.seterr(**olderr)

    if not bias:
        can_correct = (n > 3) & (m2 > 0)
        if can_correct.any():
            m2 = np.extract(can_correct, m2)
            m4 = np.extract(can_correct, m4)
            nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0)
            np.place(vals, can_correct, nval + 3.0)

    if vals.ndim == 0:
        vals = vals.item()  # array scalar

    return vals - 3 if fisher else vals
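

# Illustrative sketch (hypothetical helper, not part of the original source):
# in the default ``bias=True`` path, Pearson kurtosis is simply m4 / m2**2,
# and Fisher's definition subtracts 3, which is what the final
# ``vals - 3 if fisher else vals`` expresses.
def _kurtosis_by_hand(a, fisher=True):
    a = np.asarray(a, dtype=float)
    m2 = ((a - a.mean())**2).mean()
    m4 = ((a - a.mean())**4).mean()
    pearson = m4 / m2**2
    return pearson - 3.0 if fisher else pearson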


DescribeResult = namedtuple('DescribeResult',
                            ('nobs', 'minmax', 'mean', 'variance', 'skewness',
                             'kurtosis'))


def describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate'):
    """
    Compute several descriptive statistics of the passed array.

    Parameters
    ----------
    a : array_like
        Input data.
    axis : int or None, optional
        Axis along which statistics are calculated. Default is 0.
        If None, compute over the whole array `a`.
    ddof : int, optional
        Delta degrees of freedom (only for variance). Default is 1.
    bias : bool, optional
        If False, then the skewness and kurtosis calculations are corrected for
        statistical bias.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    nobs : int or ndarray of ints
        Number of observations (length of data along `axis`).
        When 'omit' is chosen as nan_policy, each column is counted separately.
    minmax : tuple of ndarrays or floats
        Minimum and maximum value of data array.
    mean : ndarray or float
        Arithmetic mean of data along axis.
    variance : ndarray or float
        Unbiased variance of the data along axis, denominator is number of
        observations minus one.
    skewness : ndarray or float
        Skewness, based on moment calculations with denominator equal to
        the number of observations, i.e. no degrees of freedom correction.
    kurtosis : ndarray or float
        Kurtosis (Fisher). The kurtosis is normalized so that it is
        zero for the normal distribution. No degrees of freedom are used.

    See Also
    --------
    skew, kurtosis

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(10)
    >>> stats.describe(a)
    DescribeResult(nobs=10, minmax=(0, 9), mean=4.5, variance=9.166666666666666,
                   skewness=0.0, kurtosis=-1.2242424242424244)
    >>> b = [[1, 2], [3, 4]]
    >>> stats.describe(b)
    DescribeResult(nobs=2, minmax=(array([1, 2]), array([3, 4])),
                   mean=array([2., 3.]), variance=array([2., 2.]),
                   skewness=array([0., 0.]), kurtosis=array([-2., -2.]))

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.describe(a, axis, ddof, bias)

    if a.size == 0:
        raise ValueError("The input must not be empty.")

    n = a.shape[axis]
    mm = (np.min(a, axis=axis), np.max(a, axis=axis))
    m = np.mean(a, axis=axis)
    v = np.var(a, axis=axis, ddof=ddof)
    sk = skew(a, axis, bias=bias)
    kurt = kurtosis(a, axis, bias=bias)

    return DescribeResult(n, mm, m, v, sk, kurt)
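

# Illustrative usage sketch (not part of the original source): `describe`
# bundles the same quantities that np.min/np.max/np.mean/np.var and the
# module's `skew`/`kurtosis` return individually, so the namedtuple fields
# can be checked against the piecewise computations. Helper name is
# hypothetical.
def _describe_usage_example():
    a = np.arange(10.)
    res = describe(a)
    assert res.nobs == a.shape[0]
    assert np.isclose(res.mean, np.mean(a))
    assert np.isclose(res.variance, np.var(a, ddof=1))
    return res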


#####################################
#         NORMALITY TESTS           #
#####################################


SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))


def skewtest(a, axis=0, nan_policy='propagate'):
    """
    Test whether the skew is different from the normal distribution.

    This function tests the null hypothesis that the skewness of
    the population that the sample was drawn from is the same
    as that of a corresponding normal distribution.

    Parameters
    ----------
    a : array
        The data to be tested
    axis : int or None, optional
        Axis along which statistics are calculated. Default is 0.
        If None, compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    statistic : float
        The computed z-score for this test.
    pvalue : float
        a 2-sided p-value for the hypothesis test

    Notes
    -----
    The sample size must be at least 8.

    References
    ----------
    .. [1] R. B. D'Agostino, A. J. Belanger and R. B. D'Agostino Jr.,
       "A suggestion for using powerful and informative tests of
       normality", American Statistician 44, pp. 316-321, 1990.

    Examples
    --------
    >>> from scipy.stats import skewtest
    >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8])
    SkewtestResult(statistic=1.0108048609177787, pvalue=0.3121098361421897)
    >>> skewtest([2, 8, 0, 4, 1, 9, 9, 0])
    SkewtestResult(statistic=0.44626385374196975, pvalue=0.6554066631275459)
    >>> skewtest([1, 2, 3, 4, 5, 6, 7, 8000])
    SkewtestResult(statistic=3.571773510360407, pvalue=0.0003545719905823133)
    >>> skewtest([100, 100, 100, 100, 100, 100, 100, 101])
    SkewtestResult(statistic=3.5717766638478072, pvalue=0.000354567720281634)

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.skewtest(a, axis)

    if axis is None:
        a = np.ravel(a)
        axis = 0
    b2 = skew(a, axis)
    n = a.shape[axis]
    if n < 8:
        raise ValueError(
            "skewtest is not valid with less than 8 samples; %i samples"
            " were given." % int(n))
    y = b2 * math.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
    beta2 = (3.0 * (n**2 + 27*n - 70) * (n+1) * (n+3) /
             ((n-2.0) * (n+5) * (n+7) * (n+9)))
    W2 = -1 + math.sqrt(2 * (beta2 - 1))
    delta = 1 / math.sqrt(0.5 * math.log(W2))
    alpha = math.sqrt(2.0 / (W2 - 1))
    y = np.where(y == 0, 1, y)
    Z = delta * np.log(y / alpha + np.sqrt((y / alpha)**2 + 1))

    return SkewtestResult(Z, 2 * distributions.norm.sf(np.abs(Z)))


KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))


def kurtosistest(a, axis=0, nan_policy='propagate'):
    """
    Test whether a dataset has normal kurtosis.

    This function tests the null hypothesis that the kurtosis
    of the population from which the sample was drawn is that
    of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``.

    Parameters
    ----------
    a : array
        array of the sample data
    axis : int or None, optional
        Axis along which to compute test. Default is 0. If None,
        compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    statistic : float
        The computed z-score for this test.
    pvalue : float
        The 2-sided p-value for the hypothesis test

    Notes
    -----
    Valid only for n>20. This function uses the method described in [1]_.

    References
    ----------
    .. [1] see e.g. F. J. Anscombe, W. J. Glynn, "Distribution of the kurtosis
       statistic b2 for normal samples", Biometrika, vol. 70, pp. 227-234, 1983.

    Examples
    --------
    >>> from scipy.stats import kurtosistest
    >>> kurtosistest(list(range(20)))
    KurtosistestResult(statistic=-1.7058104152122062, pvalue=0.08804338332528348)
    >>> np.random.seed(28041990)
    >>> s = np.random.normal(0, 1, 1000)
    >>> kurtosistest(s)
    KurtosistestResult(statistic=1.2317590987707365, pvalue=0.21803908613450895)

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.kurtosistest(a, axis)

    n = a.shape[axis]
    if n < 5:
        raise ValueError(
            "kurtosistest requires at least 5 observations; %i observations"
            " were given." % int(n))
    if n < 20:
        warnings.warn("kurtosistest only valid for n>=20 ... continuing "
                      "anyway, n=%i" % int(n))
    b2 = kurtosis(a, axis, fisher=False)

    E = 3.0*(n-1) / (n+1)
    varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))  # [1]_ Eq. 1
    x = (b2-E) / np.sqrt(varb2)  # [1]_ Eq. 4
    # [1]_ Eq. 2:
    sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
                                                        (n*(n-2)*(n-3)))
    # [1]_ Eq. 3:
    A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
    term1 = 1 - 2/(9.0*A)
    denom = 1 + x*np.sqrt(2/(A-4.0))
    term2 = np.sign(denom) * np.where(denom == 0.0, np.nan,
                                      np.power((1-2.0/A)/np.abs(denom), 1/3.0))
    if np.any(denom == 0):
        msg = "Test statistic not defined in some cases due to division by " \
              "zero. Return nan in that case..."
        warnings.warn(msg, RuntimeWarning)

    Z = (term1 - term2) / np.sqrt(2/(9.0*A))  # [1]_ Eq. 5
    if Z.ndim == 0:
        Z = Z[()]

    # zprob uses upper tail, so Z needs to be positive
    return KurtosistestResult(Z, 2 * distributions.norm.sf(np.abs(Z)))


NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))


def normaltest(a, axis=0, nan_policy='propagate'):
    """
    Test whether a sample differs from a normal distribution.

    This function tests the null hypothesis that a sample comes
    from a normal distribution. It is based on D'Agostino and
    Pearson's [1]_, [2]_ test that combines skew and kurtosis to
    produce an omnibus test of normality.

    Parameters
    ----------
    a : array_like
        The array containing the sample to be tested.
    axis : int or None, optional
        Axis along which to compute test. Default is 0. If None,
        compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    statistic : float or array
        ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
        ``k`` is the z-score returned by `kurtosistest`.
    pvalue : float or array
        A 2-sided chi squared probability for the hypothesis test.

    References
    ----------
    .. [1] D'Agostino, R. B. (1971), "An omnibus test of normality for
           moderate and large sample size", Biometrika, 58, 341-348
    .. [2] D'Agostino, R. and Pearson, E. S. (1973), "Tests for departure from
           normality", Biometrika, 60, 613-622

    Examples
    --------
    >>> from scipy import stats
    >>> pts = 1000
    >>> np.random.seed(28041990)
    >>> a = np.random.normal(0, 1, size=pts)
    >>> b = np.random.normal(2, 1, size=pts)
    >>> x = np.concatenate((a, b))
    >>> k2, p = stats.normaltest(x)
    >>> alpha = 1e-3
    >>> print("p = {:g}".format(p))
    p = 3.27207e-11
    >>> if p < alpha:  # null hypothesis: x comes from a normal distribution
    ...     print("The null hypothesis can be rejected")
    ... else:
    ...     print("The null hypothesis cannot be rejected")
    The null hypothesis can be rejected

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.normaltest(a, axis)

    s, _ = skewtest(a, axis)
    k, _ = kurtosistest(a, axis)
    k2 = s*s + k*k

    return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
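

# Illustrative sketch (hypothetical helper, not part of the original source):
# the omnibus statistic can be reproduced by combining the z-scores of the
# two component tests by hand and referring the sum of squares to a
# chi-squared distribution with 2 degrees of freedom.
def _normaltest_by_hand(a):
    s, _ = skewtest(a)
    k, _ = kurtosistest(a)
    k2 = s * s + k * k
    return k2, distributions.chi2.sf(k2, 2)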


def jarque_bera(x):
    """
    Perform the Jarque-Bera goodness of fit test on sample data.

    The Jarque-Bera test tests whether the sample data has the skewness and
    kurtosis matching a normal distribution.

    Note that this test only works for a large enough number of data samples
    (>2000) as the test statistic asymptotically has a Chi-squared distribution
    with 2 degrees of freedom.

    Parameters
    ----------
    x : array_like
        Observations of a random variable.

    Returns
    -------
    jb_value : float
        The test statistic.
    p : float
        The p-value for the hypothesis test.

    References
    ----------
    .. [1] Jarque, C. and Bera, A. (1980) "Efficient tests for normality,
           homoscedasticity and serial independence of regression residuals",
           6 Econometric Letters 255-259.

    Examples
    --------
    >>> from scipy import stats
    >>> np.random.seed(987654321)
    >>> x = np.random.normal(0, 1, 100000)
    >>> y = np.random.rayleigh(1, 100000)
    >>> stats.jarque_bera(x)
    (4.7165707989581342, 0.09458225503041906)
    >>> stats.jarque_bera(y)
    (6713.7098548143422, 0.0)

    """
    x = np.asarray(x)
    n = x.size
    if n == 0:
        raise ValueError('At least one observation is required.')

    mu = x.mean()
    diffx = x - mu
    skewness = (1 / n * np.sum(diffx**3)) / (1 / n * np.sum(diffx**2))**(3 / 2.)
    kurtosis = (1 / n * np.sum(diffx**4)) / (1 / n * np.sum(diffx**2))**2
    jb_value = n / 6 * (skewness**2 + (kurtosis - 3)**2 / 4)
    p = 1 - distributions.chi2.cdf(jb_value, 2)

    return jb_value, p
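

# Illustrative sketch (hypothetical helper, not part of the original source):
# the Jarque-Bera statistic can equivalently be written in terms of the
# biased `skew` and Pearson `kurtosis` defined earlier in this module, and
# ``chi2.sf(jb, 2)`` equals the ``1 - chi2.cdf(jb, 2)`` used above.
def _jarque_bera_by_hand(x):
    x = np.asarray(x)
    n = x.size
    s = skew(x, axis=None)
    k = kurtosis(x, axis=None, fisher=False)
    jb = n / 6.0 * (s**2 + (k - 3.0)**2 / 4.0)
    return jb, distributions.chi2.sf(jb, 2)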


#####################################
#       FREQUENCY FUNCTIONS         #
#####################################


@np.deprecate(message="`itemfreq` is deprecated and will be removed in a "
              "future version. Use instead `np.unique(..., return_counts=True)`")
def itemfreq(a):
    """
    Return a 2-D array of item frequencies.

    Parameters
    ----------
    a : (N,) array_like
        Input array.

    Returns
    -------
    itemfreq : (K, 2) ndarray
        A 2-D frequency table. Column 1 contains sorted, unique values from
        `a`, column 2 contains their respective counts.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.array([1, 1, 5, 0, 1, 2, 2, 0, 1, 4])
    >>> stats.itemfreq(a)
    array([[ 0.,  2.],
           [ 1.,  4.],
           [ 2.,  2.],
           [ 4.,  1.],
           [ 5.,  1.]])
    >>> np.bincount(a)
    array([2, 4, 2, 0, 1, 1])
    >>> stats.itemfreq(a/10.)
    array([[ 0. ,  2. ],
           [ 0.1,  4. ],
           [ 0.2,  2. ],
           [ 0.4,  1. ],
           [ 0.5,  1. ]])

    """
    items, inv = np.unique(a, return_inverse=True)
    freq = np.bincount(inv)
    return np.array([items, freq]).T


def scoreatpercentile(a, per, limit=(), interpolation_method='fraction',
                      axis=None):
    """
    Calculate the score at a given percentile of the input sequence.

    For example, the score at `per=50` is the median. If the desired quantile
    lies between two data points, we interpolate between them, according to
    the value of `interpolation_method`. If the parameter `limit` is provided,
    it should be a tuple (lower, upper) of two values.

    Parameters
    ----------
    a : array_like
        A 1-D array of values from which to extract score.
    per : array_like
        Percentile(s) at which to extract score. Values should be in range
        [0,100].
    limit : tuple, optional
        Tuple of two scalars, the lower and upper limits within which to
        compute the percentile. Values of `a` outside
        this (closed) interval will be ignored.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`

        - fraction: ``i + (j - i) * fraction`` where ``fraction`` is the
          fractional part of the index surrounded by ``i`` and ``j``.
        - lower: ``i``.
        - higher: ``j``.

    axis : int, optional
        Axis along which the percentiles are computed. Default is None. If
        None, compute over the whole array `a`.

    Returns
    -------
    score : float or ndarray
        Score at percentile(s).

    See Also
    --------
    percentileofscore, numpy.percentile

    Notes
    -----
    This function will become obsolete in the future.
    For Numpy 1.9 and higher, `numpy.percentile` provides all the functionality
    that `scoreatpercentile` provides. And it's significantly faster.
    Therefore it's recommended to use `numpy.percentile` for users that have
    numpy >= 1.9.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(100)
    >>> stats.scoreatpercentile(a, 50)
    49.5

    """
    # adapted from NumPy's percentile function.  When we require numpy >= 1.8,
    # the implementation of this function can be replaced by np.percentile.
    a = np.asarray(a)
    if a.size == 0:
        # empty array, return nan(s) with shape matching `per`
        if np.isscalar(per):
            return np.nan
        else:
            return np.full(np.asarray(per).shape, np.nan, dtype=np.float64)

    if limit:
        a = a[(limit[0] <= a) & (a <= limit[1])]

    sorted_ = np.sort(a, axis=axis)
    if axis is None:
        axis = 0

    return _compute_qth_percentile(sorted_, per, interpolation_method, axis)


# handle sequence of per's without calling sort multiple times
def _compute_qth_percentile(sorted_, per, interpolation_method, axis):
    if not np.isscalar(per):
        score = [_compute_qth_percentile(sorted_, i,
                                         interpolation_method, axis)
                 for i in per]
        return np.array(score)

    if not (0 <= per <= 100):
        raise ValueError("percentile must be in the range [0, 100]")

    indexer = [slice(None)] * sorted_.ndim
    idx = per / 100. * (sorted_.shape[axis] - 1)

    if int(idx) != idx:
        # round fractional indices according to interpolation method
        if interpolation_method == 'lower':
            idx = int(np.floor(idx))
        elif interpolation_method == 'higher':
            idx = int(np.ceil(idx))
        elif interpolation_method == 'fraction':
            pass  # keep idx as fraction and interpolate
        else:
            raise ValueError("interpolation_method can only be 'fraction', "
                             "'lower' or 'higher'")

    i = int(idx)
    if i == idx:
        indexer[axis] = slice(i, i + 1)
        weights = array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i + 2)
        j = i + 1
        weights = array([(j - idx), (idx - i)], float)
        wshape = [1] * sorted_.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()

    # Use np.add.reduce (== np.sum but a little faster) to coerce data type
    return np.add.reduce(sorted_[tuple(indexer)] * weights, axis=axis) / sumval


def percentileofscore(a, score, kind='rank'):
    """
    The percentile rank of a score relative to a list of scores.

    A `percentileofscore` of, for example, 80% means that 80% of the
    scores in `a` are below the given score. In the case of gaps or
    ties, the exact definition depends on the optional keyword, `kind`.

    Parameters
    ----------
    a : array_like
        Array of scores to which `score` is compared.
    score : int or float
        Score that is compared to the elements in `a`.
    kind : {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score. In case of
          multiple matches, average the percentage rankings of
          all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
          distribution function. A percentileofscore of 80%
          means that 80% of values are less than or equal
          to the provided score.
        - "strict": Similar to "weak", except that only values that are
          strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used in
          testing. See https://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    pcos : float
        Percentile-position of score (0-100) relative to `a`.

    See Also
    --------
    numpy.percentile

    Examples
    --------
    Three-quarters of the given values lie below a given score:

    >>> from scipy import stats
    >>> stats.percentileofscore([1, 2, 3, 4], 3)
    75.0

    With multiple matches, note how the scores of the two matches, 0.6
    and 0.8 respectively, are averaged:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3)
    70.0

    Only 2/5 values are strictly less than 3:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
    40.0

    But 4/5 values are less than or equal to 3:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
    80.0

    The average between the weak and the strict scores is

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
    60.0

    """
    if np.isnan(score):
        return np.nan
    a = np.asarray(a)
    n = len(a)
    if n == 0:
        return 100.0

    if kind == 'rank':
        left = np.count_nonzero(a < score)
        right = np.count_nonzero(a <= score)
        pct = (right + left + (1 if right > left else 0)) * 50.0/n
        return pct
    elif kind == 'strict':
        return np.count_nonzero(a < score) / n * 100
    elif kind == 'weak':
        return np.count_nonzero(a <= score) / n * 100
    elif kind == 'mean':
        pct = (np.count_nonzero(a < score) + np.count_nonzero(a <= score)) / n * 50
        return pct
    else:
        raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
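

# Illustrative sketch (not part of the original source): the four `kind`
# options only differ in how values equal to `score` are counted, so when
# `score` does not occur in `a` they all coincide. Helper name is
# hypothetical.
def _percentileofscore_kinds_example():
    a = [1, 2, 3, 4]
    # 2.5 does not occur in `a`, so every kind returns the same value (50.0)
    return {kind: percentileofscore(a, 2.5, kind=kind)
            for kind in ('rank', 'weak', 'strict', 'mean')}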


HistogramResult = namedtuple('HistogramResult',
                             ('count', 'lowerlimit', 'binsize', 'extrapoints'))


def _histogram(a, numbins=10, defaultlimits=None, weights=None, printextras=False):
    """
    Separate the range into several bins and return the number of instances
    in each bin.

    Parameters
    ----------
    a : array_like
        Array of scores which will be put into bins.
    numbins : int, optional
        The number of bins to use for the histogram. Default is 10.
    defaultlimits : tuple (lower, upper), optional
        The lower and upper values for the range of the histogram.
        If no value is given, a range slightly larger than the range of the
        values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
    weights : array_like, optional
        The weights for each value in `a`. Default is None, which gives each
        value a weight of 1.0
    printextras : bool, optional
        If True, if there are extra points (i.e. the points that fall outside
        the bin limits) a warning is raised saying how many of those points
        there are. Default is False.

    Returns
    -------
    count : ndarray
        Number of points (or sum of weights) in each bin.
    lowerlimit : float
        Lowest value of histogram, the lower limit of the first bin.
    binsize : float
        The size of the bins (all bins have the same size).
    extrapoints : int
        The number of points outside the range of the histogram.

    See Also
    --------
    numpy.histogram

    Notes
    -----
    This histogram is based on numpy's histogram but has a larger range by
    default if default limits is not set.

    """
    a = np.ravel(a)
    if defaultlimits is None:
        if a.size == 0:
            # handle empty arrays. Undetermined range, so use 0-1.
            defaultlimits = (0, 1)
        else:
            # no range given, so use values in `a`
            data_min = a.min()
            data_max = a.max()
            # Have bins extend past min and max values slightly
            s = (data_max - data_min) / (2. * (numbins - 1.))
            defaultlimits = (data_min - s, data_max + s)

    # use numpy's histogram method to compute bins
    hist, bin_edges = np.histogram(a, bins=numbins, range=defaultlimits,
                                   weights=weights)
    # hist are not always floats, convert to keep with old output
    hist = np.array(hist, dtype=float)
    # fixed width for bins is assumed, as numpy's histogram gives
    # fixed width bins for int values for 'bins'
    binsize = bin_edges[1] - bin_edges[0]
    # calculate number of extra points
    extrapoints = len([v for v in a
                       if defaultlimits[0] > v or v > defaultlimits[1]])
    if extrapoints > 0 and printextras:
        warnings.warn("Points outside given histogram range = %s"
                      % extrapoints)

    return HistogramResult(hist, defaultlimits[0], binsize, extrapoints)


CumfreqResult = namedtuple('CumfreqResult',
                           ('cumcount', 'lowerlimit', 'binsize',
                            'extrapoints'))


def cumfreq(a, numbins=10, defaultreallimits=None, weights=None):
    """
    Return a cumulative frequency histogram, using the histogram function.

    A cumulative histogram is a mapping that counts the cumulative number of
    observations in all of the bins up to the specified bin.

    Parameters
    ----------
    a : array_like
        Input array.
    numbins : int, optional
        The number of bins to use for the histogram. Default is 10.
    defaultreallimits : tuple (lower, upper), optional
        The lower and upper values for the range of the histogram.
        If no value is given, a range slightly larger than the range of the
        values in `a` is used. Specifically ``(a.min() - s, a.max() + s)``,
        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
    weights : array_like, optional
        The weights for each value in `a`. Default is None, which gives each
        value a weight of 1.0

    Returns
    -------
    cumcount : ndarray
        Binned values of cumulative frequency.
    lowerlimit : float
        Lower real limit
    binsize : float
        Width of each bin.
    extrapoints : int
        Extra points.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats
    >>> x = [1, 4, 2, 1, 3, 1]
    >>> res = stats.cumfreq(x, numbins=4, defaultreallimits=(1.5, 5))
    >>> res.cumcount
    array([ 1.,  2.,  3.,  3.])
    >>> res.extrapoints
    3

    Create a normal distribution with 1000 random values

    >>> rng = np.random.RandomState(seed=12345)
    >>> samples = stats.norm.rvs(size=1000, random_state=rng)

    Calculate cumulative frequencies

    >>> res = stats.cumfreq(samples, numbins=25)

    Calculate space of values for x

    >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.cumcount.size,
    ...                                  res.cumcount.size)

    Plot histogram and cumulative histogram

    >>> fig = plt.figure(figsize=(10, 4))
    >>> ax1 = fig.add_subplot(1, 2, 1)
    >>> ax2 = fig.add_subplot(1, 2, 2)
    >>> ax1.hist(samples, bins=25)
    >>> ax1.set_title('Histogram')
    >>> ax2.bar(x, res.cumcount, width=res.binsize)
    >>> ax2.set_title('Cumulative histogram')
    >>> ax2.set_xlim([x.min(), x.max()])

    >>> plt.show()

    """
    h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
    cumhist = np.cumsum(h * 1, axis=0)
    return CumfreqResult(cumhist, l, b, e)


RelfreqResult = namedtuple('RelfreqResult',
                           ('frequency', 'lowerlimit', 'binsize',
                            'extrapoints'))


def relfreq(a, numbins=10, defaultreallimits=None, weights=None):
    """
    Return a relative frequency histogram, using the histogram function.

    A relative frequency histogram is a mapping of the number of
    observations in each of the bins relative to the total of observations.

    Parameters
    ----------
    a : array_like
        Input array.
    numbins : int, optional
        The number of bins to use for the histogram. Default is 10.
    defaultreallimits : tuple (lower, upper), optional
        The lower and upper values for the range of the histogram.
        If no value is given, a range slightly larger than the range of the
        values in a is used. Specifically ``(a.min() - s, a.max() + s)``,
        where ``s = (1/2)(a.max() - a.min()) / (numbins - 1)``.
    weights : array_like, optional
        The weights for each value in `a`. Default is None, which gives each
        value a weight of 1.0

    Returns
    -------
    frequency : ndarray
        Binned values of relative frequency.
    lowerlimit : float
        Lower real limit
    binsize : float
        Width of each bin.
    extrapoints : int
        Extra points.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats
    >>> a = np.array([2, 4, 1, 2, 3, 2])
    >>> res = stats.relfreq(a, numbins=4)
    >>> res.frequency
    array([ 0.16666667,  0.5       ,  0.16666667,  0.16666667])
    >>> np.sum(res.frequency)  # relative frequencies should add up to 1
    1.0

    Create a normal distribution with 1000 random values

    >>> rng = np.random.RandomState(seed=12345)
    >>> samples = stats.norm.rvs(size=1000, random_state=rng)

    Calculate relative frequencies

    >>> res = stats.relfreq(samples, numbins=25)

    Calculate space of values for x

    >>> x = res.lowerlimit + np.linspace(0, res.binsize*res.frequency.size,
    ...                                  res.frequency.size)

    Plot relative frequency histogram

    >>> fig = plt.figure(figsize=(5, 4))
    >>> ax = fig.add_subplot(1, 1, 1)
    >>> ax.bar(x, res.frequency, width=res.binsize)
    >>> ax.set_title('Relative frequency histogram')
    >>> ax.set_xlim([x.min(), x.max()])

    >>> plt.show()

    """
    a = np.asanyarray(a)
    h, l, b, e = _histogram(a, numbins, defaultreallimits, weights=weights)
    h = h / a.shape[0]

    return RelfreqResult(h, l, b, e)
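

# Illustrative sketch (not part of the original source): `relfreq` divides the
# `_histogram` counts by the number of observations, while `cumfreq` takes
# their running sum, so the two are related as checked below. Helper name is
# hypothetical.
def _relfreq_vs_cumfreq_example():
    a = np.array([2, 4, 1, 2, 3, 2])
    rel = relfreq(a, numbins=4).frequency
    cum = cumfreq(a, numbins=4).cumcount
    # cumulative counts are the running sum of the raw (unnormalized) counts
    assert np.allclose(np.cumsum(rel * a.size), cum)
    return rel, cum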


#####################################
#      VARIABILITY FUNCTIONS        #
#####################################


def obrientransform(*args):
    """
    Compute the O'Brien transform on input data (any number of arrays).

    Used to test for homogeneity of variance prior to running one-way stats.
    Each array in ``*args`` is one level of a factor.
    If `f_oneway` is run on the transformed data and found significant,
    the variances are unequal. From Maxwell and Delaney [1]_, p.112.

    Parameters
    ----------
    args : tuple of array_like
        Any number of arrays.

    Returns
    -------
    obrientransform : ndarray
        Transformed data for use in an ANOVA. The first dimension
        of the result corresponds to the sequence of transformed
        arrays. If the arrays given are all 1-D of the same length,
        the return value is a 2-D array; otherwise it is a 1-D array
        of type object, with each element being an ndarray.

    References
    ----------
    .. [1] S. E. Maxwell and H. D. Delaney, "Designing Experiments and
           Analyzing Data: A Model Comparison Perspective", Wadsworth, 1990.

    Examples
    --------
    We'll test the following data sets for differences in their variance.

    >>> x = [10, 11, 13, 9, 7, 12, 12, 9, 10]
    >>> y = [13, 21, 5, 10, 8, 14, 10, 12, 7, 15]

    Apply the O'Brien transform to the data.

    >>> from scipy.stats import obrientransform
    >>> tx, ty = obrientransform(x, y)

    Use `scipy.stats.f_oneway` to apply a one-way ANOVA test to the
    transformed data.

    >>> from scipy.stats import f_oneway
    >>> F, p = f_oneway(tx, ty)
    >>> p
    0.1314139477040335

    If we require that ``p < 0.05`` for significance, we cannot conclude
    that the variances are different.

    """
    TINY = np.sqrt(np.finfo(float).eps)

    # `arrays` will hold the transformed arguments.
    arrays = []

    for arg in args:
        a = np.asarray(arg)
        n = len(a)
        mu = np.mean(a)
        sq = (a - mu)**2
        sumsq = sq.sum()

        # The O'Brien transform.
        t = ((n - 1.5) * n * sq - 0.5 * sumsq) / ((n - 1) * (n - 2))

        # Check that the mean of the transformed data is equal to the
        # original variance.
        var = sumsq / (n - 1)
        if abs(var - np.mean(t)) > TINY:
            raise ValueError('Lack of convergence in obrientransform.')

        arrays.append(t)

    return np.array(arrays)


def sem(a, axis=0, ddof=1, nan_policy='propagate'):
    """
    Calculate the standard error of the mean (or standard error of
    measurement) of the values in the input array.

    Parameters
    ----------
    a : array_like
        An array containing the values for which the standard error is
        returned.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.
    ddof : int, optional
        Delta degrees-of-freedom. How many degrees of freedom to adjust
        for bias in limited samples relative to the population estimate
        of variance. Defaults to 1.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.

    Returns
    -------
    s : ndarray or float
        The standard error of the mean in the sample(s), along the input axis.

    Notes
    -----
    The default value for `ddof` is different to the default (0) used by other
    ddof containing routines, such as np.std and np.nanstd.

    Examples
    --------
    Find standard error along the first axis:

    >>> from scipy import stats
    >>> a = np.arange(20).reshape(5,4)
    >>> stats.sem(a)
    array([ 2.8284,  2.8284,  2.8284,  2.8284])

    Find standard error across the whole array, using n degrees of freedom:

    >>> stats.sem(a, axis=None, ddof=0)
    1.2893796958227628

    """
    a, axis = _chk_asarray(a, axis)

    contains_nan, nan_policy = _contains_nan(a, nan_policy)

    if contains_nan and nan_policy == 'omit':
        a = ma.masked_invalid(a)
        return mstats_basic.sem(a, axis, ddof)

    n = a.shape[axis]
    s = np.std(a, axis=axis, ddof=ddof) / np.sqrt(n)
    return s


def zscore(a, axis=0, ddof=0):
    """
    Calculate the z score of each value in the sample, relative to the
    sample mean and standard deviation.

    Parameters
    ----------
    a : array_like
        An array like object containing the sample data.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.
    ddof : int, optional
        Degrees of freedom correction in the calculation of the
        standard deviation. Default is 0.

    Returns
    -------
    zscore : array_like
        The z-scores, standardized by mean and standard deviation of
        input array `a`.

    Notes
    -----
    This function preserves ndarray subclasses, and works also with
    matrices and masked arrays (it uses `asanyarray` instead of
    `asarray` for parameters).

    Examples
    --------
    >>> a = np.array([ 0.7972,  0.0767,  0.4383,  0.7866,  0.8091,
    ...                0.1954,  0.6307,  0.6599,  0.1065,  0.0508])
    >>> from scipy import stats
    >>> stats.zscore(a)
    array([ 1.1273, -1.247 , -0.0552,  1.0923,  1.1664, -0.8559,  0.5786,
            0.6748, -1.1488, -1.3324])

    Computing along a specified axis, using n-1 degrees of freedom
    (``ddof=1``) to calculate the standard deviation:

    >>> b = np.array([[ 0.3148,  0.0478,  0.6243,  0.4608],
    ...               [ 0.7149,  0.0775,  0.6072,  0.9656],
    ...               [ 0.6341,  0.1403,  0.9759,  0.4064],
    ...               [ 0.5918,  0.6948,  0.904 ,  0.3721],
    ...               [ 0.0921,  0.2481,  0.1188,  0.1366]])
    >>> stats.zscore(b, axis=1, ddof=1)
    array([[-0.19264823, -1.28415119,  1.07259584,  0.40420358],
           [ 0.33048416, -1.37380874,  0.04251374,  1.00081084],
           [ 0.26796377, -1.12598418,  1.23283094, -0.37481053],
           [-0.22095197,  0.24468594,  1.19042819, -1.21416216],
           [-0.82780366,  1.4457416 , -0.43867764, -0.1792603 ]])

    """
    a = np.asanyarray(a)
    mns = a.mean(axis=axis)
    sstd = a.std(axis=axis, ddof=ddof)
    if axis and mns.ndim < a.ndim:
        return ((a - np.expand_dims(mns, axis=axis)) /
                np.expand_dims(sstd, axis=axis))
    else:
        return (a - mns) / sstd


def zmap(scores, compare, axis=0, ddof=0):
    """
    Calculate the relative z-scores.

    Return an array of z-scores, i.e., scores that are standardized to
    zero mean and unit variance, where mean and variance are calculated
    from the comparison array.

    Parameters
    ----------
    scores : array_like
        The input for which z-scores are calculated.
    compare : array_like
        The input from which the mean and standard deviation of the
        normalization are taken; assumed to have the same dimension as
        `scores`.
    axis : int or None, optional
        Axis over which mean and variance of `compare` are calculated.
        Default is 0. If None, compute over the whole array `scores`.
    ddof : int, optional
        Degrees of freedom correction in the calculation of the
        standard deviation. Default is 0.

    Returns
    -------
    zscore : array_like
        Z-scores, in the same shape as `scores`.

    Notes
    -----
    This function preserves ndarray subclasses, and works also with
    matrices and masked arrays (it uses `asanyarray` instead of
    `asarray` for parameters).

    Examples
    --------
    >>> from scipy.stats import zmap
    >>> a = [0.5, 2.0, 2.5, 3]
    >>> b = [0, 1, 2, 3, 4]
    >>> zmap(a, b)
    array([-1.06066017,  0.        ,  0.35355339,  0.70710678])

    """
    scores, compare = map(np.asanyarray, [scores, compare])
    mns = compare.mean(axis=axis)
    sstd = compare.std(axis=axis, ddof=ddof)
    if axis and mns.ndim < compare.ndim:
        return ((scores - np.expand_dims(mns, axis=axis)) /
                np.expand_dims(sstd, axis=axis))
    else:
        return (scores - mns) / sstd
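

# Illustrative sketch (not part of the original source): `zscore` is the
# special case of `zmap` in which the scores are standardized against
# themselves, i.e. ``zscore(a) == zmap(a, a)`` for the same `axis`/`ddof`.
# Helper name is hypothetical.
def _zscore_zmap_example():
    a = np.array([0.5, 2.0, 2.5, 3.0])
    assert np.allclose(zscore(a), zmap(a, a))
    return zscore(a)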


# Private dictionary initialized only once at module level
# See https://en.wikipedia.org/wiki/Robust_measures_of_scale
_scale_conversions = {'raw': 1.0,
                      'normal': special.erfinv(0.5) * 2.0 * math.sqrt(2.0)}


def iqr(x, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate',
        interpolation='linear', keepdims=False):
    """
    Compute the interquartile range of the data along the specified axis.

    The interquartile range (IQR) is the difference between the 75th and
    25th percentile of the data. It is a measure of the dispersion
    similar to standard deviation or variance, but is much more robust
    against outliers [2]_.

    The ``rng`` parameter allows this function to compute other
    percentile ranges than the actual IQR. For example, setting
    ``rng=(0, 100)`` is equivalent to `numpy.ptp`.

    The IQR of an empty array is `np.nan`.

    .. versionadded:: 0.18.0

    Parameters
    ----------
    x : array_like
        Input array or object that can be converted to an array.
    axis : int or sequence of int, optional
        Axis along which the range is computed. The default is to
        compute the IQR for the entire array.
    rng : Two-element sequence containing floats in range of [0,100], optional
        Percentiles over which to compute the range. Each must be
        between 0 and 100, inclusive. The default is the true IQR:
        `(25, 75)`. The order of the elements is not important.
    scale : scalar or str, optional
        The numerical value of scale will be divided out of the final
        result. The following string values are recognized:

          'raw' : No scaling, just return the raw IQR.
          'normal' : Scale by :math:`2 \\sqrt{2} erf^{-1}(\\frac{1}{2}) \\approx 1.349`.

        The default is 'raw'. Array-like scale is also allowed, as long
        as it broadcasts correctly to the output such that
        ``out / scale`` is a valid operation. The output dimensions
        depend on the input array, `x`, the `axis` argument, and the
        `keepdims` flag.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate'
        returns nan, 'raise' throws an error, 'omit' performs the
        calculations ignoring nan values. Default is 'propagate'.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        Specifies the interpolation method to use when the percentile
        boundaries lie between two data points `i` and `j`:

        * 'linear' : `i + (j - i) * fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        * 'lower' : `i`.
        * 'higher' : `j`.
        * 'nearest' : `i` or `j` whichever is nearest.
        * 'midpoint' : `(i + j) / 2`.

        Default is 'linear'.
    keepdims : bool, optional
        If this is set to `True`, the reduced axes are left in the
        result as dimensions with size one. With this option, the result
        will broadcast correctly against the original array `x`.

    Returns
    -------
    iqr : scalar or ndarray
        If ``axis=None``, a scalar is returned. If the input contains
        integers or floats of smaller precision than ``np.float64``, then the
        output data-type is ``np.float64``. Otherwise, the output data-type is
        the same as that of the input.

    See Also
    --------
    numpy.std, numpy.var

    Examples
    --------
    >>> from scipy.stats import iqr
    >>> x = np.array([[10, 7, 4], [3, 2, 1]])
    >>> x
    array([[10,  7,  4],
           [ 3,  2,  1]])
    >>> iqr(x)
    4.0
    >>> iqr(x, axis=0)
    array([ 3.5,  2.5,  1.5])
    >>> iqr(x, axis=1)
    array([ 3.,  1.])
    >>> iqr(x, axis=1, keepdims=True)
    array([[ 3.],
           [ 1.]])

    Notes
    -----
    This function is heavily dependent on the version of `numpy` that is
    installed. Versions greater than 1.11.0b3 are highly recommended, as they
    include a number of enhancements and fixes to `numpy.percentile` and
    `numpy.nanpercentile` that affect the operation of this function. The
    following modifications apply:

    Below 1.10.0 : `nan_policy` is poorly defined.
        The default behavior of `numpy.percentile` is used for 'propagate'. This
        is a hybrid of 'omit' and 'propagate' that mostly yields a skewed
        version of 'omit' since NaNs are sorted to the end of the data. A
        warning is raised if there are NaNs in the data.
    Below 1.9.0: `numpy.nanpercentile` does not exist.
        This means that `numpy.percentile` is used regardless of `nan_policy`
        and a warning is issued. See previous item for a description of the
        behavior.
    Below 1.9.0: `keepdims` and `interpolation` are not supported.
        The keywords get ignored with a warning if supplied with non-default
        values. However, multiple axes are still supported.

    References
    ----------
    .. [1] "Interquartile range" https://en.wikipedia.org/wiki/Interquartile_range
    .. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale
    .. [3] "Quantile" https://en.wikipedia.org/wiki/Quantile

    """
    x = asarray(x)

    # This check prevents percentile from raising an error later. Also, it is
    # consistent with `np.var` and `np.std`.
    if not x.size:
        return np.nan

    # An error may be raised here, so fail-fast, before doing lengthy
    # computations, even though `scale` is not used until later
    if isinstance(scale, string_types):
        scale_key = scale.lower()
        if scale_key not in _scale_conversions:
            raise ValueError("{0} not a valid scale for `iqr`".format(scale))
        scale = _scale_conversions[scale_key]

    # Select the percentile function to use based on nans and policy
    contains_nan, nan_policy = _contains_nan(x, nan_policy)

    if contains_nan and nan_policy == 'omit':
        percentile_func = _iqr_nanpercentile
    else:
        percentile_func = _iqr_percentile

    if len(rng) != 2:
        raise TypeError("quantile range must be two element sequence")

    rng = sorted(rng)
    pct = percentile_func(x, rng, axis=axis, interpolation=interpolation,
                          keepdims=keepdims, contains_nan=contains_nan)
    out = np.subtract(pct[1], pct[0])

    if scale != 1.0:
        out /= scale

    return out
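

# Illustrative sketch (not part of the original source): with the default
# arguments and a recent numpy, `iqr` reduces to a difference of two
# percentiles computed with linear interpolation. Helper name is hypothetical.
def _iqr_by_hand(x):
    x = np.asarray(x)
    q75, q25 = np.percentile(x, [75, 25])
    return q75 - q25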


def _iqr_percentile(x, q, axis=None, interpolation='linear', keepdims=False, contains_nan=False):
    """
    Private wrapper that works around older versions of `numpy`.

    While this function is pretty much necessary for the moment, it
    should be removed as soon as the minimum supported numpy version
    allows.
    """
    if contains_nan and NumpyVersion(np.__version__) < '1.10.0a':
        # I see no way to avoid the version check to ensure that the corrected
        # NaN behavior has been implemented except to call `percentile` on a
        # small array.
        msg = "Keyword nan_policy='propagate' not correctly supported for " \
              "numpy versions < 1.10.x. The default behavior of " \
              "`numpy.percentile` will be used."
        warnings.warn(msg, RuntimeWarning)

    try:
        # For older versions of numpy, there are two things that can cause a
        # problem here: missing keywords and non-scalar axis. The former can be
        # partially handled with a warning, the latter can be handled fully by
        # hacking in an implementation similar to numpy's function for
        # providing multi-axis functionality
        # (`numpy.lib.function_base._ureduce` for the curious).
        result = np.percentile(x, q, axis=axis, keepdims=keepdims,
                               interpolation=interpolation)
    except TypeError:
        if interpolation != 'linear' or keepdims:
            # At time of writing, this means np.__version__ < 1.9.0
            warnings.warn("Keywords interpolation and keepdims not supported "
                          "for your version of numpy", RuntimeWarning)

        try:
            # Special processing if axis is an iterable
            original_size = len(axis)
        except TypeError:
            # Axis is a scalar at this point
            pass
        else:
            axis = np.unique(np.asarray(axis) % x.ndim)
            if original_size > axis.size:
                # mimic numpy if axes are duplicated
                raise ValueError("duplicate value in axis")

            if axis.size == x.ndim:
                # axis includes all axes: revert to None
                axis = None
            elif axis.size == 1:
                # no rolling necessary
                axis = axis[0]
            else:
                # roll multiple axes to the end and flatten that part out
                for ax in axis[::-1]:
                    x = np.rollaxis(x, ax, x.ndim)

                x = x.reshape(x.shape[:-axis.size] +
                              (np.prod(x.shape[-axis.size:]),))
                axis = -1

        result = np.percentile(x, q, axis=axis)

    return result


def _iqr_nanpercentile(x, q, axis=None, interpolation='linear', keepdims=False,
                       contains_nan=False):
    """
    Private wrapper that works around the following:

      1. A bug in `np.nanpercentile` that was around until numpy version
         1.11.0.
      2. A bug in `np.percentile` NaN handling that was fixed in numpy
         version 1.10.0.
      3. The non-existence of `np.nanpercentile` before numpy version
         1.9.0.

    While this function is pretty much necessary for the moment, it
    should be removed as soon as the minimum supported numpy version
    allows.
    """
    if hasattr(np, 'nanpercentile'):
        # np.nanpercentile is available, i.e. np.__version__ >= 1.9.0
        result = np.nanpercentile(x, q, axis=axis,
                                  interpolation=interpolation,
                                  keepdims=keepdims)
        # If non-scalar result and nanpercentile does not do proper axis roll.
        # I see no way of avoiding the version test since dimensions may just
        # happen to match in the data.
        if result.ndim > 1 and NumpyVersion(np.__version__) < '1.11.0a':
            axis = np.asarray(axis)
            if axis.size == 1:
                # If only one axis specified, reduction happens along that dimension
                if axis.ndim == 0:
                    axis = axis[None]

                result = np.rollaxis(result, axis[0])
            else:
                # If multiple axes, reduced dimension is last
                result = np.rollaxis(result, -1)
    else:
        msg = "Keyword nan_policy='omit' not correctly supported for numpy " \
              "versions < 1.9.x. The default behavior of numpy.percentile " \
              "will be used."
        warnings.warn(msg, RuntimeWarning)
        result = _iqr_percentile(x, q, axis=axis)

    return result


#####################################
#       TRIMMING FUNCTIONS          #
#####################################


SigmaclipResult = namedtuple('SigmaclipResult', ('clipped', 'lower', 'upper'))


def sigmaclip(a, low=4., high=4.):
    """
    Iterative sigma-clipping of array elements.

    Starting from the full sample, all elements outside the critical range are
    removed, i.e. all elements of the input array `c` that satisfy either of
    the following conditions ::

        c < mean(c) - std(c)*low
        c > mean(c) + std(c)*high

    The iteration continues with the updated sample until no
    elements are outside the (updated) range.

    Parameters
    ----------
    a : array_like
        Data array, will be raveled if not 1-D.
    low : float, optional
        Lower bound factor of sigma clipping. Default is 4.
    high : float, optional
        Upper bound factor of sigma clipping. Default is 4.

    Returns
    -------
    clipped : ndarray
        Input array with clipped elements removed.
    lower : float
        Lower threshold value used for clipping.
    upper : float
        Upper threshold value used for clipping.

    Examples
    --------
    >>> from scipy.stats import sigmaclip
    >>> a = np.concatenate((np.linspace(9.5, 10.5, 31),
    ...                     np.linspace(0, 20, 5)))
    >>> fact = 1.5
    >>> c, low, upp = sigmaclip(a, fact, fact)
    >>> c
    array([  9.96666667,  10.        ,  10.03333333,  10.        ])
    >>> c.var(), c.std()
    (0.00055555555555555165, 0.023570226039551501)
    >>> low, c.mean() - fact*c.std(), c.min()
    (9.9646446609406727, 9.9646446609406727, 9.9666666666666668)
    >>> upp, c.mean() + fact*c.std(), c.max()
    (10.035355339059327, 10.035355339059327, 10.033333333333333)

    >>> a = np.concatenate((np.linspace(9.5, 10.5, 11),
    ...                     np.linspace(-100, -50, 3)))
    >>> c, low, upp = sigmaclip(a, 1.8, 1.8)
    >>> (c == np.linspace(9.5, 10.5, 11)).all()
    True

    """
    c = np.asarray(a).ravel()
    delta = 1
    while delta:
        c_std = c.std()
        c_mean = c.mean()
        size = c.size
        critlower = c_mean - c_std * low
        critupper = c_mean + c_std * high
        c = c[(c >= critlower) & (c <= critupper)]
        delta = size - c.size

    return SigmaclipResult(c, critlower, critupper)
  2198. def trimboth(a, proportiontocut, axis=0):
  2199. """
  2200. Slices off a proportion of items from both ends of an array.
  2201. Slices off the passed proportion of items from both ends of the passed
  2202. array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
  2203. rightmost 10% of scores). The trimmed values are the lowest and
  2204. highest ones.
  2205. Slices off less if proportion results in a non-integer slice index (i.e.,
conservatively slices off `proportiontocut`).
  2207. Parameters
  2208. ----------
  2209. a : array_like
  2210. Data to trim.
  2211. proportiontocut : float
Proportion (in range 0-1) of total data set to trim off each end.
  2213. axis : int or None, optional
  2214. Axis along which to trim data. Default is 0. If None, compute over
  2215. the whole array `a`.
  2216. Returns
  2217. -------
  2218. out : ndarray
  2219. Trimmed version of array `a`. The order of the trimmed content
  2220. is undefined.
  2221. See Also
  2222. --------
  2223. trim_mean
  2224. Examples
  2225. --------
  2226. >>> from scipy import stats
  2227. >>> a = np.arange(20)
  2228. >>> b = stats.trimboth(a, 0.1)
  2229. >>> b.shape
  2230. (16,)
  2231. """
  2232. a = np.asarray(a)
  2233. if a.size == 0:
  2234. return a
  2235. if axis is None:
  2236. a = a.ravel()
  2237. axis = 0
  2238. nobs = a.shape[axis]
  2239. lowercut = int(proportiontocut * nobs)
  2240. uppercut = nobs - lowercut
  2241. if (lowercut >= uppercut):
  2242. raise ValueError("Proportion too big.")
  2243. atmp = np.partition(a, (lowercut, uppercut - 1), axis)
  2244. sl = [slice(None)] * atmp.ndim
  2245. sl[axis] = slice(lowercut, uppercut)
  2246. return atmp[tuple(sl)]
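# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_trimboth_example`, showing the conservative trimming rule: with 20 values
# and proportiontocut=0.1, int(0.1 * 20) == 2 values are cut from each end, so
# the (unordered) result contains exactly the middle 16 sorted values.
def _trimboth_example():
    a = np.arange(20)
    b = trimboth(a, 0.1)
    assert b.shape == (16,)
    assert np.array_equal(np.sort(b), np.arange(2, 18))
    return b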
  2247. def trim1(a, proportiontocut, tail='right', axis=0):
  2248. """
  2249. Slices off a proportion from ONE end of the passed array distribution.
  2250. If `proportiontocut` = 0.1, slices off 'leftmost' or 'rightmost'
  2251. 10% of scores. The lowest or highest values are trimmed (depending on
  2252. the tail).
  2253. Slices off less if proportion results in a non-integer slice index
(i.e., conservatively slices off `proportiontocut`).
  2255. Parameters
  2256. ----------
  2257. a : array_like
  2258. Input array
  2259. proportiontocut : float
  2260. Fraction to cut off of 'left' or 'right' of distribution
  2261. tail : {'left', 'right'}, optional
  2262. Defaults to 'right'.
  2263. axis : int or None, optional
  2264. Axis along which to trim data. Default is 0. If None, compute over
  2265. the whole array `a`.
  2266. Returns
  2267. -------
  2268. trim1 : ndarray
  2269. Trimmed version of array `a`. The order of the trimmed content is
  2270. undefined.
  2271. """
  2272. a = np.asarray(a)
  2273. if axis is None:
  2274. a = a.ravel()
  2275. axis = 0
  2276. nobs = a.shape[axis]
  2277. # avoid possible corner case
  2278. if proportiontocut >= 1:
  2279. return []
    if tail.lower() == 'right':
        lowercut = 0
        uppercut = nobs - int(proportiontocut * nobs)
    elif tail.lower() == 'left':
        lowercut = int(proportiontocut * nobs)
        uppercut = nobs
    else:
        raise ValueError("`tail` must be 'left' or 'right'.")
    atmp = np.partition(a, (lowercut, uppercut - 1), axis)
    # Slice along the requested axis, consistent with `trimboth` above.
    sl = [slice(None)] * atmp.ndim
    sl[axis] = slice(lowercut, uppercut)
    return atmp[tuple(sl)]
  2288. def trim_mean(a, proportiontocut, axis=0):
  2289. """
  2290. Return mean of array after trimming distribution from both tails.
  2291. If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
  2292. scores. The input is sorted before slicing. Slices off less if proportion
  2293. results in a non-integer slice index (i.e., conservatively slices off
`proportiontocut`).
  2295. Parameters
  2296. ----------
  2297. a : array_like
  2298. Input array
  2299. proportiontocut : float
  2300. Fraction to cut off of both tails of the distribution
  2301. axis : int or None, optional
  2302. Axis along which the trimmed means are computed. Default is 0.
  2303. If None, compute over the whole array `a`.
  2304. Returns
  2305. -------
  2306. trim_mean : ndarray
  2307. Mean of trimmed array.
  2308. See Also
  2309. --------
  2310. trimboth
  2311. tmean : compute the trimmed mean ignoring values outside given `limits`.
  2312. Examples
  2313. --------
  2314. >>> from scipy import stats
  2315. >>> x = np.arange(20)
  2316. >>> stats.trim_mean(x, 0.1)
  2317. 9.5
  2318. >>> x2 = x.reshape(5, 4)
  2319. >>> x2
  2320. array([[ 0, 1, 2, 3],
  2321. [ 4, 5, 6, 7],
  2322. [ 8, 9, 10, 11],
  2323. [12, 13, 14, 15],
  2324. [16, 17, 18, 19]])
  2325. >>> stats.trim_mean(x2, 0.25)
  2326. array([ 8., 9., 10., 11.])
  2327. >>> stats.trim_mean(x2, 0.25, axis=1)
  2328. array([ 1.5, 5.5, 9.5, 13.5, 17.5])
  2329. """
  2330. a = np.asarray(a)
  2331. if a.size == 0:
  2332. return np.nan
  2333. if axis is None:
  2334. a = a.ravel()
  2335. axis = 0
  2336. nobs = a.shape[axis]
  2337. lowercut = int(proportiontocut * nobs)
  2338. uppercut = nobs - lowercut
  2339. if (lowercut > uppercut):
  2340. raise ValueError("Proportion too big.")
  2341. atmp = np.partition(a, (lowercut, uppercut - 1), axis)
  2342. sl = [slice(None)] * atmp.ndim
  2343. sl[axis] = slice(lowercut, uppercut)
  2344. return np.mean(atmp[tuple(sl)], axis=axis)
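# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_trim_mean_example`, showing that for this input `trim_mean` agrees with
# taking the mean of the `trimboth` output, since both cut
# int(proportiontocut * nobs) values from each end.
def _trim_mean_example():
    x = np.arange(20)
    tm = trim_mean(x, 0.1)
    assert np.isclose(tm, trimboth(x, 0.1).mean())  # both cut 2 values per end
    return tm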
  2345. F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
  2346. def f_oneway(*args):
  2347. """
  2348. Performs a 1-way ANOVA.
  2349. The one-way ANOVA tests the null hypothesis that two or more groups have
  2350. the same population mean. The test is applied to samples from two or
  2351. more groups, possibly with differing sizes.
  2352. Parameters
  2353. ----------
  2354. sample1, sample2, ... : array_like
  2355. The sample measurements for each group.
  2356. Returns
  2357. -------
  2358. statistic : float
  2359. The computed F-value of the test.
  2360. pvalue : float
  2361. The associated p-value from the F-distribution.
  2362. Notes
  2363. -----
  2364. The ANOVA test has important assumptions that must be satisfied in order
  2365. for the associated p-value to be valid.
  2366. 1. The samples are independent.
  2367. 2. Each sample is from a normally distributed population.
  2368. 3. The population standard deviations of the groups are all equal. This
  2369. property is known as homoscedasticity.
  2370. If these assumptions are not true for a given set of data, it may still be
  2371. possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`) although
  2372. with some loss of power.
  2373. The algorithm is from Heiman[2], pp.394-7.
  2374. References
  2375. ----------
  2376. .. [1] Lowry, Richard. "Concepts and Applications of Inferential
  2377. Statistics". Chapter 14.
  2378. https://web.archive.org/web/20171027235250/http://vassarstats.net:80/textbook/ch14pt1.html
  2379. .. [2] Heiman, G.W. Research Methods in Statistics. 2002.
  2380. .. [3] McDonald, G. H. "Handbook of Biological Statistics", One-way ANOVA.
  2381. http://www.biostathandbook.com/onewayanova.html
  2382. Examples
  2383. --------
  2384. >>> import scipy.stats as stats
  2385. [3]_ Here are some data on a shell measurement (the length of the anterior
  2386. adductor muscle scar, standardized by dividing by length) in the mussel
  2387. Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
  2388. Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland, taken from a
  2389. much larger data set used in McDonald et al. (1991).
  2390. >>> tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
  2391. ... 0.0659, 0.0923, 0.0836]
  2392. >>> newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
  2393. ... 0.0725]
  2394. >>> petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
  2395. >>> magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
  2396. ... 0.0689]
  2397. >>> tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
  2398. >>> stats.f_oneway(tillamook, newport, petersburg, magadan, tvarminne)
  2399. (7.1210194716424473, 0.00028122423145345439)
  2400. """
  2401. args = [np.asarray(arg, dtype=float) for arg in args]
  2402. # ANOVA on N groups, each in its own array
  2403. num_groups = len(args)
  2404. alldata = np.concatenate(args)
  2405. bign = len(alldata)
  2406. # Determine the mean of the data, and subtract that from all inputs to a
# variance (via sum_of_sq / sq_of_sum) calculation. Variance is invariant
  2408. # to a shift in location, and centering all data around zero vastly
  2409. # improves numerical stability.
  2410. offset = alldata.mean()
  2411. alldata -= offset
  2412. sstot = _sum_of_squares(alldata) - (_square_of_sums(alldata) / bign)
  2413. ssbn = 0
  2414. for a in args:
  2415. ssbn += _square_of_sums(a - offset) / len(a)
  2416. # Naming: variables ending in bn/b are for "between treatments", wn/w are
  2417. # for "within treatments"
  2418. ssbn -= _square_of_sums(alldata) / bign
  2419. sswn = sstot - ssbn
  2420. dfbn = num_groups - 1
  2421. dfwn = bign - num_groups
  2422. msb = ssbn / dfbn
  2423. msw = sswn / dfwn
  2424. f = msb / msw
  2425. prob = special.fdtrc(dfbn, dfwn, f) # equivalent to stats.f.sf
  2426. return F_onewayResult(f, prob)
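# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_f_oneway_example`, recomputing the F statistic for two made-up groups from
# the between-group and within-group sums of squares, the same decomposition
# used (in shifted form) by `f_oneway` above.
def _f_oneway_example():
    g1 = np.array([6., 8., 4., 5., 3., 4.])
    g2 = np.array([8., 12., 9., 11., 6., 8.])
    groups = [g1, g2]
    grand_mean = np.concatenate(groups).mean()
    ssbn = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups)
    sswn = sum(((g - g.mean()) ** 2).sum() for g in groups)
    dfbn, dfwn = len(groups) - 1, sum(len(g) for g in groups) - len(groups)
    f_manual = (ssbn / dfbn) / (sswn / dfwn)
    f, p = f_oneway(g1, g2)
    assert np.isclose(f, f_manual)
    return f, p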
  2427. def pearsonr(x, y):
  2428. r"""
  2429. Calculate a Pearson correlation coefficient and the p-value for testing
  2430. non-correlation.
  2431. The Pearson correlation coefficient measures the linear relationship
  2432. between two datasets. Strictly speaking, Pearson's correlation requires
  2433. that each dataset be normally distributed, and not necessarily zero-mean.
  2434. Like other correlation coefficients, this one varies between -1 and +1
  2435. with 0 implying no correlation. Correlations of -1 or +1 imply an exact
  2436. linear relationship. Positive correlations imply that as x increases, so
  2437. does y. Negative correlations imply that as x increases, y decreases.
  2438. The p-value roughly indicates the probability of an uncorrelated system
  2439. producing datasets that have a Pearson correlation at least as extreme
  2440. as the one computed from these datasets. The p-values are not entirely
  2441. reliable but are probably reasonable for datasets larger than 500 or so.
  2442. Parameters
  2443. ----------
  2444. x : (N,) array_like
  2445. Input
  2446. y : (N,) array_like
  2447. Input
  2448. Returns
  2449. -------
  2450. r : float
  2451. Pearson's correlation coefficient
  2452. p-value : float
  2453. 2-tailed p-value
  2454. Notes
  2455. -----
  2456. The correlation coefficient is calculated as follows:
.. math::

    r = \frac{\sum (x - m_x) (y - m_y)}
             {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

  2460. where :math:`m_x` is the mean of the vector :math:`x` and :math:`m_y` is
  2461. the mean of the vector :math:`y`.
  2462. Under the assumption that x and y are drawn from independent normal
  2463. distributions (so the population correlation coefficient is 0), the
  2464. probability density function of the sample correlation coefficient r
is ([1]_, [2]_)::

           (1 - r**2)**(n/2 - 2)
    f(r) = ---------------------
              B(1/2, n/2 - 1)

  2469. where n is the number of samples, and B is the beta function. This
  2470. is sometimes referred to as the exact distribution of r. This is
  2471. the distribution that is used in `pearsonr` to compute the p-value.
  2472. The distribution is a beta distribution on the interval [-1, 1],
  2473. with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
  2474. implementation of the beta distribution, the distribution of r is::
  2475. dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
  2476. The p-value returned by `pearsonr` is a two-sided p-value. For a
  2477. given sample with correlation coefficient r, the p-value is
  2478. the probability that abs(r') of a random sample x' and y' drawn from
  2479. the population with zero correlation would be greater than or equal
  2480. to abs(r). In terms of the object ``dist`` shown above, the p-value
  2481. for a given r and length n can be computed as::
  2482. p = 2*dist.cdf(-abs(r))
  2483. When n is 2, the above continuous distribution is not well-defined.
  2484. One can interpret the limit of the beta distribution as the shape
  2485. parameters a and b approach a = b = 0 as a discrete distribution with
  2486. equal probability masses at r = 1 and r = -1. More directly, one
  2487. can observe that, given the data x = [x1, x2] and y = [y1, y2], and
  2488. assuming x1 != x2 and y1 != y2, the only possible values for r are 1
  2489. and -1. Because abs(r') for any sample x' and y' with length 2 will
  2490. be 1, the two-sided p-value for a sample of length 2 is always 1.
  2491. References
  2492. ----------
  2493. .. [1] "Pearson correlation coefficient", Wikipedia,
  2494. https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
  2495. .. [2] Student, "Probable error of a correlation coefficient",
  2496. Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
  2497. Examples
  2498. --------
  2499. >>> from scipy import stats
  2500. >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
  2501. >>> b = np.arange(7)
  2502. >>> stats.pearsonr(a, b)
  2503. (0.8660254037844386, 0.011724811003954654)
  2504. >>> stats.pearsonr([1,2,3,4,5], [5,6,7,8,7])
  2505. (0.83205029433784372, 0.080509573298498519)
  2506. """
  2507. # x and y should have same length.
  2508. x = np.asarray(x)
  2509. y = np.asarray(y)
  2510. n = len(x)
  2511. mx = x.mean()
  2512. my = y.mean()
  2513. xm, ym = x - mx, y - my
  2514. r_num = np.add.reduce(xm * ym)
  2515. r_den = np.sqrt(_sum_of_squares(xm) * _sum_of_squares(ym))
  2516. r = r_num / r_den
  2517. # Presumably, if abs(r) > 1, then it is only some small artifact of
  2518. # floating point arithmetic.
  2519. r = max(min(r, 1.0), -1.0)
  2520. df = n - 2
  2521. if abs(r) == 1.0:
  2522. prob = 0.0
  2523. else:
  2524. t_squared = r**2 * (df / ((1.0 - r) * (1.0 + r)))
  2525. prob = special.betainc(
  2526. 0.5*df, 0.5, np.fmin(np.asarray(df / (df + t_squared)), 1.0)
  2527. )
  2528. return r, prob
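# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_pearsonr_example`, evaluating the exact beta-distribution form of the
# p-value described in the Notes above and comparing it with the value
# returned by `pearsonr`.
def _pearsonr_example():
    x = np.array([1., 2., 3., 4., 5.])
    y = np.array([5., 6., 7., 8., 7.])
    r, p = pearsonr(x, y)
    n = len(x)
    dist = distributions.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
    assert np.isclose(p, 2 * dist.cdf(-abs(r)))
    return r, p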
  2529. def fisher_exact(table, alternative='two-sided'):
  2530. """Performs a Fisher exact test on a 2x2 contingency table.
  2531. Parameters
  2532. ----------
  2533. table : array_like of ints
  2534. A 2x2 contingency table. Elements should be non-negative integers.
  2535. alternative : {'two-sided', 'less', 'greater'}, optional
  2536. Which alternative hypothesis to the null hypothesis the test uses.
  2537. Default is 'two-sided'.
  2538. Returns
  2539. -------
  2540. oddsratio : float
This is the prior odds ratio, not a posterior estimate.
  2542. p_value : float
  2543. P-value, the probability of obtaining a distribution at least as
  2544. extreme as the one that was actually observed, assuming that the
  2545. null hypothesis is true.
  2546. See Also
  2547. --------
  2548. chi2_contingency : Chi-square test of independence of variables in a
  2549. contingency table.
  2550. Notes
  2551. -----
  2552. The calculated odds ratio is different from the one R uses. This scipy
  2553. implementation returns the (more common) "unconditional Maximum
  2554. Likelihood Estimate", while R uses the "conditional Maximum Likelihood
  2555. Estimate".
  2556. For tables with large numbers, the (inexact) chi-square test implemented
  2557. in the function `chi2_contingency` can also be used.
  2558. Examples
  2559. --------
  2560. Say we spend a few days counting whales and sharks in the Atlantic and
  2561. Indian oceans. In the Atlantic ocean we find 8 whales and 1 shark, in the
Indian ocean 2 whales and 5 sharks. Then our contingency table is::

                Atlantic  Indian
    whales         8        2
    sharks         1        5

  2566. We use this table to find the p-value:
  2567. >>> import scipy.stats as stats
  2568. >>> oddsratio, pvalue = stats.fisher_exact([[8, 2], [1, 5]])
  2569. >>> pvalue
  2570. 0.0349...
  2571. The probability that we would observe this or an even more imbalanced ratio
  2572. by chance is about 3.5%. A commonly used significance level is 5%--if we
  2573. adopt that, we can therefore conclude that our observed imbalance is
  2574. statistically significant; whales prefer the Atlantic while sharks prefer
  2575. the Indian ocean.
  2576. """
  2577. hypergeom = distributions.hypergeom
  2578. c = np.asarray(table, dtype=np.int64) # int32 is not enough for the algorithm
  2579. if not c.shape == (2, 2):
  2580. raise ValueError("The input `table` must be of shape (2, 2).")
  2581. if np.any(c < 0):
  2582. raise ValueError("All values in `table` must be nonnegative.")
  2583. if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
  2584. # If both values in a row or column are zero, the p-value is 1 and
  2585. # the odds ratio is NaN.
  2586. return np.nan, 1.0
  2587. if c[1, 0] > 0 and c[0, 1] > 0:
  2588. oddsratio = c[0, 0] * c[1, 1] / (c[1, 0] * c[0, 1])
  2589. else:
  2590. oddsratio = np.inf
  2591. n1 = c[0, 0] + c[0, 1]
  2592. n2 = c[1, 0] + c[1, 1]
  2593. n = c[0, 0] + c[1, 0]
  2594. def binary_search(n, n1, n2, side):
  2595. """Binary search for where to begin lower/upper halves in two-sided
  2596. test.
  2597. """
  2598. if side == "upper":
  2599. minval = mode
  2600. maxval = n
  2601. else:
  2602. minval = 0
  2603. maxval = mode
  2604. guess = -1
  2605. while maxval - minval > 1:
  2606. if maxval == minval + 1 and guess == minval:
  2607. guess = maxval
  2608. else:
  2609. guess = (maxval + minval) // 2
  2610. pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
  2611. if side == "upper":
  2612. ng = guess - 1
  2613. else:
  2614. ng = guess + 1
  2615. if pguess <= pexact < hypergeom.pmf(ng, n1 + n2, n1, n):
  2616. break
  2617. elif pguess < pexact:
  2618. maxval = guess
  2619. else:
  2620. minval = guess
  2621. if guess == -1:
  2622. guess = minval
  2623. if side == "upper":
  2624. while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
  2625. guess -= 1
  2626. while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
  2627. guess += 1
  2628. else:
  2629. while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
  2630. guess += 1
  2631. while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
  2632. guess -= 1
  2633. return guess
  2634. if alternative == 'less':
  2635. pvalue = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
  2636. elif alternative == 'greater':
  2637. # Same formula as the 'less' case, but with the second column.
  2638. pvalue = hypergeom.cdf(c[0, 1], n1 + n2, n1, c[0, 1] + c[1, 1])
  2639. elif alternative == 'two-sided':
  2640. mode = int((n + 1) * (n1 + 1) / (n1 + n2 + 2))
  2641. pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n)
  2642. pmode = hypergeom.pmf(mode, n1 + n2, n1, n)
  2643. epsilon = 1 - 1e-4
  2644. if np.abs(pexact - pmode) / np.maximum(pexact, pmode) <= 1 - epsilon:
  2645. return oddsratio, 1.
  2646. elif c[0, 0] < mode:
  2647. plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
  2648. if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
  2649. return oddsratio, plower
  2650. guess = binary_search(n, n1, n2, "upper")
  2651. pvalue = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
  2652. else:
  2653. pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n)
  2654. if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
  2655. return oddsratio, pupper
  2656. guess = binary_search(n, n1, n2, "lower")
  2657. pvalue = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
  2658. else:
  2659. msg = "`alternative` should be one of {'two-sided', 'less', 'greater'}"
  2660. raise ValueError(msg)
  2661. pvalue = min(pvalue, 1.0)
  2662. return oddsratio, pvalue
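# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_fisher_exact_example`, showing that for the docstring's whales/sharks
# table the one-sided 'less' p-value is just the hypergeometric CDF at the
# observed [0, 0] cell, exactly as computed in the code above.
def _fisher_exact_example():
    table = np.array([[8, 2], [1, 5]])
    _, p_less = fisher_exact(table, alternative='less')
    n1, n2 = table[0].sum(), table[1].sum()   # row totals
    n = table[:, 0].sum()                     # first-column total
    p_manual = distributions.hypergeom.cdf(table[0, 0], n1 + n2, n1, n)
    assert np.isclose(p_less, p_manual)
    return p_less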
  2663. SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))
  2664. def spearmanr(a, b=None, axis=0, nan_policy='propagate'):
  2665. """
  2666. Calculate a Spearman rank-order correlation coefficient and the p-value
  2667. to test for non-correlation.
  2668. The Spearman correlation is a nonparametric measure of the monotonicity
  2669. of the relationship between two datasets. Unlike the Pearson correlation,
  2670. the Spearman correlation does not assume that both datasets are normally
  2671. distributed. Like other correlation coefficients, this one varies
  2672. between -1 and +1 with 0 implying no correlation. Correlations of -1 or
  2673. +1 imply an exact monotonic relationship. Positive correlations imply that
  2674. as x increases, so does y. Negative correlations imply that as x
  2675. increases, y decreases.
  2676. The p-value roughly indicates the probability of an uncorrelated system
  2677. producing datasets that have a Spearman correlation at least as extreme
  2678. as the one computed from these datasets. The p-values are not entirely
  2679. reliable but are probably reasonable for datasets larger than 500 or so.
  2680. Parameters
  2681. ----------
  2682. a, b : 1D or 2D array_like, b is optional
  2683. One or two 1-D or 2-D arrays containing multiple variables and
  2684. observations. When these are 1-D, each represents a vector of
  2685. observations of a single variable. For the behavior in the 2-D case,
  2686. see under ``axis``, below.
  2687. Both arrays need to have the same length in the ``axis`` dimension.
  2688. axis : int or None, optional
  2689. If axis=0 (default), then each column represents a variable, with
  2690. observations in the rows. If axis=1, the relationship is transposed:
  2691. each row represents a variable, while the columns contain observations.
  2692. If axis=None, then both arrays will be raveled.
  2693. nan_policy : {'propagate', 'raise', 'omit'}, optional
  2694. Defines how to handle when input contains nan. 'propagate' returns nan,
  2695. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  2696. values. Default is 'propagate'.
  2697. Returns
  2698. -------
  2699. correlation : float or ndarray (2-D square)
Spearman correlation matrix or correlation coefficient (if only 2
variables are given as parameters). Correlation matrix is square with
  2702. length equal to total number of variables (columns or rows) in ``a``
  2703. and ``b`` combined.
  2704. pvalue : float
  2705. The two-sided p-value for a hypothesis test whose null hypothesis is
  2706. that two sets of data are uncorrelated, has same dimension as rho.
  2707. References
  2708. ----------
  2709. .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
  2710. Probability and Statistics Tables and Formulae. Chapman & Hall: New
  2711. York. 2000.
  2712. Section 14.7
  2713. Examples
  2714. --------
  2715. >>> from scipy import stats
  2716. >>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
  2717. (0.82078268166812329, 0.088587005313543798)
  2718. >>> np.random.seed(1234321)
  2719. >>> x2n = np.random.randn(100, 2)
  2720. >>> y2n = np.random.randn(100, 2)
  2721. >>> stats.spearmanr(x2n)
  2722. (0.059969996999699973, 0.55338590803773591)
  2723. >>> stats.spearmanr(x2n[:,0], x2n[:,1])
  2724. (0.059969996999699973, 0.55338590803773591)
  2725. >>> rho, pval = stats.spearmanr(x2n, y2n)
  2726. >>> rho
  2727. array([[ 1. , 0.05997 , 0.18569457, 0.06258626],
  2728. [ 0.05997 , 1. , 0.110003 , 0.02534653],
  2729. [ 0.18569457, 0.110003 , 1. , 0.03488749],
  2730. [ 0.06258626, 0.02534653, 0.03488749, 1. ]])
  2731. >>> pval
  2732. array([[ 0. , 0.55338591, 0.06435364, 0.53617935],
  2733. [ 0.55338591, 0. , 0.27592895, 0.80234077],
  2734. [ 0.06435364, 0.27592895, 0. , 0.73039992],
  2735. [ 0.53617935, 0.80234077, 0.73039992, 0. ]])
  2736. >>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
  2737. >>> rho
  2738. array([[ 1. , 0.05997 , 0.18569457, 0.06258626],
  2739. [ 0.05997 , 1. , 0.110003 , 0.02534653],
  2740. [ 0.18569457, 0.110003 , 1. , 0.03488749],
  2741. [ 0.06258626, 0.02534653, 0.03488749, 1. ]])
  2742. >>> stats.spearmanr(x2n, y2n, axis=None)
  2743. (0.10816770419260482, 0.1273562188027364)
  2744. >>> stats.spearmanr(x2n.ravel(), y2n.ravel())
  2745. (0.10816770419260482, 0.1273562188027364)
  2746. >>> xint = np.random.randint(10, size=(100, 2))
  2747. >>> stats.spearmanr(xint)
  2748. (0.052760927029710199, 0.60213045837062351)
  2749. """
  2750. a, axisout = _chk_asarray(a, axis)
  2751. if a.ndim > 2:
  2752. raise ValueError("spearmanr only handles 1-D or 2-D arrays")
  2753. if b is None:
  2754. if a.ndim < 2:
  2755. raise ValueError("`spearmanr` needs at least 2 variables to compare")
  2756. else:
  2757. # Concatenate a and b, so that we now only have to handle the case
  2758. # of a 2-D `a`.
  2759. b, _ = _chk_asarray(b, axis)
  2760. if axisout == 0:
  2761. a = np.column_stack((a, b))
  2762. else:
  2763. a = np.row_stack((a, b))
  2764. n_vars = a.shape[1 - axisout]
  2765. n_obs = a.shape[axisout]
  2766. if n_obs <= 1:
  2767. # Handle empty arrays or single observations.
  2768. return SpearmanrResult(np.nan, np.nan)
  2769. a_contains_nan, nan_policy = _contains_nan(a, nan_policy)
  2770. variable_has_nan = np.zeros(n_vars, dtype=bool)
  2771. if a_contains_nan:
  2772. if nan_policy == 'omit':
  2773. return mstats_basic.spearmanr(a, axis=axis, nan_policy=nan_policy)
  2774. elif nan_policy == 'propagate':
  2775. if a.ndim == 1 or n_vars <= 2:
  2776. return SpearmanrResult(np.nan, np.nan)
  2777. else:
  2778. # Keep track of variables with NaNs, set the outputs to NaN
  2779. # only for those variables
variable_has_nan = np.isnan(a).any(axis=axisout)
  2781. a_ranked = np.apply_along_axis(rankdata, axisout, a)
  2782. rs = np.corrcoef(a_ranked, rowvar=axisout)
  2783. dof = n_obs - 2 # degrees of freedom
  2784. # rs can have elements equal to 1, so avoid zero division warnings
  2785. olderr = np.seterr(divide='ignore')
  2786. try:
  2787. # clip the small negative values possibly caused by rounding
  2788. # errors before taking the square root
  2789. t = rs * np.sqrt((dof/((rs+1.0)*(1.0-rs))).clip(0))
  2790. finally:
  2791. np.seterr(**olderr)
  2792. prob = 2 * distributions.t.sf(np.abs(t), dof)
  2793. # For backwards compatibility, return scalars when comparing 2 columns
  2794. if rs.shape == (2, 2):
  2795. return SpearmanrResult(rs[1, 0], prob[1, 0])
  2796. else:
  2797. rs[variable_has_nan, :] = np.nan
  2798. rs[:, variable_has_nan] = np.nan
  2799. return SpearmanrResult(rs, prob)
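# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_spearmanr_example`, showing that for 1-D inputs the Spearman coefficient
# equals the Pearson coefficient of the rank-transformed data.
def _spearmanr_example():
    x = [1, 2, 3, 4, 5]
    y = [5, 6, 7, 8, 7]
    rho, _ = spearmanr(x, y)
    r_of_ranks, _ = pearsonr(rankdata(x), rankdata(y))
    assert np.isclose(rho, r_of_ranks)
    return rho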
  2800. PointbiserialrResult = namedtuple('PointbiserialrResult',
  2801. ('correlation', 'pvalue'))
  2802. def pointbiserialr(x, y):
  2803. r"""
  2804. Calculate a point biserial correlation coefficient and its p-value.
  2805. The point biserial correlation is used to measure the relationship
  2806. between a binary variable, x, and a continuous variable, y. Like other
  2807. correlation coefficients, this one varies between -1 and +1 with 0
  2808. implying no correlation. Correlations of -1 or +1 imply a determinative
  2809. relationship.
  2810. This function uses a shortcut formula but produces the same result as
  2811. `pearsonr`.
  2812. Parameters
  2813. ----------
  2814. x : array_like of bools
  2815. Input array.
  2816. y : array_like
  2817. Input array.
  2818. Returns
  2819. -------
  2820. correlation : float
  2821. R value
  2822. pvalue : float
  2823. 2-tailed p-value
  2824. Notes
  2825. -----
`pointbiserialr` uses a t-test with ``n-2`` degrees of freedom.
It is equivalent to `pearsonr`.
  2828. The value of the point-biserial correlation can be calculated from:
.. math::

    r_{pb} = \frac{\overline{Y_{1}} - \overline{Y_{0}}}{s_{y}}
             \sqrt{\frac{N_{0} N_{1}}{N (N - 1)}}

  2832. Where :math:`Y_{0}` and :math:`Y_{1}` are means of the metric
  2833. observations coded 0 and 1 respectively; :math:`N_{0}` and :math:`N_{1}`
  2834. are number of observations coded 0 and 1 respectively; :math:`N` is the
  2835. total number of observations and :math:`s_{y}` is the standard
  2836. deviation of all the metric observations.
  2837. A value of :math:`r_{pb}` that is significantly different from zero is
  2838. completely equivalent to a significant difference in means between the two
groups. Thus, an independent groups t-test with :math:`N-2` degrees of
  2840. freedom may be used to test whether :math:`r_{pb}` is nonzero. The
  2841. relation between the t-statistic for comparing two independent groups and
  2842. :math:`r_{pb}` is given by:
  2843. .. math::
  2844. t = \sqrt{N - 2}\frac{r_{pb}}{\sqrt{1 - r^{2}_{pb}}}
  2845. References
  2846. ----------
  2847. .. [1] J. Lev, "The Point Biserial Coefficient of Correlation", Ann. Math.
  2848. Statist., Vol. 20, no.1, pp. 125-126, 1949.
  2849. .. [2] R.F. Tate, "Correlation Between a Discrete and a Continuous
  2850. Variable. Point-Biserial Correlation.", Ann. Math. Statist., Vol. 25,
no. 3, pp. 603-607, 1954.
  2852. .. [3] D. Kornbrot "Point Biserial Correlation", In Wiley StatsRef:
  2853. Statistics Reference Online (eds N. Balakrishnan, et al.), 2014.
  2854. https://doi.org/10.1002/9781118445112.stat06227
  2855. Examples
  2856. --------
  2857. >>> from scipy import stats
  2858. >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
  2859. >>> b = np.arange(7)
  2860. >>> stats.pointbiserialr(a, b)
  2861. (0.8660254037844386, 0.011724811003954652)
  2862. >>> stats.pearsonr(a, b)
  2863. (0.86602540378443871, 0.011724811003954626)
  2864. >>> np.corrcoef(a, b)
  2865. array([[ 1. , 0.8660254],
  2866. [ 0.8660254, 1. ]])
  2867. """
  2868. rpb, prob = pearsonr(x, y)
  2869. return PointbiserialrResult(rpb, prob)
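# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_pointbiserialr_example`, recovering the p-value from the t-statistic
# relation given in the Notes, t = sqrt(N - 2) * r_pb / sqrt(1 - r_pb**2),
# with N - 2 degrees of freedom and a two-sided tail.
def _pointbiserialr_example():
    x = np.array([0, 0, 0, 1, 1, 1, 1])
    y = np.arange(7)
    r, p = pointbiserialr(x, y)
    t = np.sqrt(len(y) - 2) * r / np.sqrt(1 - r**2)
    p_from_t = 2 * distributions.t.sf(abs(t), len(y) - 2)
    assert np.isclose(p, p_from_t)
    return r, p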
  2870. KendalltauResult = namedtuple('KendalltauResult', ('correlation', 'pvalue'))
  2871. def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto'):
  2872. """
  2873. Calculate Kendall's tau, a correlation measure for ordinal data.
  2874. Kendall's tau is a measure of the correspondence between two rankings.
  2875. Values close to 1 indicate strong agreement, values close to -1 indicate
  2876. strong disagreement. This is the 1945 "tau-b" version of Kendall's
  2877. tau [2]_, which can account for ties and which reduces to the 1938 "tau-a"
  2878. version [1]_ in absence of ties.
  2879. Parameters
  2880. ----------
  2881. x, y : array_like
  2882. Arrays of rankings, of the same shape. If arrays are not 1-D, they will
  2883. be flattened to 1-D.
  2884. initial_lexsort : bool, optional
  2885. Unused (deprecated).
  2886. nan_policy : {'propagate', 'raise', 'omit'}, optional
  2887. Defines how to handle when input contains nan. 'propagate' returns nan,
  2888. 'raise' throws an error, 'omit' performs the calculations ignoring nan
values. Default is 'propagate'. Note that if the input contains nan,
'omit' delegates to mstats_basic.kendalltau(), which has a different
implementation.
method : {'auto', 'asymptotic', 'exact'}, optional
  2893. Defines which method is used to calculate the p-value [5]_.
  2894. 'asymptotic' uses a normal approximation valid for large samples.
  2895. 'exact' computes the exact p-value, but can only be used if no ties
  2896. are present. 'auto' is the default and selects the appropriate
  2897. method based on a trade-off between speed and accuracy.
  2898. Returns
  2899. -------
  2900. correlation : float
  2901. The tau statistic.
  2902. pvalue : float
  2903. The two-sided p-value for a hypothesis test whose null hypothesis is
  2904. an absence of association, tau = 0.
  2905. See also
  2906. --------
  2907. spearmanr : Calculates a Spearman rank-order correlation coefficient.
  2908. theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
  2909. weightedtau : Computes a weighted version of Kendall's tau.
  2910. Notes
  2911. -----
  2912. The definition of Kendall's tau that is used is [2]_::
  2913. tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
  2914. where P is the number of concordant pairs, Q the number of discordant
  2915. pairs, T the number of ties only in `x`, and U the number of ties only in
  2916. `y`. If a tie occurs for the same pair in both `x` and `y`, it is not
  2917. added to either T or U.
  2918. References
  2919. ----------
  2920. .. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika
  2921. Vol. 30, No. 1/2, pp. 81-93, 1938.
  2922. .. [2] Maurice G. Kendall, "The treatment of ties in ranking problems",
  2923. Biometrika Vol. 33, No. 3, pp. 239-251. 1945.
  2924. .. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John
  2925. Wiley & Sons, 1967.
  2926. .. [4] Peter M. Fenwick, "A new data structure for cumulative frequency
  2927. tables", Software: Practice and Experience, Vol. 24, No. 3,
  2928. pp. 327-336, 1994.
  2929. .. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
  2930. Charles Griffin & Co., 1970.
  2931. Examples
  2932. --------
  2933. >>> from scipy import stats
  2934. >>> x1 = [12, 2, 1, 12, 2]
  2935. >>> x2 = [1, 4, 7, 1, 0]
  2936. >>> tau, p_value = stats.kendalltau(x1, x2)
  2937. >>> tau
  2938. -0.47140452079103173
  2939. >>> p_value
  2940. 0.2827454599327748
  2941. """
  2942. x = np.asarray(x).ravel()
  2943. y = np.asarray(y).ravel()
  2944. if x.size != y.size:
  2945. raise ValueError("All inputs to `kendalltau` must be of the same size, "
  2946. "found x-size %s and y-size %s" % (x.size, y.size))
  2947. elif not x.size or not y.size:
  2948. return KendalltauResult(np.nan, np.nan) # Return NaN if arrays are empty
  2949. # check both x and y
  2950. cnx, npx = _contains_nan(x, nan_policy)
  2951. cny, npy = _contains_nan(y, nan_policy)
  2952. contains_nan = cnx or cny
  2953. if npx == 'omit' or npy == 'omit':
  2954. nan_policy = 'omit'
  2955. if contains_nan and nan_policy == 'propagate':
  2956. return KendalltauResult(np.nan, np.nan)
  2957. elif contains_nan and nan_policy == 'omit':
  2958. x = ma.masked_invalid(x)
  2959. y = ma.masked_invalid(y)
  2960. return mstats_basic.kendalltau(x, y, method=method)
if initial_lexsort is not None:  # deprecated; the keyword is ignored
warnings.warn('"initial_lexsort" is deprecated and is being ignored.')
  2963. def count_rank_tie(ranks):
  2964. cnt = np.bincount(ranks).astype('int64', copy=False)
  2965. cnt = cnt[cnt > 1]
  2966. return ((cnt * (cnt - 1) // 2).sum(),
  2967. (cnt * (cnt - 1.) * (cnt - 2)).sum(),
  2968. (cnt * (cnt - 1.) * (2*cnt + 5)).sum())
  2969. size = x.size
  2970. perm = np.argsort(y) # sort on y and convert y to dense ranks
  2971. x, y = x[perm], y[perm]
  2972. y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)
  2973. # stable sort on x and convert x to dense ranks
  2974. perm = np.argsort(x, kind='mergesort')
  2975. x, y = x[perm], y[perm]
  2976. x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)
  2977. dis = _kendall_dis(x, y) # discordant pairs
  2978. obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
  2979. cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
  2980. ntie = (cnt * (cnt - 1) // 2).sum() # joint ties
  2981. xtie, x0, x1 = count_rank_tie(x) # ties in x, stats
  2982. ytie, y0, y1 = count_rank_tie(y) # ties in y, stats
  2983. tot = (size * (size - 1)) // 2
  2984. if xtie == tot or ytie == tot:
  2985. return KendalltauResult(np.nan, np.nan)
  2986. # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
  2987. # = con + dis + xtie + ytie - ntie
  2988. con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
  2989. tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie)
  2990. # Limit range to fix computational errors
  2991. tau = min(1., max(-1., tau))
  2992. if method == 'exact' and (xtie != 0 or ytie != 0):
  2993. raise ValueError("Ties found, exact method cannot be used.")
  2994. if method == 'auto':
  2995. if (xtie == 0 and ytie == 0) and (size <= 33 or min(dis, tot-dis) <= 1):
  2996. method = 'exact'
  2997. else:
  2998. method = 'asymptotic'
  2999. if xtie == 0 and ytie == 0 and method == 'exact':
  3000. # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods" (4th Edition), Charles Griffin & Co., 1970.
  3001. c = min(dis, tot-dis)
  3002. if size <= 0:
  3003. raise ValueError
  3004. elif c < 0 or 2*c > size*(size-1):
  3005. raise ValueError
  3006. elif size == 1:
  3007. pvalue = 1.0
  3008. elif size == 2:
  3009. pvalue = 1.0
  3010. elif c == 0:
  3011. pvalue = 2.0/math.factorial(size) if size < 171 else 0.0
  3012. elif c == 1:
  3013. pvalue = 2.0/math.factorial(size-1) if (size-1) < 171 else 0.0
  3014. else:
  3015. new = [0.0]*(c+1)
  3016. new[0] = 1.0
  3017. new[1] = 1.0
  3018. for j in range(3,size+1):
  3019. old = new[:]
  3020. for k in range(1,min(j,c+1)):
  3021. new[k] += new[k-1]
  3022. for k in range(j,c+1):
  3023. new[k] += new[k-1] - old[k-j]
  3024. pvalue = 2.0*sum(new)/math.factorial(size) if size < 171 else 0.0
  3025. elif method == 'asymptotic':
  3026. # con_minus_dis is approx normally distributed with this variance [3]_
  3027. var = (size * (size - 1) * (2.*size + 5) - x1 - y1) / 18. + (
  3028. 2. * xtie * ytie) / (size * (size - 1)) + x0 * y0 / (9. *
  3029. size * (size - 1) * (size - 2))
  3030. pvalue = special.erfc(np.abs(con_minus_dis) / np.sqrt(var) / np.sqrt(2))
  3031. else:
  3032. raise ValueError("Unknown method "+str(method)+" specified, please use auto, exact or asymptotic.")
  3033. return KendalltauResult(tau, pvalue)
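# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_kendalltau_example`, evaluating the tau-b definition quoted in the Notes by
# brute force, counting concordant/discordant pairs and single-variable ties.
def _kendalltau_example():
    x = np.array([12, 2, 1, 12, 2])
    y = np.array([1, 4, 7, 1, 0])
    con = dis = tx = ty = 0
    for i in range(len(x)):
        for j in range(i + 1, len(x)):
            dx, dy = x[i] - x[j], y[i] - y[j]
            if dx * dy > 0:
                con += 1          # concordant pair
            elif dx * dy < 0:
                dis += 1          # discordant pair
            elif dx == 0 and dy != 0:
                tx += 1           # tie in x only
            elif dy == 0 and dx != 0:
                ty += 1           # tie in y only
    tau_b = (con - dis) / np.sqrt((con + dis + tx) * (con + dis + ty))
    tau, _ = kendalltau(x, y)
    assert np.isclose(tau, tau_b)
    return tau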
  3034. WeightedTauResult = namedtuple('WeightedTauResult', ('correlation', 'pvalue'))
  3035. def weightedtau(x, y, rank=True, weigher=None, additive=True):
  3036. r"""
  3037. Compute a weighted version of Kendall's :math:`\tau`.
  3038. The weighted :math:`\tau` is a weighted version of Kendall's
  3039. :math:`\tau` in which exchanges of high weight are more influential than
  3040. exchanges of low weight. The default parameters compute the additive
  3041. hyperbolic version of the index, :math:`\tau_\mathrm h`, which has
  3042. been shown to provide the best balance between important and
  3043. unimportant elements [1]_.
  3044. The weighting is defined by means of a rank array, which assigns a
  3045. nonnegative rank to each element, and a weigher function, which
assigns a weight to each element based on its rank. The weight of an
  3047. exchange is then the sum or the product of the weights of the ranks of
  3048. the exchanged elements. The default parameters compute
  3049. :math:`\tau_\mathrm h`: an exchange between elements with rank
  3050. :math:`r` and :math:`s` (starting from zero) has weight
  3051. :math:`1/(r+1) + 1/(s+1)`.
  3052. Specifying a rank array is meaningful only if you have in mind an
external criterion of importance. If, as is usually the case, you do
not have a specific ranking in mind, the weighted :math:`\tau` is
  3055. defined by averaging the values obtained using the decreasing
  3056. lexicographical rank by (`x`, `y`) and by (`y`, `x`). This is the
  3057. behavior with default parameters.
  3058. Note that if you are computing the weighted :math:`\tau` on arrays of
  3059. ranks, rather than of scores (i.e., a larger value implies a lower
  3060. rank) you must negate the ranks, so that elements of higher rank are
  3061. associated with a larger value.
  3062. Parameters
  3063. ----------
  3064. x, y : array_like
  3065. Arrays of scores, of the same shape. If arrays are not 1-D, they will
  3066. be flattened to 1-D.
rank : array_like of ints or bool, optional
  3068. A nonnegative rank assigned to each element. If it is None, the
  3069. decreasing lexicographical rank by (`x`, `y`) will be used: elements of
  3070. higher rank will be those with larger `x`-values, using `y`-values to
  3071. break ties (in particular, swapping `x` and `y` will give a different
  3072. result). If it is False, the element indices will be used
  3073. directly as ranks. The default is True, in which case this
  3074. function returns the average of the values obtained using the
  3075. decreasing lexicographical rank by (`x`, `y`) and by (`y`, `x`).
  3076. weigher : callable, optional
  3077. The weigher function. Must map nonnegative integers (zero
  3078. representing the most important element) to a nonnegative weight.
The default, None, provides hyperbolic weighting, that is,
  3080. rank :math:`r` is mapped to weight :math:`1/(r+1)`.
  3081. additive : bool, optional
  3082. If True, the weight of an exchange is computed by adding the
  3083. weights of the ranks of the exchanged elements; otherwise, the weights
  3084. are multiplied. The default is True.
  3085. Returns
  3086. -------
  3087. correlation : float
  3088. The weighted :math:`\tau` correlation index.
  3089. pvalue : float
Presently ``np.nan``, as the null distribution of the statistic is unknown (even in the
  3091. additive hyperbolic case).
  3092. See also
  3093. --------
  3094. kendalltau : Calculates Kendall's tau.
  3095. spearmanr : Calculates a Spearman rank-order correlation coefficient.
  3096. theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
  3097. Notes
  3098. -----
  3099. This function uses an :math:`O(n \log n)`, mergesort-based algorithm
  3100. [1]_ that is a weighted extension of Knight's algorithm for Kendall's
  3101. :math:`\tau` [2]_. It can compute Shieh's weighted :math:`\tau` [3]_
  3102. between rankings without ties (i.e., permutations) by setting
  3103. `additive` and `rank` to False, as the definition given in [1]_ is a
  3104. generalization of Shieh's.
  3105. NaNs are considered the smallest possible score.
  3106. .. versionadded:: 0.19.0
  3107. References
  3108. ----------
  3109. .. [1] Sebastiano Vigna, "A weighted correlation index for rankings with
  3110. ties", Proceedings of the 24th international conference on World
  3111. Wide Web, pp. 1166-1176, ACM, 2015.
  3112. .. [2] W.R. Knight, "A Computer Method for Calculating Kendall's Tau with
  3113. Ungrouped Data", Journal of the American Statistical Association,
  3114. Vol. 61, No. 314, Part 1, pp. 436-439, 1966.
  3115. .. [3] Grace S. Shieh. "A weighted Kendall's tau statistic", Statistics &
  3116. Probability Letters, Vol. 39, No. 1, pp. 17-24, 1998.
  3117. Examples
  3118. --------
  3119. >>> from scipy import stats
  3120. >>> x = [12, 2, 1, 12, 2]
  3121. >>> y = [1, 4, 7, 1, 0]
  3122. >>> tau, p_value = stats.weightedtau(x, y)
  3123. >>> tau
  3124. -0.56694968153682723
  3125. >>> p_value
  3126. nan
  3127. >>> tau, p_value = stats.weightedtau(x, y, additive=False)
  3128. >>> tau
  3129. -0.62205716951801038
  3130. NaNs are considered the smallest possible score:
  3131. >>> x = [12, 2, 1, 12, 2]
  3132. >>> y = [1, 4, 7, 1, np.nan]
  3133. >>> tau, _ = stats.weightedtau(x, y)
  3134. >>> tau
  3135. -0.56694968153682723
  3136. This is exactly Kendall's tau:
  3137. >>> x = [12, 2, 1, 12, 2]
  3138. >>> y = [1, 4, 7, 1, 0]
  3139. >>> tau, _ = stats.weightedtau(x, y, weigher=lambda x: 1)
  3140. >>> tau
  3141. -0.47140452079103173
  3142. >>> x = [12, 2, 1, 12, 2]
  3143. >>> y = [1, 4, 7, 1, 0]
  3144. >>> stats.weightedtau(x, y, rank=None)
  3145. WeightedTauResult(correlation=-0.4157652301037516, pvalue=nan)
  3146. >>> stats.weightedtau(y, x, rank=None)
  3147. WeightedTauResult(correlation=-0.7181341329699028, pvalue=nan)
  3148. """
  3149. x = np.asarray(x).ravel()
  3150. y = np.asarray(y).ravel()
  3151. if x.size != y.size:
  3152. raise ValueError("All inputs to `weightedtau` must be of the same size, "
  3153. "found x-size %s and y-size %s" % (x.size, y.size))
  3154. if not x.size:
  3155. return WeightedTauResult(np.nan, np.nan) # Return NaN if arrays are empty
    # If there are NaNs we apply _toint64()
    if np.isnan(np.sum(x)):
        x = _toint64(x)
    if np.isnan(np.sum(y)):
        y = _toint64(y)
    # Reduce unsupported dtypes to ranks
  3162. if x.dtype != y.dtype:
  3163. if x.dtype != np.int64:
  3164. x = _toint64(x)
  3165. if y.dtype != np.int64:
  3166. y = _toint64(y)
  3167. else:
  3168. if x.dtype not in (np.int32, np.int64, np.float32, np.float64):
  3169. x = _toint64(x)
  3170. y = _toint64(y)
  3171. if rank is True:
  3172. return WeightedTauResult((
  3173. _weightedrankedtau(x, y, None, weigher, additive) +
  3174. _weightedrankedtau(y, x, None, weigher, additive)
  3175. ) / 2, np.nan)
  3176. if rank is False:
  3177. rank = np.arange(x.size, dtype=np.intp)
  3178. elif rank is not None:
  3179. rank = np.asarray(rank).ravel()
  3180. if rank.size != x.size:
  3181. raise ValueError("All inputs to `weightedtau` must be of the same size, "
  3182. "found x-size %s and rank-size %s" % (x.size, rank.size))
  3183. return WeightedTauResult(_weightedrankedtau(x, y, rank, weigher, additive), np.nan)
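# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_weightedtau_example`, confirming the docstring's description of the
# default rank=True behavior: the result is the average of the two
# lexicographic-rank runs, (x, y) and (y, x).
def _weightedtau_example():
    x = [12, 2, 1, 12, 2]
    y = [1, 4, 7, 1, 0]
    t_xy = weightedtau(x, y, rank=None).correlation
    t_yx = weightedtau(y, x, rank=None).correlation
    t_default = weightedtau(x, y).correlation
    assert np.isclose(t_default, (t_xy + t_yx) / 2)
    return t_default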
  3184. #####################################
  3185. # INFERENTIAL STATISTICS #
  3186. #####################################
  3187. Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
  3188. def ttest_1samp(a, popmean, axis=0, nan_policy='propagate'):
  3189. """
  3190. Calculate the T-test for the mean of ONE group of scores.
  3191. This is a two-sided test for the null hypothesis that the expected value
  3192. (mean) of a sample of independent observations `a` is equal to the given
  3193. population mean, `popmean`.
  3194. Parameters
  3195. ----------
  3196. a : array_like
  3197. sample observation
  3198. popmean : float or array_like
  3199. expected value in null hypothesis. If array_like, then it must have the
  3200. same shape as `a` excluding the axis dimension
  3201. axis : int or None, optional
  3202. Axis along which to compute test. If None, compute over the whole
  3203. array `a`.
  3204. nan_policy : {'propagate', 'raise', 'omit'}, optional
  3205. Defines how to handle when input contains nan. 'propagate' returns nan,
  3206. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  3207. values. Default is 'propagate'.
  3208. Returns
  3209. -------
  3210. statistic : float or array
  3211. t-statistic
  3212. pvalue : float or array
  3213. two-tailed p-value
  3214. Examples
  3215. --------
  3216. >>> from scipy import stats
  3217. >>> np.random.seed(7654567) # fix seed to get the same result
  3218. >>> rvs = stats.norm.rvs(loc=5, scale=10, size=(50,2))
  3219. Test if mean of random sample is equal to true mean, and different mean.
  3220. We reject the null hypothesis in the second case and don't reject it in
  3221. the first case.
  3222. >>> stats.ttest_1samp(rvs,5.0)
  3223. (array([-0.68014479, -0.04323899]), array([ 0.49961383, 0.96568674]))
  3224. >>> stats.ttest_1samp(rvs,0.0)
  3225. (array([ 2.77025808, 4.11038784]), array([ 0.00789095, 0.00014999]))
  3226. Examples using axis and non-scalar dimension for population mean.
  3227. >>> stats.ttest_1samp(rvs,[5.0,0.0])
  3228. (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04]))
  3229. >>> stats.ttest_1samp(rvs.T,[5.0,0.0],axis=1)
  3230. (array([-0.68014479, 4.11038784]), array([ 4.99613833e-01, 1.49986458e-04]))
  3231. >>> stats.ttest_1samp(rvs,[[5.0],[0.0]])
  3232. (array([[-0.68014479, -0.04323899],
  3233. [ 2.77025808, 4.11038784]]), array([[ 4.99613833e-01, 9.65686743e-01],
  3234. [ 7.89094663e-03, 1.49986458e-04]]))
  3235. """
  3236. a, axis = _chk_asarray(a, axis)
  3237. contains_nan, nan_policy = _contains_nan(a, nan_policy)
  3238. if contains_nan and nan_policy == 'omit':
  3239. a = ma.masked_invalid(a)
  3240. return mstats_basic.ttest_1samp(a, popmean, axis)
  3241. n = a.shape[axis]
  3242. df = n - 1
  3243. d = np.mean(a, axis) - popmean
  3244. v = np.var(a, axis, ddof=1)
  3245. denom = np.sqrt(v / n)
  3246. with np.errstate(divide='ignore', invalid='ignore'):
  3247. t = np.divide(d, denom)
  3248. t, prob = _ttest_finish(df, t)
  3249. return Ttest_1sampResult(t, prob)
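# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_ttest_1samp_example`, recomputing the one-sample statistic by hand on
# made-up data: t = (mean(a) - popmean) / (s / sqrt(n)) with n - 1 degrees of
# freedom and a two-sided p-value.
def _ttest_1samp_example():
    a = np.array([5.1, 4.9, 6.2, 5.7, 5.5, 4.8])
    popmean = 5.0
    t, p = ttest_1samp(a, popmean)
    n = len(a)
    t_manual = (a.mean() - popmean) / (a.std(ddof=1) / np.sqrt(n))
    p_manual = 2 * distributions.t.sf(abs(t_manual), n - 1)
    assert np.isclose(t, t_manual) and np.isclose(p, p_manual)
    return t, p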
  3250. def _ttest_finish(df, t):
  3251. """Common code between all 3 t-test functions."""
  3252. prob = distributions.t.sf(np.abs(t), df) * 2 # use np.abs to get upper tail
  3253. if t.ndim == 0:
  3254. t = t[()]
  3255. return t, prob
  3256. def _ttest_ind_from_stats(mean1, mean2, denom, df):
  3257. d = mean1 - mean2
  3258. with np.errstate(divide='ignore', invalid='ignore'):
  3259. t = np.divide(d, denom)
  3260. t, prob = _ttest_finish(df, t)
  3261. return (t, prob)
  3262. def _unequal_var_ttest_denom(v1, n1, v2, n2):
  3263. vn1 = v1 / n1
  3264. vn2 = v2 / n2
  3265. with np.errstate(divide='ignore', invalid='ignore'):
  3266. df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
  3267. # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
  3268. # Hence it doesn't matter what df is as long as it's not NaN.
  3269. df = np.where(np.isnan(df), 1, df)
  3270. denom = np.sqrt(vn1 + vn2)
  3271. return df, denom
  3272. def _equal_var_ttest_denom(v1, n1, v2, n2):
  3273. df = n1 + n2 - 2.0
  3274. svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
  3275. denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
  3276. return df, denom
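# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_ttest_denom_example`, contrasting the pooled (equal-variance) and
# Welch-Satterthwaite degrees of freedom produced by the two helpers above for
# one pair of made-up summary statistics.
def _ttest_denom_example():
    v1, n1 = 87.5, 13
    v2, n2 = 39.0, 11
    df_pooled, denom_pooled = _equal_var_ttest_denom(v1, n1, v2, n2)
    df_welch, denom_welch = _unequal_var_ttest_denom(v1, n1, v2, n2)
    assert df_pooled == n1 + n2 - 2
    assert df_welch <= df_pooled   # Welch df never exceeds the pooled df
    return (df_pooled, denom_pooled), (df_welch, denom_welch)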
  3277. Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
  3278. def ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2,
  3279. equal_var=True):
  3280. """
  3281. T-test for means of two independent samples from descriptive statistics.
  3282. This is a two-sided test for the null hypothesis that two independent
  3283. samples have identical average (expected) values.
  3284. Parameters
  3285. ----------
  3286. mean1 : array_like
  3287. The mean(s) of sample 1.
  3288. std1 : array_like
  3289. The standard deviation(s) of sample 1.
  3290. nobs1 : array_like
  3291. The number(s) of observations of sample 1.
  3292. mean2 : array_like
  3293. The mean(s) of sample 2
  3294. std2 : array_like
  3295. The standard deviations(s) of sample 2.
  3296. nobs2 : array_like
  3297. The number(s) of observations of sample 2.
  3298. equal_var : bool, optional
  3299. If True (default), perform a standard independent 2 sample test
  3300. that assumes equal population variances [1]_.
  3301. If False, perform Welch's t-test, which does not assume equal
  3302. population variance [2]_.
  3303. Returns
  3304. -------
  3305. statistic : float or array
  3306. The calculated t-statistics
  3307. pvalue : float or array
  3308. The two-tailed p-value.
  3309. See Also
  3310. --------
  3311. scipy.stats.ttest_ind
  3312. Notes
  3313. -----
  3314. .. versionadded:: 0.16.0
  3315. References
  3316. ----------
  3317. .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
  3318. .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
  3319. Examples
  3320. --------
Suppose we have the summary data for two samples, as follows::

                     Sample   Sample
               Size   Mean   Variance
    Sample 1    13    15.0     87.5
    Sample 2    11    12.0     39.0

  3326. Apply the t-test to this data (with the assumption that the population
  3327. variances are equal):
  3328. >>> from scipy.stats import ttest_ind_from_stats
  3329. >>> ttest_ind_from_stats(mean1=15.0, std1=np.sqrt(87.5), nobs1=13,
  3330. ... mean2=12.0, std2=np.sqrt(39.0), nobs2=11)
  3331. Ttest_indResult(statistic=0.9051358093310269, pvalue=0.3751996797581487)
  3332. For comparison, here is the data from which those summary statistics
  3333. were taken. With this data, we can compute the same result using
  3334. `scipy.stats.ttest_ind`:
  3335. >>> a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26])
  3336. >>> b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21])
  3337. >>> from scipy.stats import ttest_ind
  3338. >>> ttest_ind(a, b)
  3339. Ttest_indResult(statistic=0.905135809331027, pvalue=0.3751996797581486)
  3340. """
  3341. if equal_var:
  3342. df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2)
  3343. else:
  3344. df, denom = _unequal_var_ttest_denom(std1**2, nobs1,
  3345. std2**2, nobs2)
  3346. res = _ttest_ind_from_stats(mean1, mean2, denom, df)
  3347. return Ttest_indResult(*res)
  3348. def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate'):
  3349. """
  3350. Calculate the T-test for the means of *two independent* samples of scores.
  3351. This is a two-sided test for the null hypothesis that 2 independent samples
  3352. have identical average (expected) values. This test assumes that the
  3353. populations have identical variances by default.
  3354. Parameters
  3355. ----------
  3356. a, b : array_like
  3357. The arrays must have the same shape, except in the dimension
  3358. corresponding to `axis` (the first, by default).
  3359. axis : int or None, optional
  3360. Axis along which to compute test. If None, compute over the whole
  3361. arrays, `a`, and `b`.
  3362. equal_var : bool, optional
  3363. If True (default), perform a standard independent 2 sample test
  3364. that assumes equal population variances [1]_.
  3365. If False, perform Welch's t-test, which does not assume equal
  3366. population variance [2]_.
  3367. .. versionadded:: 0.11.0
  3368. nan_policy : {'propagate', 'raise', 'omit'}, optional
  3369. Defines how to handle when input contains nan. 'propagate' returns nan,
  3370. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  3371. values. Default is 'propagate'.
  3372. Returns
  3373. -------
  3374. statistic : float or array
  3375. The calculated t-statistic.
  3376. pvalue : float or array
  3377. The two-tailed p-value.
  3378. Notes
  3379. -----
  3380. We can use this test, if we observe two independent samples from
  3381. the same or different population, e.g. exam scores of boys and
  3382. girls or of two ethnic groups. The test measures whether the
  3383. average (expected) value differs significantly across samples. If
  3384. we observe a large p-value, for example larger than 0.05 or 0.1,
  3385. then we cannot reject the null hypothesis of identical average scores.
  3386. If the p-value is smaller than the threshold, e.g. 1%, 5% or 10%,
  3387. then we reject the null hypothesis of equal averages.
  3388. References
  3389. ----------
  3390. .. [1] https://en.wikipedia.org/wiki/T-test#Independent_two-sample_t-test
  3391. .. [2] https://en.wikipedia.org/wiki/Welch%27s_t-test
  3392. Examples
  3393. --------
  3394. >>> from scipy import stats
  3395. >>> np.random.seed(12345678)
  3396. Test with sample with identical means:
  3397. >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500)
  3398. >>> rvs2 = stats.norm.rvs(loc=5,scale=10,size=500)
  3399. >>> stats.ttest_ind(rvs1,rvs2)
  3400. (0.26833823296239279, 0.78849443369564776)
  3401. >>> stats.ttest_ind(rvs1,rvs2, equal_var = False)
  3402. (0.26833823296239279, 0.78849452749500748)
  3403. `ttest_ind` underestimates p for unequal variances:
  3404. >>> rvs3 = stats.norm.rvs(loc=5, scale=20, size=500)
  3405. >>> stats.ttest_ind(rvs1, rvs3)
  3406. (-0.46580283298287162, 0.64145827413436174)
  3407. >>> stats.ttest_ind(rvs1, rvs3, equal_var = False)
  3408. (-0.46580283298287162, 0.64149646246569292)
  3409. When n1 != n2, the equal variance t-statistic is no longer equal to the
  3410. unequal variance t-statistic:
  3411. >>> rvs4 = stats.norm.rvs(loc=5, scale=20, size=100)
  3412. >>> stats.ttest_ind(rvs1, rvs4)
  3413. (-0.99882539442782481, 0.3182832709103896)
  3414. >>> stats.ttest_ind(rvs1, rvs4, equal_var = False)
  3415. (-0.69712570584654099, 0.48716927725402048)
  3416. T-test with different means, variance, and n:
  3417. >>> rvs5 = stats.norm.rvs(loc=8, scale=20, size=100)
  3418. >>> stats.ttest_ind(rvs1, rvs5)
  3419. (-1.4679669854490653, 0.14263895620529152)
  3420. >>> stats.ttest_ind(rvs1, rvs5, equal_var = False)
  3421. (-0.94365973617132992, 0.34744170334794122)
  3422. """
  3423. a, b, axis = _chk2_asarray(a, b, axis)
  3424. # check both a and b
  3425. cna, npa = _contains_nan(a, nan_policy)
  3426. cnb, npb = _contains_nan(b, nan_policy)
  3427. contains_nan = cna or cnb
  3428. if npa == 'omit' or npb == 'omit':
  3429. nan_policy = 'omit'
  3430. if contains_nan and nan_policy == 'omit':
  3431. a = ma.masked_invalid(a)
  3432. b = ma.masked_invalid(b)
  3433. return mstats_basic.ttest_ind(a, b, axis, equal_var)
  3434. if a.size == 0 or b.size == 0:
  3435. return Ttest_indResult(np.nan, np.nan)
  3436. v1 = np.var(a, axis, ddof=1)
  3437. v2 = np.var(b, axis, ddof=1)
  3438. n1 = a.shape[axis]
  3439. n2 = b.shape[axis]
  3440. if equal_var:
  3441. df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
  3442. else:
  3443. df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)
  3444. res = _ttest_ind_from_stats(np.mean(a, axis), np.mean(b, axis), denom, df)
  3445. return Ttest_indResult(*res)
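# Illustrative sketch (not part of the original module): a hypothetical helper,
# `_ttest_ind_example`, checking that `ttest_ind` on raw data matches
# `ttest_ind_from_stats` fed the samples' means, ddof=1 standard deviations
# and sizes (the data are those from the `ttest_ind_from_stats` docstring).
def _ttest_ind_example():
    a = np.array([1, 3, 4, 6, 11, 13, 15, 19, 22, 24, 25, 26, 26], dtype=float)
    b = np.array([2, 4, 6, 9, 11, 13, 14, 15, 18, 19, 21], dtype=float)
    res_data = ttest_ind(a, b)
    res_stats = ttest_ind_from_stats(a.mean(), a.std(ddof=1), len(a),
                                     b.mean(), b.std(ddof=1), len(b))
    assert np.isclose(res_data.statistic, res_stats.statistic)
    assert np.isclose(res_data.pvalue, res_stats.pvalue)
    return res_data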
  3446. Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
  3447. def ttest_rel(a, b, axis=0, nan_policy='propagate'):
  3448. """
  3449. Calculate the T-test on TWO RELATED samples of scores, a and b.
  3450. This is a two-sided test for the null hypothesis that 2 related or
  3451. repeated samples have identical average (expected) values.
  3452. Parameters
  3453. ----------
  3454. a, b : array_like
  3455. The arrays must have the same shape.
  3456. axis : int or None, optional
  3457. Axis along which to compute test. If None, compute over the whole
  3458. arrays, `a`, and `b`.
  3459. nan_policy : {'propagate', 'raise', 'omit'}, optional
  3460. Defines how to handle when input contains nan. 'propagate' returns nan,
  3461. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  3462. values. Default is 'propagate'.
  3463. Returns
  3464. -------
  3465. statistic : float or array
  3466. t-statistic
  3467. pvalue : float or array
  3468. two-tailed p-value
  3469. Notes
  3470. -----
3471. Examples for the use are scores of the same set of students in
  3472. different exams, or repeated sampling from the same units. The
  3473. test measures whether the average score differs significantly
  3474. across samples (e.g. exams). If we observe a large p-value, for
  3475. example greater than 0.05 or 0.1 then we cannot reject the null
  3476. hypothesis of identical average scores. If the p-value is smaller
  3477. than the threshold, e.g. 1%, 5% or 10%, then we reject the null
  3478. hypothesis of equal averages. Small p-values are associated with
  3479. large t-statistics.
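A sketch of the statistic computed below, in terms of the paired
differences ``d = a - b`` over ``n`` pairs::

    t = mean(d) / sqrt(var(d, ddof=1) / n)

which is referred to a t distribution with ``n - 1`` degrees of freedom.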
  3480. References
  3481. ----------
3482. .. [1] https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples
  3483. Examples
  3484. --------
  3485. >>> from scipy import stats
  3486. >>> np.random.seed(12345678) # fix random seed to get same numbers
  3487. >>> rvs1 = stats.norm.rvs(loc=5,scale=10,size=500)
  3488. >>> rvs2 = (stats.norm.rvs(loc=5,scale=10,size=500) +
  3489. ... stats.norm.rvs(scale=0.2,size=500))
  3490. >>> stats.ttest_rel(rvs1,rvs2)
  3491. (0.24101764965300962, 0.80964043445811562)
  3492. >>> rvs3 = (stats.norm.rvs(loc=8,scale=10,size=500) +
  3493. ... stats.norm.rvs(scale=0.2,size=500))
  3494. >>> stats.ttest_rel(rvs1,rvs3)
  3495. (-3.9995108708727933, 7.3082402191726459e-005)
  3496. """
  3497. a, b, axis = _chk2_asarray(a, b, axis)
  3498. cna, npa = _contains_nan(a, nan_policy)
  3499. cnb, npb = _contains_nan(b, nan_policy)
  3500. contains_nan = cna or cnb
  3501. if npa == 'omit' or npb == 'omit':
  3502. nan_policy = 'omit'
  3503. if contains_nan and nan_policy == 'omit':
  3504. a = ma.masked_invalid(a)
  3505. b = ma.masked_invalid(b)
  3506. m = ma.mask_or(ma.getmask(a), ma.getmask(b))
  3507. aa = ma.array(a, mask=m, copy=True)
  3508. bb = ma.array(b, mask=m, copy=True)
  3509. return mstats_basic.ttest_rel(aa, bb, axis)
  3510. if a.shape[axis] != b.shape[axis]:
  3511. raise ValueError('unequal length arrays')
  3512. if a.size == 0 or b.size == 0:
  3513. return np.nan, np.nan
  3514. n = a.shape[axis]
  3515. df = n - 1
  3516. d = (a - b).astype(np.float64)
  3517. v = np.var(d, axis, ddof=1)
  3518. dm = np.mean(d, axis)
  3519. denom = np.sqrt(v / n)
  3520. with np.errstate(divide='ignore', invalid='ignore'):
  3521. t = np.divide(dm, denom)
  3522. t, prob = _ttest_finish(df, t)
  3523. return Ttest_relResult(t, prob)
  3524. KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue'))
  3525. def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'):
  3526. """
  3527. Perform the Kolmogorov-Smirnov test for goodness of fit.
  3528. This performs a test of the distribution G(x) of an observed
  3529. random variable against a given distribution F(x). Under the null
  3530. hypothesis the two distributions are identical, G(x)=F(x). The
  3531. alternative hypothesis can be either 'two-sided' (default), 'less'
  3532. or 'greater'. The KS test is only valid for continuous distributions.
  3533. Parameters
  3534. ----------
  3535. rvs : str, array or callable
  3536. If a string, it should be the name of a distribution in `scipy.stats`.
  3537. If an array, it should be a 1-D array of observations of random
  3538. variables.
  3539. If a callable, it should be a function to generate random variables;
  3540. it is required to have a keyword argument `size`.
  3541. cdf : str or callable
  3542. If a string, it should be the name of a distribution in `scipy.stats`.
  3543. If `rvs` is a string then `cdf` can be False or the same as `rvs`.
  3544. If a callable, that callable is used to calculate the cdf.
  3545. args : tuple, sequence, optional
  3546. Distribution parameters, used if `rvs` or `cdf` are strings.
  3547. N : int, optional
  3548. Sample size if `rvs` is string or callable. Default is 20.
  3549. alternative : {'two-sided', 'less','greater'}, optional
  3550. Defines the alternative hypothesis (see explanation above).
  3551. Default is 'two-sided'.
  3552. mode : 'approx' (default) or 'asymp', optional
  3553. Defines the distribution used for calculating the p-value.
  3554. - 'approx' : use approximation to exact distribution of test statistic
  3555. - 'asymp' : use asymptotic distribution of test statistic
  3556. Returns
  3557. -------
  3558. statistic : float
  3559. KS test statistic, either D, D+ or D-.
  3560. pvalue : float
  3561. One-tailed or two-tailed p-value.
  3562. Notes
  3563. -----
  3564. In the one-sided test, the alternative is that the empirical
  3565. cumulative distribution function of the random variable is "less"
  3566. or "greater" than the cumulative distribution function F(x) of the
  3567. hypothesis, ``G(x)<=F(x)``, resp. ``G(x)>=F(x)``.
  3568. Examples
  3569. --------
  3570. >>> from scipy import stats
  3571. >>> x = np.linspace(-15, 15, 9)
  3572. >>> stats.kstest(x, 'norm')
  3573. (0.44435602715924361, 0.038850142705171065)
  3574. >>> np.random.seed(987654321) # set random seed to get the same result
  3575. >>> stats.kstest('norm', False, N=100)
  3576. (0.058352892479417884, 0.88531190944151261)
  3577. The above lines are equivalent to:
  3578. >>> np.random.seed(987654321)
  3579. >>> stats.kstest(stats.norm.rvs(size=100), 'norm')
  3580. (0.058352892479417884, 0.88531190944151261)
  3581. *Test against one-sided alternative hypothesis*
  3582. Shift distribution to larger values, so that ``cdf_dgp(x) < norm.cdf(x)``:
  3583. >>> np.random.seed(987654321)
  3584. >>> x = stats.norm.rvs(loc=0.2, size=100)
  3585. >>> stats.kstest(x,'norm', alternative = 'less')
  3586. (0.12464329735846891, 0.040989164077641749)
  3587. Reject equal distribution against alternative hypothesis: less
  3588. >>> stats.kstest(x,'norm', alternative = 'greater')
  3589. (0.0072115233216311081, 0.98531158590396395)
  3590. Don't reject equal distribution against alternative hypothesis: greater
  3591. >>> stats.kstest(x,'norm', mode='asymp')
  3592. (0.12464329735846891, 0.08944488871182088)
  3593. *Testing t distributed random variables against normal distribution*
  3594. With 100 degrees of freedom the t distribution looks close to the normal
  3595. distribution, and the K-S test does not reject the hypothesis that the
  3596. sample came from the normal distribution:
  3597. >>> np.random.seed(987654321)
  3598. >>> stats.kstest(stats.t.rvs(100,size=100),'norm')
  3599. (0.072018929165471257, 0.67630062862479168)
  3600. With 3 degrees of freedom the t distribution looks sufficiently different
3601. from the normal distribution that we can reject the hypothesis that the
  3602. sample came from the normal distribution at the 10% level:
  3603. >>> np.random.seed(987654321)
  3604. >>> stats.kstest(stats.t.rvs(3,size=100),'norm')
  3605. (0.131016895759829, 0.058826222555312224)
  3606. """
  3607. if isinstance(rvs, string_types):
  3608. if (not cdf) or (cdf == rvs):
  3609. cdf = getattr(distributions, rvs).cdf
  3610. rvs = getattr(distributions, rvs).rvs
  3611. else:
  3612. raise AttributeError("if rvs is string, cdf has to be the "
  3613. "same distribution")
  3614. if isinstance(cdf, string_types):
  3615. cdf = getattr(distributions, cdf).cdf
  3616. if callable(rvs):
  3617. kwds = {'size': N}
  3618. vals = np.sort(rvs(*args, **kwds))
  3619. else:
  3620. vals = np.sort(rvs)
  3621. N = len(vals)
  3622. cdfvals = cdf(vals, *args)
  3623. # to not break compatibility with existing code
  3624. if alternative == 'two_sided':
  3625. alternative = 'two-sided'
  3626. if alternative in ['two-sided', 'greater']:
  3627. Dplus = (np.arange(1.0, N + 1)/N - cdfvals).max()
  3628. if alternative == 'greater':
  3629. return KstestResult(Dplus, distributions.ksone.sf(Dplus, N))
  3630. if alternative in ['two-sided', 'less']:
  3631. Dmin = (cdfvals - np.arange(0.0, N)/N).max()
  3632. if alternative == 'less':
  3633. return KstestResult(Dmin, distributions.ksone.sf(Dmin, N))
  3634. if alternative == 'two-sided':
  3635. D = np.max([Dplus, Dmin])
  3636. if mode == 'asymp':
  3637. return KstestResult(D, distributions.kstwobign.sf(D * np.sqrt(N)))
  3638. if mode == 'approx':
  3639. pval_two = distributions.kstwobign.sf(D * np.sqrt(N))
  3640. if N > 2666 or pval_two > 0.80 - N*0.3/1000:
  3641. return KstestResult(D, pval_two)
  3642. else:
  3643. return KstestResult(D, 2 * distributions.ksone.sf(D, N))
  3644. # Map from names to lambda_ values used in power_divergence().
  3645. _power_div_lambda_names = {
  3646. "pearson": 1,
  3647. "log-likelihood": 0,
  3648. "freeman-tukey": -0.5,
  3649. "mod-log-likelihood": -1,
  3650. "neyman": -2,
  3651. "cressie-read": 2/3,
  3652. }
  3653. def _count(a, axis=None):
  3654. """
  3655. Count the number of non-masked elements of an array.
  3656. This function behaves like np.ma.count(), but is much faster
  3657. for ndarrays.
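For example (a sketch): for a plain ndarray ``a`` of shape (2, 3),
``_count(a, axis=1)`` simply returns ``a.shape[1]``, i.e. 3, whereas for
``np.ma.array([1, 2, 3], mask=[0, 1, 0])`` the masked element is excluded
and the count is 2.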
  3658. """
  3659. if hasattr(a, 'count'):
  3660. num = a.count(axis=axis)
  3661. if isinstance(num, np.ndarray) and num.ndim == 0:
  3662. # In some cases, the `count` method returns a scalar array (e.g.
  3663. # np.array(3)), but we want a plain integer.
  3664. num = int(num)
  3665. else:
  3666. if axis is None:
  3667. num = a.size
  3668. else:
  3669. num = a.shape[axis]
  3670. return num
  3671. Power_divergenceResult = namedtuple('Power_divergenceResult',
  3672. ('statistic', 'pvalue'))
  3673. def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
  3674. """
  3675. Cressie-Read power divergence statistic and goodness of fit test.
  3676. This function tests the null hypothesis that the categorical data
  3677. has the given frequencies, using the Cressie-Read power divergence
  3678. statistic.
  3679. Parameters
  3680. ----------
  3681. f_obs : array_like
  3682. Observed frequencies in each category.
  3683. f_exp : array_like, optional
  3684. Expected frequencies in each category. By default the categories are
  3685. assumed to be equally likely.
  3686. ddof : int, optional
  3687. "Delta degrees of freedom": adjustment to the degrees of freedom
  3688. for the p-value. The p-value is computed using a chi-squared
  3689. distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
  3690. is the number of observed frequencies. The default value of `ddof`
  3691. is 0.
  3692. axis : int or None, optional
  3693. The axis of the broadcast result of `f_obs` and `f_exp` along which to
  3694. apply the test. If axis is None, all values in `f_obs` are treated
  3695. as a single data set. Default is 0.
  3696. lambda_ : float or str, optional
  3697. `lambda_` gives the power in the Cressie-Read power divergence
  3698. statistic. The default is 1. For convenience, `lambda_` may be
  3699. assigned one of the following strings, in which case the
  3700. corresponding numerical value is used::
  3701. String Value Description
  3702. "pearson" 1 Pearson's chi-squared statistic.
  3703. In this case, the function is
  3704. equivalent to `stats.chisquare`.
  3705. "log-likelihood" 0 Log-likelihood ratio. Also known as
  3706. the G-test [3]_.
  3707. "freeman-tukey" -1/2 Freeman-Tukey statistic.
  3708. "mod-log-likelihood" -1 Modified log-likelihood ratio.
  3709. "neyman" -2 Neyman's statistic.
  3710. "cressie-read" 2/3 The power recommended in [5]_.
  3711. Returns
  3712. -------
  3713. statistic : float or ndarray
  3714. The Cressie-Read power divergence test statistic. The value is
3715. a float if `axis` is None or if `f_obs` and `f_exp` are 1-D.
  3716. pvalue : float or ndarray
  3717. The p-value of the test. The value is a float if `ddof` and the
  3718. return value `stat` are scalars.
  3719. See Also
  3720. --------
  3721. chisquare
  3722. Notes
  3723. -----
  3724. This test is invalid when the observed or expected frequencies in each
  3725. category are too small. A typical rule is that all of the observed
  3726. and expected frequencies should be at least 5.
  3727. When `lambda_` is less than zero, the formula for the statistic involves
  3728. dividing by `f_obs`, so a warning or error may be generated if any value
  3729. in `f_obs` is 0.
  3730. Similarly, a warning or error may be generated if any value in `f_exp` is
  3731. zero when `lambda_` >= 0.
  3732. The default degrees of freedom, k-1, are for the case when no parameters
  3733. of the distribution are estimated. If p parameters are estimated by
  3734. efficient maximum likelihood then the correct degrees of freedom are
  3735. k-1-p. If the parameters are estimated in a different way, then the
  3736. dof can be between k-1-p and k-1. However, it is also possible that
  3737. the asymptotic distribution is not a chisquare, in which case this
  3738. test is not appropriate.
  3739. This function handles masked arrays. If an element of `f_obs` or `f_exp`
  3740. is masked, then data at that position is ignored, and does not count
  3741. towards the size of the data set.
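For a generic `lambda_`, the statistic implemented below is (a sketch; the
``lambda_ = 0`` and ``lambda_ = -1`` cases are computed separately as the
corresponding limits)::

    2 / (lambda_ * (lambda_ + 1)) * sum(f_obs * ((f_obs / f_exp)**lambda_ - 1))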
  3742. .. versionadded:: 0.13.0
  3743. References
  3744. ----------
  3745. .. [1] Lowry, Richard. "Concepts and Applications of Inferential
  3746. Statistics". Chapter 8.
  3747. https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html
  3748. .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
  3749. .. [3] "G-test", https://en.wikipedia.org/wiki/G-test
  3750. .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and
  3751. practice of statistics in biological research", New York: Freeman
  3752. (1981)
  3753. .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
  3754. Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
  3755. pp. 440-464.
  3756. Examples
  3757. --------
  3758. (See `chisquare` for more examples.)
  3759. When just `f_obs` is given, it is assumed that the expected frequencies
  3760. are uniform and given by the mean of the observed frequencies. Here we
  3761. perform a G-test (i.e. use the log-likelihood ratio statistic):
  3762. >>> from scipy.stats import power_divergence
  3763. >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood')
  3764. (2.006573162632538, 0.84823476779463769)
  3765. The expected frequencies can be given with the `f_exp` argument:
  3766. >>> power_divergence([16, 18, 16, 14, 12, 12],
  3767. ... f_exp=[16, 16, 16, 16, 16, 8],
  3768. ... lambda_='log-likelihood')
  3769. (3.3281031458963746, 0.6495419288047497)
  3770. When `f_obs` is 2-D, by default the test is applied to each column.
  3771. >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
  3772. >>> obs.shape
  3773. (6, 2)
  3774. >>> power_divergence(obs, lambda_="log-likelihood")
  3775. (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225]))
  3776. By setting ``axis=None``, the test is applied to all data in the array,
  3777. which is equivalent to applying the test to the flattened array.
  3778. >>> power_divergence(obs, axis=None)
  3779. (23.31034482758621, 0.015975692534127565)
  3780. >>> power_divergence(obs.ravel())
  3781. (23.31034482758621, 0.015975692534127565)
  3782. `ddof` is the change to make to the default degrees of freedom.
  3783. >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1)
  3784. (2.0, 0.73575888234288467)
  3785. The calculation of the p-values is done by broadcasting the
  3786. test statistic with `ddof`.
  3787. >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
  3788. (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
  3789. `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
  3790. shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
  3791. `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
  3792. statistics, we must use ``axis=1``:
  3793. >>> power_divergence([16, 18, 16, 14, 12, 12],
  3794. ... f_exp=[[16, 16, 16, 16, 16, 8],
  3795. ... [8, 20, 20, 16, 12, 12]],
  3796. ... axis=1)
  3797. (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
  3798. """
  3799. # Convert the input argument `lambda_` to a numerical value.
  3800. if isinstance(lambda_, string_types):
  3801. if lambda_ not in _power_div_lambda_names:
  3802. names = repr(list(_power_div_lambda_names.keys()))[1:-1]
  3803. raise ValueError("invalid string for lambda_: {0!r}. Valid strings "
  3804. "are {1}".format(lambda_, names))
  3805. lambda_ = _power_div_lambda_names[lambda_]
  3806. elif lambda_ is None:
  3807. lambda_ = 1
  3808. f_obs = np.asanyarray(f_obs)
  3809. if f_exp is not None:
  3810. f_exp = np.atleast_1d(np.asanyarray(f_exp))
  3811. else:
  3812. # Compute the equivalent of
  3813. # f_exp = f_obs.mean(axis=axis, keepdims=True)
  3814. # Older versions of numpy do not have the 'keepdims' argument, so
  3815. # we have to do a little work to achieve the same result.
  3816. # Ignore 'invalid' errors so the edge case of a data set with length 0
  3817. # is handled without spurious warnings.
  3818. with np.errstate(invalid='ignore'):
  3819. f_exp = np.atleast_1d(f_obs.mean(axis=axis))
  3820. if axis is not None:
  3821. reduced_shape = list(f_obs.shape)
  3822. reduced_shape[axis] = 1
  3823. f_exp.shape = reduced_shape
  3824. # `terms` is the array of terms that are summed along `axis` to create
  3825. # the test statistic. We use some specialized code for a few special
  3826. # cases of lambda_.
  3827. if lambda_ == 1:
  3828. # Pearson's chi-squared statistic
  3829. terms = (f_obs - f_exp)**2 / f_exp
  3830. elif lambda_ == 0:
  3831. # Log-likelihood ratio (i.e. G-test)
  3832. terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
  3833. elif lambda_ == -1:
  3834. # Modified log-likelihood ratio
  3835. terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs)
  3836. else:
  3837. # General Cressie-Read power divergence.
  3838. terms = f_obs * ((f_obs / f_exp)**lambda_ - 1)
  3839. terms /= 0.5 * lambda_ * (lambda_ + 1)
  3840. stat = terms.sum(axis=axis)
  3841. num_obs = _count(terms, axis=axis)
  3842. ddof = asarray(ddof)
  3843. p = distributions.chi2.sf(stat, num_obs - 1 - ddof)
  3844. return Power_divergenceResult(stat, p)
  3845. def chisquare(f_obs, f_exp=None, ddof=0, axis=0):
  3846. """
3847. Calculate a one-way chi-square test.
3848. The chi-square test tests the null hypothesis that the categorical data
  3849. has the given frequencies.
  3850. Parameters
  3851. ----------
  3852. f_obs : array_like
  3853. Observed frequencies in each category.
  3854. f_exp : array_like, optional
  3855. Expected frequencies in each category. By default the categories are
  3856. assumed to be equally likely.
  3857. ddof : int, optional
  3858. "Delta degrees of freedom": adjustment to the degrees of freedom
  3859. for the p-value. The p-value is computed using a chi-squared
  3860. distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
  3861. is the number of observed frequencies. The default value of `ddof`
  3862. is 0.
  3863. axis : int or None, optional
  3864. The axis of the broadcast result of `f_obs` and `f_exp` along which to
  3865. apply the test. If axis is None, all values in `f_obs` are treated
  3866. as a single data set. Default is 0.
  3867. Returns
  3868. -------
  3869. chisq : float or ndarray
  3870. The chi-squared test statistic. The value is a float if `axis` is
  3871. None or `f_obs` and `f_exp` are 1-D.
  3872. p : float or ndarray
  3873. The p-value of the test. The value is a float if `ddof` and the
  3874. return value `chisq` are scalars.
  3875. See Also
  3876. --------
  3877. power_divergence
  3878. mstats.chisquare
  3879. Notes
  3880. -----
  3881. This test is invalid when the observed or expected frequencies in each
  3882. category are too small. A typical rule is that all of the observed
  3883. and expected frequencies should be at least 5.
  3884. The default degrees of freedom, k-1, are for the case when no parameters
  3885. of the distribution are estimated. If p parameters are estimated by
  3886. efficient maximum likelihood then the correct degrees of freedom are
  3887. k-1-p. If the parameters are estimated in a different way, then the
  3888. dof can be between k-1-p and k-1. However, it is also possible that
  3889. the asymptotic distribution is not a chisquare, in which case this
  3890. test is not appropriate.
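The statistic computed here is the ``lambda_='pearson'`` case of
`power_divergence` (a sketch)::

    sum((f_obs - f_exp)**2 / f_exp)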
  3891. References
  3892. ----------
  3893. .. [1] Lowry, Richard. "Concepts and Applications of Inferential
  3894. Statistics". Chapter 8.
  3895. https://web.archive.org/web/20171022032306/http://vassarstats.net:80/textbook/ch8pt1.html
  3896. .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
  3897. Examples
  3898. --------
  3899. When just `f_obs` is given, it is assumed that the expected frequencies
  3900. are uniform and given by the mean of the observed frequencies.
  3901. >>> from scipy.stats import chisquare
  3902. >>> chisquare([16, 18, 16, 14, 12, 12])
  3903. (2.0, 0.84914503608460956)
  3904. With `f_exp` the expected frequencies can be given.
  3905. >>> chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
  3906. (3.5, 0.62338762774958223)
  3907. When `f_obs` is 2-D, by default the test is applied to each column.
  3908. >>> obs = np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
  3909. >>> obs.shape
  3910. (6, 2)
  3911. >>> chisquare(obs)
  3912. (array([ 2. , 6.66666667]), array([ 0.84914504, 0.24663415]))
  3913. By setting ``axis=None``, the test is applied to all data in the array,
  3914. which is equivalent to applying the test to the flattened array.
  3915. >>> chisquare(obs, axis=None)
  3916. (23.31034482758621, 0.015975692534127565)
  3917. >>> chisquare(obs.ravel())
  3918. (23.31034482758621, 0.015975692534127565)
  3919. `ddof` is the change to make to the default degrees of freedom.
  3920. >>> chisquare([16, 18, 16, 14, 12, 12], ddof=1)
  3921. (2.0, 0.73575888234288467)
  3922. The calculation of the p-values is done by broadcasting the
  3923. chi-squared statistic with `ddof`.
  3924. >>> chisquare([16, 18, 16, 14, 12, 12], ddof=[0,1,2])
  3925. (2.0, array([ 0.84914504, 0.73575888, 0.5724067 ]))
  3926. `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has
  3927. shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
  3928. `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared
  3929. statistics, we use ``axis=1``:
  3930. >>> chisquare([16, 18, 16, 14, 12, 12],
  3931. ... f_exp=[[16, 16, 16, 16, 16, 8], [8, 20, 20, 16, 12, 12]],
  3932. ... axis=1)
  3933. (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846]))
  3934. """
  3935. return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis,
  3936. lambda_="pearson")
  3937. Ks_2sampResult = namedtuple('Ks_2sampResult', ('statistic', 'pvalue'))
  3938. def ks_2samp(data1, data2):
  3939. """
  3940. Compute the Kolmogorov-Smirnov statistic on 2 samples.
  3941. This is a two-sided test for the null hypothesis that 2 independent samples
  3942. are drawn from the same continuous distribution.
  3943. Parameters
  3944. ----------
  3945. data1, data2 : sequence of 1-D ndarrays
  3946. two arrays of sample observations assumed to be drawn from a continuous
3947. distribution; sample sizes can be different
  3948. Returns
  3949. -------
  3950. statistic : float
  3951. KS statistic
  3952. pvalue : float
  3953. two-tailed p-value
  3954. Notes
  3955. -----
  3956. This tests whether 2 samples are drawn from the same distribution. Note
  3957. that, like in the case of the one-sample K-S test, the distribution is
  3958. assumed to be continuous.
3959. This is the two-sided test; one-sided tests are not implemented.
  3960. The test uses the two-sided asymptotic Kolmogorov-Smirnov distribution.
  3961. If the K-S statistic is small or the p-value is high, then we cannot
  3962. reject the hypothesis that the distributions of the two samples
  3963. are the same.
  3964. Examples
  3965. --------
  3966. >>> from scipy import stats
  3967. >>> np.random.seed(12345678) #fix random seed to get the same result
  3968. >>> n1 = 200 # size of first sample
  3969. >>> n2 = 300 # size of second sample
  3970. For a different distribution, we can reject the null hypothesis since the
  3971. pvalue is below 1%:
  3972. >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
  3973. >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
  3974. >>> stats.ks_2samp(rvs1, rvs2)
  3975. (0.20833333333333337, 4.6674975515806989e-005)
  3976. For a slightly different distribution, we cannot reject the null hypothesis
3977. at a 10% or lower alpha since the p-value at 0.144 is higher than 10%:
  3978. >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
  3979. >>> stats.ks_2samp(rvs1, rvs3)
  3980. (0.10333333333333333, 0.14498781825751686)
  3981. For an identical distribution, we cannot reject the null hypothesis since
  3982. the p-value is high, 41%:
  3983. >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
  3984. >>> stats.ks_2samp(rvs1, rvs4)
  3985. (0.07999999999999996, 0.41126949729859719)
  3986. """
  3987. data1 = np.sort(data1)
  3988. data2 = np.sort(data2)
  3989. n1 = data1.shape[0]
  3990. n2 = data2.shape[0]
  3991. data_all = np.concatenate([data1, data2])
  3992. cdf1 = np.searchsorted(data1, data_all, side='right') / n1
  3993. cdf2 = np.searchsorted(data2, data_all, side='right') / n2
  3994. d = np.max(np.absolute(cdf1 - cdf2))
  3995. # Note: d absolute not signed distance
  3996. en = np.sqrt(n1 * n2 / (n1 + n2))
  3997. try:
  3998. prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d)
  3999. except Exception:
  4000. warnings.warn('This should not happen! Please open an issue at '
  4001. 'https://github.com/scipy/scipy/issues and provide the code '
  4002. 'you used to trigger this warning.\n')
  4003. prob = 1.0
  4004. return Ks_2sampResult(d, prob)
  4005. def tiecorrect(rankvals):
  4006. """
  4007. Tie correction factor for ties in the Mann-Whitney U and
  4008. Kruskal-Wallis H tests.
  4009. Parameters
  4010. ----------
  4011. rankvals : array_like
  4012. A 1-D sequence of ranks. Typically this will be the array
  4013. returned by `stats.rankdata`.
  4014. Returns
  4015. -------
  4016. factor : float
  4017. Correction factor for U or H.
  4018. See Also
  4019. --------
  4020. rankdata : Assign ranks to the data
  4021. mannwhitneyu : Mann-Whitney rank test
  4022. kruskal : Kruskal-Wallis H test
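Notes
-----
A sketch of the correction factor returned below, where the sum runs over
the groups of tied ranks, ``t`` is the number of ranks in a group and
``n`` is the total number of ranks::

    T = 1 - sum(t**3 - t) / (n**3 - n)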
  4023. References
  4024. ----------
  4025. .. [1] Siegel, S. (1956) Nonparametric Statistics for the Behavioral
  4026. Sciences. New York: McGraw-Hill.
  4027. Examples
  4028. --------
  4029. >>> from scipy.stats import tiecorrect, rankdata
  4030. >>> tiecorrect([1, 2.5, 2.5, 4])
  4031. 0.9
  4032. >>> ranks = rankdata([1, 3, 2, 4, 5, 7, 2, 8, 4])
  4033. >>> ranks
  4034. array([ 1. , 4. , 2.5, 5.5, 7. , 8. , 2.5, 9. , 5.5])
  4035. >>> tiecorrect(ranks)
  4036. 0.9833333333333333
  4037. """
  4038. arr = np.sort(rankvals)
  4039. idx = np.nonzero(np.r_[True, arr[1:] != arr[:-1], True])[0]
  4040. cnt = np.diff(idx).astype(np.float64)
  4041. size = np.float64(arr.size)
  4042. return 1.0 if size < 2 else 1.0 - (cnt**3 - cnt).sum() / (size**3 - size)
  4043. MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
  4044. def mannwhitneyu(x, y, use_continuity=True, alternative=None):
  4045. """
  4046. Compute the Mann-Whitney rank test on samples x and y.
  4047. Parameters
  4048. ----------
  4049. x, y : array_like
  4050. Array of samples, should be one-dimensional.
  4051. use_continuity : bool, optional
  4052. Whether a continuity correction (1/2.) should be taken into
  4053. account. Default is True.
  4054. alternative : None (deprecated), 'less', 'two-sided', or 'greater'
  4055. Whether to get the p-value for the one-sided hypothesis ('less'
  4056. or 'greater') or for the two-sided hypothesis ('two-sided').
  4057. Defaults to None, which results in a p-value half the size of
  4058. the 'two-sided' p-value and a different U statistic. The
  4059. default behavior is not the same as using 'less' or 'greater':
  4060. it only exists for backward compatibility and is deprecated.
  4061. Returns
  4062. -------
  4063. statistic : float
  4064. The Mann-Whitney U statistic, equal to min(U for x, U for y) if
  4065. `alternative` is equal to None (deprecated; exists for backward
  4066. compatibility), and U for y otherwise.
  4067. pvalue : float
  4068. p-value assuming an asymptotic normal distribution. One-sided or
  4069. two-sided, depending on the choice of `alternative`.
  4070. Notes
  4071. -----
4072. Use only when the number of observations in each sample is > 20 and
  4073. you have 2 independent samples of ranks. Mann-Whitney U is
4074. significant if the computed U is less than or equal to the critical
  4075. value of U.
  4076. This test corrects for ties and by default uses a continuity correction.
  4077. References
  4078. ----------
  4079. .. [1] https://en.wikipedia.org/wiki/Mann-Whitney_U_test
  4080. .. [2] H.B. Mann and D.R. Whitney, "On a Test of Whether one of Two Random
  4081. Variables is Stochastically Larger than the Other," The Annals of
  4082. Mathematical Statistics, vol. 18, no. 1, pp. 50-60, 1947.
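Examples
--------
A minimal usage sketch; the data below are arbitrary illustrations, so the
resulting U statistic and p-value are not quoted as reference output:

>>> from scipy import stats
>>> x = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
...      24, 26, 28, 30, 32, 34, 36, 38, 40, 42]
>>> y = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
...      23, 25, 27, 29, 31, 33, 35, 37, 39, 41]
>>> u, p = stats.mannwhitneyu(x, y, alternative='two-sided')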
  4083. """
  4084. if alternative is None:
  4085. warnings.warn("Calling `mannwhitneyu` without specifying "
  4086. "`alternative` is deprecated.", DeprecationWarning)
  4087. x = np.asarray(x)
  4088. y = np.asarray(y)
  4089. n1 = len(x)
  4090. n2 = len(y)
  4091. ranked = rankdata(np.concatenate((x, y)))
  4092. rankx = ranked[0:n1] # get the x-ranks
  4093. u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx, axis=0) # calc U for x
  4094. u2 = n1*n2 - u1 # remainder is U for y
  4095. T = tiecorrect(ranked)
  4096. if T == 0:
  4097. raise ValueError('All numbers are identical in mannwhitneyu')
  4098. sd = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)
  4099. meanrank = n1*n2/2.0 + 0.5 * use_continuity
  4100. if alternative is None or alternative == 'two-sided':
  4101. bigu = max(u1, u2)
  4102. elif alternative == 'less':
  4103. bigu = u1
  4104. elif alternative == 'greater':
  4105. bigu = u2
  4106. else:
  4107. raise ValueError("alternative should be None, 'less', 'greater' "
  4108. "or 'two-sided'")
  4109. z = (bigu - meanrank) / sd
  4110. if alternative is None:
  4111. # This behavior, equal to half the size of the two-sided
  4112. # p-value, is deprecated.
  4113. p = distributions.norm.sf(abs(z))
  4114. elif alternative == 'two-sided':
  4115. p = 2 * distributions.norm.sf(abs(z))
  4116. else:
  4117. p = distributions.norm.sf(z)
  4118. u = u2
  4119. # This behavior is deprecated.
  4120. if alternative is None:
  4121. u = min(u1, u2)
  4122. return MannwhitneyuResult(u, p)
  4123. RanksumsResult = namedtuple('RanksumsResult', ('statistic', 'pvalue'))
  4124. def ranksums(x, y):
  4125. """
  4126. Compute the Wilcoxon rank-sum statistic for two samples.
  4127. The Wilcoxon rank-sum test tests the null hypothesis that two sets
  4128. of measurements are drawn from the same distribution. The alternative
  4129. hypothesis is that values in one sample are more likely to be
  4130. larger than the values in the other sample.
  4131. This test should be used to compare two samples from continuous
  4132. distributions. It does not handle ties between measurements
  4133. in x and y. For tie-handling and an optional continuity correction
  4134. see `scipy.stats.mannwhitneyu`.
  4135. Parameters
  4136. ----------
4137. x, y : array_like
  4138. The data from the two samples
  4139. Returns
  4140. -------
  4141. statistic : float
  4142. The test statistic under the large-sample approximation that the
  4143. rank sum statistic is normally distributed
  4144. pvalue : float
  4145. The two-sided p-value of the test
  4146. References
  4147. ----------
  4148. .. [1] https://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test
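Examples
--------
A minimal usage sketch with simulated samples; the statistic and p-value
are not quoted as reference output:

>>> from scipy import stats
>>> np.random.seed(12345678)
>>> x = stats.norm.rvs(size=100)
>>> y = stats.norm.rvs(size=100, loc=0.5)
>>> statistic, pvalue = stats.ranksums(x, y)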
  4149. """
  4150. x, y = map(np.asarray, (x, y))
  4151. n1 = len(x)
  4152. n2 = len(y)
  4153. alldata = np.concatenate((x, y))
  4154. ranked = rankdata(alldata)
  4155. x = ranked[:n1]
  4156. s = np.sum(x, axis=0)
  4157. expected = n1 * (n1+n2+1) / 2.0
  4158. z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  4159. prob = 2 * distributions.norm.sf(abs(z))
  4160. return RanksumsResult(z, prob)
  4161. KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
  4162. def kruskal(*args, **kwargs):
  4163. """
  4164. Compute the Kruskal-Wallis H-test for independent samples
  4165. The Kruskal-Wallis H-test tests the null hypothesis that the population
4166. medians of all of the groups are equal. It is a non-parametric version of
  4167. ANOVA. The test works on 2 or more independent samples, which may have
  4168. different sizes. Note that rejecting the null hypothesis does not
  4169. indicate which of the groups differs. Post-hoc comparisons between
  4170. groups are required to determine which groups are different.
  4171. Parameters
  4172. ----------
  4173. sample1, sample2, ... : array_like
  4174. Two or more arrays with the sample measurements can be given as
  4175. arguments.
  4176. nan_policy : {'propagate', 'raise', 'omit'}, optional
  4177. Defines how to handle when input contains nan. 'propagate' returns nan,
  4178. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  4179. values. Default is 'propagate'.
  4180. Returns
  4181. -------
  4182. statistic : float
  4183. The Kruskal-Wallis H statistic, corrected for ties
  4184. pvalue : float
  4185. The p-value for the test using the assumption that H has a chi
  4186. square distribution
  4187. See Also
  4188. --------
  4189. f_oneway : 1-way ANOVA
  4190. mannwhitneyu : Mann-Whitney rank test on two samples.
  4191. friedmanchisquare : Friedman test for repeated measurements
  4192. Notes
  4193. -----
  4194. Due to the assumption that H has a chi square distribution, the number
  4195. of samples in each group must not be too small. A typical rule is
  4196. that each sample must have at least 5 measurements.
  4197. References
  4198. ----------
  4199. .. [1] W. H. Kruskal & W. W. Wallis, "Use of Ranks in
  4200. One-Criterion Variance Analysis", Journal of the American Statistical
  4201. Association, Vol. 47, Issue 260, pp. 583-621, 1952.
  4202. .. [2] https://en.wikipedia.org/wiki/Kruskal-Wallis_one-way_analysis_of_variance
  4203. Examples
  4204. --------
  4205. >>> from scipy import stats
  4206. >>> x = [1, 3, 5, 7, 9]
  4207. >>> y = [2, 4, 6, 8, 10]
  4208. >>> stats.kruskal(x, y)
  4209. KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895)
  4210. >>> x = [1, 1, 1]
  4211. >>> y = [2, 2, 2]
  4212. >>> z = [2, 2]
  4213. >>> stats.kruskal(x, y, z)
  4214. KruskalResult(statistic=7.0, pvalue=0.0301973834223185)
  4215. """
  4216. args = list(map(np.asarray, args))
  4217. num_groups = len(args)
  4218. if num_groups < 2:
  4219. raise ValueError("Need at least two groups in stats.kruskal()")
  4220. for arg in args:
  4221. if arg.size == 0:
  4222. return KruskalResult(np.nan, np.nan)
  4223. n = np.asarray(list(map(len, args)))
  4224. if 'nan_policy' in kwargs.keys():
  4225. if kwargs['nan_policy'] not in ('propagate', 'raise', 'omit'):
  4226. raise ValueError("nan_policy must be 'propagate', "
  4227. "'raise' or'omit'")
  4228. else:
  4229. nan_policy = kwargs['nan_policy']
  4230. else:
  4231. nan_policy = 'propagate'
  4232. contains_nan = False
  4233. for arg in args:
  4234. cn = _contains_nan(arg, nan_policy)
  4235. if cn[0]:
  4236. contains_nan = True
  4237. break
  4238. if contains_nan and nan_policy == 'omit':
4239. # Mask invalid values; rebinding the loop variable alone would not update `args`.
4240. args = [ma.masked_invalid(a) for a in args]
  4241. return mstats_basic.kruskal(*args)
  4242. if contains_nan and nan_policy == 'propagate':
  4243. return KruskalResult(np.nan, np.nan)
  4244. alldata = np.concatenate(args)
  4245. ranked = rankdata(alldata)
  4246. ties = tiecorrect(ranked)
  4247. if ties == 0:
  4248. raise ValueError('All numbers are identical in kruskal')
  4249. # Compute sum^2/n for each group and sum
  4250. j = np.insert(np.cumsum(n), 0, 0)
  4251. ssbn = 0
  4252. for i in range(num_groups):
  4253. ssbn += _square_of_sums(ranked[j[i]:j[i+1]]) / n[i]
  4254. totaln = np.sum(n)
  4255. h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
  4256. df = num_groups - 1
  4257. h /= ties
  4258. return KruskalResult(h, distributions.chi2.sf(h, df))
  4259. FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
  4260. ('statistic', 'pvalue'))
  4261. def friedmanchisquare(*args):
  4262. """
  4263. Compute the Friedman test for repeated measurements
  4264. The Friedman test tests the null hypothesis that repeated measurements of
  4265. the same individuals have the same distribution. It is often used
  4266. to test for consistency among measurements obtained in different ways.
  4267. For example, if two measurement techniques are used on the same set of
  4268. individuals, the Friedman test can be used to determine if the two
  4269. measurement techniques are consistent.
  4270. Parameters
  4271. ----------
  4272. measurements1, measurements2, measurements3... : array_like
  4273. Arrays of measurements. All of the arrays must have the same number
  4274. of elements. At least 3 sets of measurements must be given.
  4275. Returns
  4276. -------
  4277. statistic : float
  4278. the test statistic, correcting for ties
  4279. pvalue : float
  4280. the associated p-value assuming that the test statistic has a chi
  4281. squared distribution
  4282. Notes
  4283. -----
  4284. Due to the assumption that the test statistic has a chi squared
  4285. distribution, the p-value is only reliable for n > 10 and more than
  4286. 6 repeated measurements.
  4287. References
  4288. ----------
  4289. .. [1] https://en.wikipedia.org/wiki/Friedman_test
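Examples
--------
A minimal usage sketch with three repeated measurements on twelve
simulated individuals; the statistic and p-value are not quoted as
reference output:

>>> from scipy import stats
>>> np.random.seed(12345678)
>>> m1 = stats.norm.rvs(size=12)
>>> m2 = m1 + stats.norm.rvs(scale=0.5, size=12)
>>> m3 = m1 + stats.norm.rvs(scale=0.5, size=12)
>>> chisq, p = stats.friedmanchisquare(m1, m2, m3)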
  4290. """
  4291. k = len(args)
  4292. if k < 3:
  4293. raise ValueError('Less than 3 levels. Friedman test not appropriate.')
  4294. n = len(args[0])
  4295. for i in range(1, k):
  4296. if len(args[i]) != n:
  4297. raise ValueError('Unequal N in friedmanchisquare. Aborting.')
  4298. # Rank data
  4299. data = np.vstack(args).T
  4300. data = data.astype(float)
  4301. for i in range(len(data)):
  4302. data[i] = rankdata(data[i])
  4303. # Handle ties
  4304. ties = 0
  4305. for i in range(len(data)):
  4306. replist, repnum = find_repeats(array(data[i]))
  4307. for t in repnum:
  4308. ties += t * (t*t - 1)
  4309. c = 1 - ties / (k*(k*k - 1)*n)
  4310. ssbn = np.sum(data.sum(axis=0)**2)
  4311. chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c
  4312. return FriedmanchisquareResult(chisq, distributions.chi2.sf(chisq, k - 1))
  4313. BrunnerMunzelResult = namedtuple('BrunnerMunzelResult',
  4314. ('statistic', 'pvalue'))
  4315. def brunnermunzel(x, y, alternative="two-sided", distribution="t",
  4316. nan_policy='propagate'):
  4317. """
4318. Compute the Brunner-Munzel test on samples x and y.
  4319. The Brunner-Munzel test is a nonparametric test of the null hypothesis that
  4320. when values are taken one by one from each group, the probabilities of
  4321. getting large values in both groups are equal.
4322. Unlike the Wilcoxon-Mann-Whitney U test, this does not require the
4323. assumption of equal variances in the two groups. Note that this does not
4324. assume that the distributions are the same. This test works on two independent samples,
  4325. which may have different sizes.
  4326. Parameters
  4327. ----------
  4328. x, y : array_like
  4329. Array of samples, should be one-dimensional.
  4330. alternative : 'less', 'two-sided', or 'greater', optional
  4331. Whether to get the p-value for the one-sided hypothesis ('less'
  4332. or 'greater') or for the two-sided hypothesis ('two-sided').
4333. Default value is 'two-sided'.
4334. distribution : 't' or 'normal', optional
4335. Whether to get the p-value by the t-distribution or by the standard
4336. normal distribution.
4337. Default value is 't'.
  4338. nan_policy : {'propagate', 'raise', 'omit'}, optional
  4339. Defines how to handle when input contains nan. 'propagate' returns nan,
  4340. 'raise' throws an error, 'omit' performs the calculations ignoring nan
  4341. values. Default is 'propagate'.
  4342. Returns
  4343. -------
  4344. statistic : float
4345. The Brunner-Munzel W statistic.
4346. pvalue : float
4347. p-value assuming a t distribution. One-sided or
  4348. two-sided, depending on the choice of `alternative` and `distribution`.
  4349. See Also
  4350. --------
  4351. mannwhitneyu : Mann-Whitney rank test on two samples.
  4352. Notes
4353. -----
4354. Brunner and Munzel recommended estimating the p-value by the
4355. t-distribution when the size of the data is 50 or less. If the size is
4356. lower than 10, it would be better to use the permuted Brunner-Munzel test (see [2]_).
  4357. References
  4358. ----------
4359. .. [1] Brunner, E. and Munzel, U. "The nonparametric Behrens-Fisher
  4360. problem: Asymptotic theory and a small-sample approximation".
  4361. Biometrical Journal. Vol. 42(2000): 17-25.
  4362. .. [2] Neubert, K. and Brunner, E. "A studentized permutation test for the
  4363. non-parametric Behrens-Fisher problem". Computational Statistics and
  4364. Data Analysis. Vol. 51(2007): 5192-5204.
  4365. Examples
  4366. --------
  4367. >>> from scipy import stats
  4368. >>> x1 = [1,2,1,1,1,1,1,1,1,1,2,4,1,1]
  4369. >>> x2 = [3,3,4,3,1,2,3,1,1,5,4]
  4370. >>> w, p_value = stats.brunnermunzel(x1, x2)
  4371. >>> w
  4372. 3.1374674823029505
  4373. >>> p_value
  4374. 0.0057862086661515377
  4375. """
  4376. x = np.asarray(x)
  4377. y = np.asarray(y)
  4378. # check both x and y
  4379. cnx, npx = _contains_nan(x, nan_policy)
  4380. cny, npy = _contains_nan(y, nan_policy)
  4381. contains_nan = cnx or cny
  4382. if npx == "omit" or npy == "omit":
  4383. nan_policy = "omit"
  4384. if contains_nan and nan_policy == "propagate":
  4385. return BrunnerMunzelResult(np.nan, np.nan)
  4386. elif contains_nan and nan_policy == "omit":
  4387. x = ma.masked_invalid(x)
  4388. y = ma.masked_invalid(y)
  4389. return mstats_basic.brunnermunzel(x, y, alternative, distribution)
  4390. nx = len(x)
  4391. ny = len(y)
  4392. if nx == 0 or ny == 0:
  4393. return BrunnerMunzelResult(np.nan, np.nan)
  4394. rankc = rankdata(np.concatenate((x, y)))
  4395. rankcx = rankc[0:nx]
  4396. rankcy = rankc[nx:nx+ny]
  4397. rankcx_mean = np.mean(rankcx)
  4398. rankcy_mean = np.mean(rankcy)
  4399. rankx = rankdata(x)
  4400. ranky = rankdata(y)
  4401. rankx_mean = np.mean(rankx)
  4402. ranky_mean = np.mean(ranky)
  4403. Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
  4404. Sx /= nx - 1
  4405. Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
  4406. Sy /= ny - 1
  4407. wbfn = nx * ny * (rankcy_mean - rankcx_mean)
  4408. wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
  4409. if distribution == "t":
  4410. df_numer = np.power(nx * Sx + ny * Sy, 2.0)
  4411. df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
  4412. df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
  4413. df = df_numer / df_denom
  4414. p = distributions.t.cdf(wbfn, df)
  4415. elif distribution == "normal":
  4416. p = distributions.norm.cdf(wbfn)
  4417. else:
  4418. raise ValueError(
  4419. "distribution should be 't' or 'normal'")
  4420. if alternative == "greater":
  4421. p = p
  4422. elif alternative == "less":
  4423. p = 1 - p
  4424. elif alternative == "two-sided":
  4425. p = 2 * np.min([p, 1-p])
  4426. else:
  4427. raise ValueError(
  4428. "alternative should be 'less', 'greater' or 'two-sided'")
  4429. return BrunnerMunzelResult(wbfn, p)
  4430. def combine_pvalues(pvalues, method='fisher', weights=None):
  4431. """
  4432. Methods for combining the p-values of independent tests bearing upon the
  4433. same hypothesis.
  4434. Parameters
  4435. ----------
  4436. pvalues : array_like, 1-D
  4437. Array of p-values assumed to come from independent tests.
  4438. method : {'fisher', 'stouffer'}, optional
  4439. Name of method to use to combine p-values. The following methods are
  4440. available:
  4441. - "fisher": Fisher's method (Fisher's combined probability test),
  4442. the default.
  4443. - "stouffer": Stouffer's Z-score method.
  4444. weights : array_like, 1-D, optional
  4445. Optional array of weights used only for Stouffer's Z-score method.
  4446. Returns
  4447. -------
  4448. statistic: float
  4449. The statistic calculated by the specified method:
  4450. - "fisher": The chi-squared statistic
  4451. - "stouffer": The Z-score
  4452. pval: float
  4453. The combined p-value.
  4454. Notes
  4455. -----
  4456. Fisher's method (also known as Fisher's combined probability test) [1]_ uses
  4457. a chi-squared statistic to compute a combined p-value. The closely related
  4458. Stouffer's Z-score method [2]_ uses Z-scores rather than p-values. The
  4459. advantage of Stouffer's method is that it is straightforward to introduce
  4460. weights, which can make Stouffer's method more powerful than Fisher's
  4461. method when the p-values are from studies of different size [3]_ [4]_.
  4462. Fisher's method may be extended to combine p-values from dependent tests
  4463. [5]_. Extensions such as Brown's method and Kost's method are not currently
  4464. implemented.
  4465. .. versionadded:: 0.15.0
  4466. References
  4467. ----------
  4468. .. [1] https://en.wikipedia.org/wiki/Fisher%27s_method
  4469. .. [2] https://en.wikipedia.org/wiki/Fisher%27s_method#Relation_to_Stouffer.27s_Z-score_method
  4470. .. [3] Whitlock, M. C. "Combining probability from independent tests: the
  4471. weighted Z-method is superior to Fisher's approach." Journal of
  4472. Evolutionary Biology 18, no. 5 (2005): 1368-1373.
  4473. .. [4] Zaykin, Dmitri V. "Optimally weighted Z-test is a powerful method
  4474. for combining probabilities in meta-analysis." Journal of
  4475. Evolutionary Biology 24, no. 8 (2011): 1836-1841.
  4476. .. [5] https://en.wikipedia.org/wiki/Extensions_of_Fisher%27s_method
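Examples
--------
A minimal usage sketch; the p-values and weights below are arbitrary
illustrations rather than results of real tests, so the combined values
are not quoted as reference output:

>>> from scipy import stats
>>> pvalues = [0.01, 0.20, 0.30]
>>> stat, pval = stats.combine_pvalues(pvalues, method='fisher')
>>> z, pval = stats.combine_pvalues(pvalues, method='stouffer',
...                                 weights=[1, 2, 3])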
  4477. """
  4478. pvalues = np.asarray(pvalues)
  4479. if pvalues.ndim != 1:
  4480. raise ValueError("pvalues is not 1-D")
  4481. if method == 'fisher':
  4482. Xsq = -2 * np.sum(np.log(pvalues))
  4483. pval = distributions.chi2.sf(Xsq, 2 * len(pvalues))
  4484. return (Xsq, pval)
  4485. elif method == 'stouffer':
  4486. if weights is None:
  4487. weights = np.ones_like(pvalues)
  4488. elif len(weights) != len(pvalues):
  4489. raise ValueError("pvalues and weights must be of the same size.")
  4490. weights = np.asarray(weights)
  4491. if weights.ndim != 1:
  4492. raise ValueError("weights is not 1-D")
  4493. Zi = distributions.norm.isf(pvalues)
  4494. Z = np.dot(weights, Zi) / np.linalg.norm(weights)
  4495. pval = distributions.norm.sf(Z)
  4496. return (Z, pval)
  4497. else:
  4498. raise ValueError(
  4499. "Invalid method '%s'. Options are 'fisher' or 'stouffer'", method)
  4500. #####################################
  4501. # STATISTICAL DISTANCES #
  4502. #####################################
  4503. def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
  4504. r"""
  4505. Compute the first Wasserstein distance between two 1D distributions.
  4506. This distance is also known as the earth mover's distance, since it can be
  4507. seen as the minimum amount of "work" required to transform :math:`u` into
  4508. :math:`v`, where "work" is measured as the amount of distribution weight
  4509. that must be moved, multiplied by the distance it has to be moved.
  4510. .. versionadded:: 1.0.0
  4511. Parameters
  4512. ----------
  4513. u_values, v_values : array_like
  4514. Values observed in the (empirical) distribution.
  4515. u_weights, v_weights : array_like, optional
  4516. Weight for each value. If unspecified, each value is assigned the same
  4517. weight.
  4518. `u_weights` (resp. `v_weights`) must have the same length as
  4519. `u_values` (resp. `v_values`). If the weight sum differs from 1, it
  4520. must still be positive and finite so that the weights can be normalized
  4521. to sum to 1.
  4522. Returns
  4523. -------
  4524. distance : float
  4525. The computed distance between the distributions.
  4526. Notes
  4527. -----
  4528. The first Wasserstein distance between the distributions :math:`u` and
  4529. :math:`v` is:
  4530. .. math::
  4531. l_1 (u, v) = \inf_{\pi \in \Gamma (u, v)} \int_{\mathbb{R} \times
  4532. \mathbb{R}} |x-y| \mathrm{d} \pi (x, y)
  4533. where :math:`\Gamma (u, v)` is the set of (probability) distributions on
  4534. :math:`\mathbb{R} \times \mathbb{R}` whose marginals are :math:`u` and
  4535. :math:`v` on the first and second factors respectively.
  4536. If :math:`U` and :math:`V` are the respective CDFs of :math:`u` and
4537. :math:`v`, this distance also equals:
  4538. .. math::
  4539. l_1(u, v) = \int_{-\infty}^{+\infty} |U-V|
  4540. See [2]_ for a proof of the equivalence of both definitions.
  4541. The input distributions can be empirical, therefore coming from samples
  4542. whose values are effectively inputs of the function, or they can be seen as
  4543. generalized functions, in which case they are weighted sums of Dirac delta
  4544. functions located at the specified values.
  4545. References
  4546. ----------
  4547. .. [1] "Wasserstein metric", https://en.wikipedia.org/wiki/Wasserstein_metric
  4548. .. [2] Ramdas, Garcia, Cuturi "On Wasserstein Two Sample Testing and Related
  4549. Families of Nonparametric Tests" (2015). :arXiv:`1509.02237`.
  4550. Examples
  4551. --------
  4552. >>> from scipy.stats import wasserstein_distance
  4553. >>> wasserstein_distance([0, 1, 3], [5, 6, 8])
  4554. 5.0
  4555. >>> wasserstein_distance([0, 1], [0, 1], [3, 1], [2, 2])
  4556. 0.25
  4557. >>> wasserstein_distance([3.4, 3.9, 7.5, 7.8], [4.5, 1.4],
  4558. ... [1.4, 0.9, 3.1, 7.2], [3.2, 3.5])
  4559. 4.0781331438047861
  4560. """
  4561. return _cdf_distance(1, u_values, v_values, u_weights, v_weights)
  4562. def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
  4563. r"""
  4564. Compute the energy distance between two 1D distributions.
  4565. .. versionadded:: 1.0.0
  4566. Parameters
  4567. ----------
  4568. u_values, v_values : array_like
  4569. Values observed in the (empirical) distribution.
  4570. u_weights, v_weights : array_like, optional
  4571. Weight for each value. If unspecified, each value is assigned the same
  4572. weight.
  4573. `u_weights` (resp. `v_weights`) must have the same length as
  4574. `u_values` (resp. `v_values`). If the weight sum differs from 1, it
  4575. must still be positive and finite so that the weights can be normalized
  4576. to sum to 1.
  4577. Returns
  4578. -------
  4579. distance : float
  4580. The computed distance between the distributions.
  4581. Notes
  4582. -----
  4583. The energy distance between two distributions :math:`u` and :math:`v`, whose
4584. respective CDFs are :math:`U` and :math:`V`, equals:
  4585. .. math::
  4586. D(u, v) = \left( 2\mathbb E|X - Y| - \mathbb E|X - X'| -
  4587. \mathbb E|Y - Y'| \right)^{1/2}
  4588. where :math:`X` and :math:`X'` (resp. :math:`Y` and :math:`Y'`) are
  4589. independent random variables whose probability distribution is :math:`u`
  4590. (resp. :math:`v`).
  4591. As shown in [2]_, for one-dimensional real-valued variables, the energy
  4592. distance is linked to the non-distribution-free version of the Cramer-von
  4593. Mises distance:
  4594. .. math::
  4595. D(u, v) = \sqrt{2} l_2(u, v) = \left( 2 \int_{-\infty}^{+\infty} (U-V)^2
  4596. \right)^{1/2}
  4597. Note that the common Cramer-von Mises criterion uses the distribution-free
  4598. version of the distance. See [2]_ (section 2), for more details about both
  4599. versions of the distance.
  4600. The input distributions can be empirical, therefore coming from samples
  4601. whose values are effectively inputs of the function, or they can be seen as
  4602. generalized functions, in which case they are weighted sums of Dirac delta
  4603. functions located at the specified values.
  4604. References
  4605. ----------
  4606. .. [1] "Energy distance", https://en.wikipedia.org/wiki/Energy_distance
  4607. .. [2] Szekely "E-statistics: The energy of statistical samples." Bowling
  4608. Green State University, Department of Mathematics and Statistics,
  4609. Technical Report 02-16 (2002).
  4610. .. [3] Rizzo, Szekely "Energy distance." Wiley Interdisciplinary Reviews:
  4611. Computational Statistics, 8(1):27-38 (2015).
  4612. .. [4] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
  4613. Munos "The Cramer Distance as a Solution to Biased Wasserstein
  4614. Gradients" (2017). :arXiv:`1705.10743`.
  4615. Examples
  4616. --------
  4617. >>> from scipy.stats import energy_distance
  4618. >>> energy_distance([0], [2])
  4619. 2.0000000000000004
  4620. >>> energy_distance([0, 8], [0, 8], [3, 1], [2, 2])
  4621. 1.0000000000000002
  4622. >>> energy_distance([0.7, 7.4, 2.4, 6.8], [1.4, 8. ],
  4623. ... [2.1, 4.2, 7.4, 8. ], [7.6, 8.8])
  4624. 0.88003340976158217
  4625. """
  4626. return np.sqrt(2) * _cdf_distance(2, u_values, v_values,
  4627. u_weights, v_weights)
  4628. def _cdf_distance(p, u_values, v_values, u_weights=None, v_weights=None):
  4629. r"""
  4630. Compute, between two one-dimensional distributions :math:`u` and
  4631. :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the
  4632. statistical distance that is defined as:
  4633. .. math::
  4634. l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
  4635. p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
  4636. gives the energy distance.
  4637. Parameters
  4638. ----------
  4639. u_values, v_values : array_like
  4640. Values observed in the (empirical) distribution.
  4641. u_weights, v_weights : array_like, optional
  4642. Weight for each value. If unspecified, each value is assigned the same
  4643. weight.
  4644. `u_weights` (resp. `v_weights`) must have the same length as
  4645. `u_values` (resp. `v_values`). If the weight sum differs from 1, it
  4646. must still be positive and finite so that the weights can be normalized
  4647. to sum to 1.
  4648. Returns
  4649. -------
  4650. distance : float
  4651. The computed distance between the distributions.
  4652. Notes
  4653. -----
  4654. The input distributions can be empirical, therefore coming from samples
  4655. whose values are effectively inputs of the function, or they can be seen as
  4656. generalized functions, in which case they are weighted sums of Dirac delta
  4657. functions located at the specified values.
  4658. References
  4659. ----------
  4660. .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
  4661. Munos "The Cramer Distance as a Solution to Biased Wasserstein
  4662. Gradients" (2017). :arXiv:`1705.10743`.
  4663. """
  4664. u_values, u_weights = _validate_distribution(u_values, u_weights)
  4665. v_values, v_weights = _validate_distribution(v_values, v_weights)
  4666. u_sorter = np.argsort(u_values)
  4667. v_sorter = np.argsort(v_values)
  4668. all_values = np.concatenate((u_values, v_values))
  4669. all_values.sort(kind='mergesort')
  4670. # Compute the differences between pairs of successive values of u and v.
  4671. deltas = np.diff(all_values)
  4672. # Get the respective positions of the values of u and v among the values of
  4673. # both distributions.
  4674. u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], 'right')
  4675. v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], 'right')
  4676. # Calculate the CDFs of u and v using their weights, if specified.
  4677. if u_weights is None:
  4678. u_cdf = u_cdf_indices / u_values.size
  4679. else:
  4680. u_sorted_cumweights = np.concatenate(([0],
  4681. np.cumsum(u_weights[u_sorter])))
  4682. u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1]
  4683. if v_weights is None:
  4684. v_cdf = v_cdf_indices / v_values.size
  4685. else:
  4686. v_sorted_cumweights = np.concatenate(([0],
  4687. np.cumsum(v_weights[v_sorter])))
  4688. v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1]
  4689. # Compute the value of the integral based on the CDFs.
  4690. # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead
  4691. # of about 15%.
  4692. if p == 1:
  4693. return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas))
  4694. if p == 2:
  4695. return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas)))
  4696. return np.power(np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p),
  4697. deltas)), 1/p)
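

# A hypothetical, unoptimized reference for the unweighted case of the
# routine above (illustration only, not used by the module): it evaluates the
# empirical CDFs of u and v at every breakpoint of the pooled sample and
# accumulates |U - V|**p times the segment lengths, mirroring the integral in
# the docstring. It assumes the module-level `np` (numpy) import.
def _cdf_distance_naive(p, u_values, v_values):
    u = np.asarray(u_values, dtype=float)
    v = np.asarray(v_values, dtype=float)
    all_values = np.sort(np.concatenate((u, v)))
    deltas = np.diff(all_values)
    # Empirical CDFs evaluated at the left endpoint of each segment.
    u_cdf = (u[None, :] <= all_values[:-1, None]).mean(axis=1)
    v_cdf = (v[None, :] <= all_values[:-1, None]).mean(axis=1)
    return np.sum(np.abs(u_cdf - v_cdf) ** p * deltas) ** (1 / p)

# For example, _cdf_distance_naive(1, [0, 1, 3], [5, 6, 8]) and
# _cdf_distance(1, [0, 1, 3], [5, 6, 8]) both give 5.0 (the 1-D Wasserstein
# distance), up to floating-point rounding.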


def _validate_distribution(values, weights):
    """
    Validate the values and weights from a distribution input of
    `_cdf_distance` and return them as ndarray objects.

    Parameters
    ----------
    values : array_like
        Values observed in the (empirical) distribution.
    weights : array_like
        Weight for each value.

    Returns
    -------
    values : ndarray
        Values as ndarray.
    weights : ndarray
        Weights as ndarray.

    """
    # Validate the value array.
    values = np.asarray(values, dtype=float)
    if len(values) == 0:
        raise ValueError("Distribution can't be empty.")

    # Validate the weight array, if specified.
    if weights is not None:
        weights = np.asarray(weights, dtype=float)
        if len(weights) != len(values):
            raise ValueError('Value and weight array-likes for the same '
                             'empirical distribution must be of the same '
                             'size.')
        if np.any(weights < 0):
            raise ValueError('All weights must be non-negative.')
        if not 0 < np.sum(weights) < np.inf:
            raise ValueError('Weight array-like sum must be positive and '
                             'finite. Set as None for an equal distribution '
                             'of weight.')
        return values, weights

    return values, None
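

# A hypothetical sketch (not part of the module) making the validation rules
# above concrete: valid inputs come back as float ndarrays, while an empty
# sample, a length mismatch, or a non-positive weight sum each raise the
# ValueError documented in the corresponding branch.
def _validate_distribution_demo():
    ok_values, ok_weights = _validate_distribution([1.0, 2.0], [3.0, 1.0])
    failures = 0
    for bad_values, bad_weights in (([], None),
                                    ([1.0, 2.0], [1.0]),
                                    ([1.0, 2.0], [0.0, 0.0])):
        try:
            _validate_distribution(bad_values, bad_weights)
        except ValueError:
            failures += 1
    return ok_values, ok_weights, failures  # failures == 3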


#####################################
#         SUPPORT FUNCTIONS         #
#####################################

RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))


def find_repeats(arr):
    """
    Find repeats and repeat counts.

    Parameters
    ----------
    arr : array_like
        Input array. This is cast to float64.

    Returns
    -------
    values : ndarray
        The unique values from the (flattened) input that are repeated.
    counts : ndarray
        Number of times the corresponding 'value' is repeated.

    Notes
    -----
    In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
    difference is that `find_repeats` only returns repeated values.

    Examples
    --------
    >>> from scipy import stats
    >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
    RepeatedResults(values=array([2.]), counts=array([4]))
    >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
    RepeatedResults(values=array([4., 5.]), counts=array([2, 2]))

    """
    # Note: always copies.
    return RepeatedResults(*_find_repeats(np.array(arr, dtype=np.float64)))


def _sum_of_squares(a, axis=0):
    """
    Square each element of the input array, and return the sum(s) of that.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    sum_of_squares : ndarray
        The sum along the given axis for (a**2).

    See also
    --------
    _square_of_sums : The square(s) of the sum(s) (the opposite of
        `_sum_of_squares`).

    """
    a, axis = _chk_asarray(a, axis)
    return np.sum(a*a, axis)


def _square_of_sums(a, axis=0):
    """
    Sum elements of the input array, and return the square(s) of that sum.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    square_of_sums : float or ndarray
        The square of the sum over `axis`.

    See also
    --------
    _sum_of_squares : The sum of squares (the opposite of `_square_of_sums`).

    """
    a, axis = _chk_asarray(a, axis)
    s = np.sum(a, axis)
    if not np.isscalar(s):
        return s.astype(float) * s
    else:
        return float(s) * s
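

# A quick illustrative contrast of the two helpers above (hypothetical
# sketch, not used elsewhere in the module): for a = [1, 2, 3],
# _sum_of_squares gives 1 + 4 + 9 = 14, while _square_of_sums gives
# (1 + 2 + 3) ** 2 = 36.
def _sums_demo():
    a = np.array([1.0, 2.0, 3.0])
    return _sum_of_squares(a), _square_of_sums(a)  # (14.0, 36.0)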


def rankdata(a, method='average'):
    """
    Assign ranks to data, dealing with ties appropriately.

    Ranks begin at 1. The `method` argument controls how ranks are assigned
    to equal values. See [1]_ for further discussion of ranking methods.

    Parameters
    ----------
    a : array_like
        The array of values to be ranked. The array is first flattened.
    method : str, optional
        The method used to assign ranks to tied elements.
        The options are 'average', 'min', 'max', 'dense' and 'ordinal'.

        'average':
            The average of the ranks that would have been assigned to
            all the tied values is assigned to each value.
        'min':
            The minimum of the ranks that would have been assigned to all
            the tied values is assigned to each value. (This is also
            referred to as "competition" ranking.)
        'max':
            The maximum of the ranks that would have been assigned to all
            the tied values is assigned to each value.
        'dense':
            Like 'min', but the rank of the next highest element is assigned
            the rank immediately after those assigned to the tied elements.
        'ordinal':
            All values are given a distinct rank, corresponding to the order
            that the values occur in `a`.

        The default is 'average'.

    Returns
    -------
    ranks : ndarray
        An array of length equal to the size of `a`, containing rank
        scores.

    References
    ----------
    .. [1] "Ranking", https://en.wikipedia.org/wiki/Ranking

    Examples
    --------
    >>> from scipy.stats import rankdata
    >>> rankdata([0, 2, 3, 2])
    array([ 1. , 2.5, 4. , 2.5])
    >>> rankdata([0, 2, 3, 2], method='min')
    array([ 1, 2, 4, 2])
    >>> rankdata([0, 2, 3, 2], method='max')
    array([ 1, 3, 4, 3])
    >>> rankdata([0, 2, 3, 2], method='dense')
    array([ 1, 2, 3, 2])
    >>> rankdata([0, 2, 3, 2], method='ordinal')
    array([ 1, 2, 4, 3])

    """
    if method not in ('average', 'min', 'max', 'dense', 'ordinal'):
        raise ValueError('unknown method "{0}"'.format(method))

    arr = np.ravel(np.asarray(a))
    algo = 'mergesort' if method == 'ordinal' else 'quicksort'
    sorter = np.argsort(arr, kind=algo)

    inv = np.empty(sorter.size, dtype=np.intp)
    inv[sorter] = np.arange(sorter.size, dtype=np.intp)

    if method == 'ordinal':
        return inv + 1

    arr = arr[sorter]
    obs = np.r_[True, arr[1:] != arr[:-1]]
    dense = obs.cumsum()[inv]

    if method == 'dense':
        return dense

    # cumulative counts of each unique value
    count = np.r_[np.nonzero(obs)[0], len(obs)]

    if method == 'max':
        return count[dense]

    if method == 'min':
        return count[dense - 1] + 1

    # average method
    return .5 * (count[dense] + count[dense - 1] + 1)
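

# An illustrative consistency check (hypothetical helper, not part of the
# module): for any input, the 'average' method equals the elementwise mean of
# the 'min' and 'max' methods, which is exactly what the final line of
# `rankdata` computes from count[dense] and count[dense - 1] + 1.
def _rankdata_average_check(a):
    avg = rankdata(a, method='average')
    min_max_mean = 0.5 * (rankdata(a, method='min') + rankdata(a, method='max'))
    return np.allclose(avg, min_max_mean)  # True, e.g. for [0, 2, 3, 2]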