# pylint: disable=E1101
# pylint: disable=W0212,W0703,W0622
"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.

Similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""
from __future__ import division

import collections
import functools
import itertools
import sys
import warnings
from distutils.version import LooseVersion
from textwrap import dedent

import numpy as np
import numpy.ma as ma

from pandas._libs import lib, algos as libalgos

from pandas.util._decorators import (Appender, Substitution,
                                     rewrite_axis_style_signature,
                                     deprecate_kwarg)
from pandas.util._validators import (validate_bool_kwarg,
                                     validate_axis_style_args)

from pandas import compat
from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
                           OrderedDict, PY36, raise_with_traceback,
                           string_and_binary_types)
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.cast import (
    maybe_upcast,
    cast_scalar_to_array,
    infer_dtype_from_scalar,
    maybe_cast_to_datetime,
    maybe_infer_to_datetimelike,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    invalidate_string_dtypes,
    coerce_to_dtypes,
    maybe_upcast_putmask,
    find_common_type)
from pandas.core.dtypes.common import (
    is_dict_like,
    is_datetime64tz_dtype,
    is_object_dtype,
    is_extension_type,
    is_extension_array_dtype,
    is_datetime64_any_dtype,
    is_bool_dtype,
    is_integer_dtype,
    is_float_dtype,
    is_integer,
    is_scalar,
    is_dtype_equal,
    needs_i8_conversion,
    infer_dtype_from_object,
    ensure_float64,
    ensure_int64,
    ensure_platform_int,
    is_list_like,
    is_nested_list_like,
    is_iterator,
    is_sequence,
    is_named_tuple)
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms
from pandas.core import common as com
from pandas.core import nanops
from pandas.core import ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import (
    DatetimeLikeArrayMixin as DatetimeLikeArray
)
from pandas.core.config import get_option
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (Index, MultiIndex, ensure_index,
                               ensure_index_from_sequences)
from pandas.core.indexes import base as ibase
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
                                  check_bool_indexer)
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
    masked_rec_array_to_mgr, get_names_from_index, to_arrays,
    reorder_arrays, init_ndarray, init_dict,
    arrays_to_mgr, sanitize_index)
from pandas.core.series import Series

from pandas.io.formats import console
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing

import pandas.plotting._core as gfx
# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = dict(
    axes='index, columns', klass='DataFrame',
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels

        .. versionchanged:: 0.23.0
           Allow specifying index or column level names.""",
    versionadded_to_excel='',
    optional_labels="""labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
)
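
# A quick sketch of how these templates are consumed: methods stamp the
# shared fragments into their docstrings through the ``Substitution`` /
# ``Appender`` decorators imported above, so a placeholder such as
# ``%(axes_single_arg)s`` expands to the text defined here. The method name
# and docstring key below are hypothetical, for illustration only.
#
#     @Substitution(**_shared_doc_kwargs)
#     @Appender(_shared_docs['some_method'])
#     def some_method(self, axis=0):
#         ...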

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""
_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to output DataFrame called "_merge" with
    information on the source of each row.
    If string, column with information on source of each row will be added to
    output DataFrame, and column will be named value of string.
    Information column is Categorical-type and takes on a value of "left_only"
    for observations whose merge key only appears in 'left' DataFrame,
    "right_only" for observations whose merge key only appears in 'right'
    DataFrame, and "both" if the observation's merge key is found in both.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0

Examples
--------

>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')
"""

# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame):
    """
    Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). Arithmetic operations
    align on both row and column labels. Can be thought of as a dict-like
    container for Series objects. The primary pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects

        .. versionchanged :: 0.23.0
           If data is a dict, argument order is maintained for Python 3.6
           and later.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided
    columns : Index or array-like
        Column labels to use for resulting frame. Will default to
        RangeIndex (0, 1, 2, ..., n) if no column labels are provided
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer
    copy : boolean, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    DataFrame.from_items : From sequence of (key, value) pairs
        pandas.read_csv, pandas.read_table, pandas.read_clipboard.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9
    """

    @property
    def _constructor(self):
        return DataFrame

    _constructor_sliced = Series
    _deprecations = NDFrame._deprecations | frozenset(
        ['get_value', 'set_value', 'from_csv', 'from_items'])
    _accessors = set()

    @property
    def _constructor_expanddim(self):
        from pandas.core.panel import Panel
        return Panel

    # ----------------------------------------------------------------------
    # Constructors
    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False):
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype,
                                              copy)

            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype,
                                   copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, 'name', None) is not None:
                mgr = init_dict({data.name: data}, index, columns,
                                dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype,
                                   copy=copy)

        # For data is list-like, or Iterable (will consume into list)
        elif (isinstance(data, compat.Iterable)
                and not isinstance(data, string_and_binary_types)):
            if not isinstance(data, compat.Sequence):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns,
                                        dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype,
                                       copy=copy)
            else:
                mgr = init_dict({}, index, columns, dtype=dtype)
        else:
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError('DataFrame constructor called with '
                                'incompatible data and dtype: {e}'.format(e=e))
                raise_with_traceback(exc)

            if arr.ndim == 0 and index is not None and columns is not None:
                values = cast_scalar_to_array((len(index), len(columns)),
                                              data, dtype=dtype)
                mgr = init_ndarray(values, index, columns,
                                   dtype=values.dtype, copy=False)
            else:
                raise ValueError('DataFrame constructor not properly called!')

        NDFrame.__init__(self, mgr, fastpath=True)
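
    # An illustrative sketch of the masked-array branch above (assuming
    # NumPy's ``numpy.ma`` semantics): masked entries force an upcast and
    # are filled with the upcast dtype's NA value, so a masked int array
    # comes out as float64 with NaN in the masked slots.
    #
    #     >>> marr = np.ma.masked_array([[1, 2], [3, 4]],
    #     ...                           mask=[[False, True], [False, False]])
    #     >>> pd.DataFrame(marr)
    #          0    1
    #     0  1.0  NaN
    #     1  3.0  4.0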

    # ----------------------------------------------------------------------

    @property
    def axes(self):
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self):
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...    "A": np.array([1, 2], dtype=np.int32),
        ...    "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if self._data.any_extension_types:
            return len({block.dtype for block in self._data.blocks}) == 1
        else:
            return not self._data.is_mixed_type

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self):
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width=False):
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of a non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if ((max_columns and nb_columns > max_columns) or
                ((not ignore_width) and width and nb_columns > (width // 2))):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if (get_option('display.width') is not None or
                console.in_ipython_frontend()):
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if not (max_rows is None):  # unlimited rows
            # min of two, where one may be None
            d = d.iloc[:min(max_rows, len(d))]
        else:
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(l) for l in value.split('\n'))

        return repr_width < width

    def _info_repr(self):
        """
        True if the repr should show the info view.
        """
        info_repr_option = (get_option("display.large_repr") == "info")
        return info_repr_option and not (self._repr_fits_horizontal_() and
                                         self._repr_fits_vertical_())
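
    # An illustrative usage note: the repr helpers above are driven entirely
    # by display options, so the info view can be forced (or the horizontal
    # check widened) via ``pandas.set_option``.
    #
    #     >>> pd.set_option('display.large_repr', 'info')  # repr becomes df.info()
    #     >>> pd.set_option('display.max_columns', 20)
    #     >>> pd.reset_option('display.large_repr')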

    def __unicode__(self):
        """
        Return a string representation for a particular DataFrame.

        Invoked by unicode(df) in py2 only. Yields a Unicode String in both
        py2/py3.
        """
        buf = StringIO(u(""))
        if self._info_repr():
            self.info(buf=buf)
            return buf.getvalue()

        max_rows = get_option("display.max_rows")
        max_cols = get_option("display.max_columns")
        show_dimensions = get_option("display.show_dimensions")
        if get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
                       line_width=width, show_dimensions=show_dimensions)

        return buf.getvalue()

    def _repr_html_(self):
        """
        Return an HTML representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        # qtconsole doesn't report its line width, and also
        # behaves badly when outputting an HTML table
        # that doesn't fit the window, so disable it.
        # XXX: In IPython 3.x and above, the Qt console will not attempt to
        # display HTML, so this check can be removed when support for
        # IPython 2.x is no longer needed.
        try:
            import IPython
        except ImportError:
            pass
        else:
            if LooseVersion(IPython.__version__) < LooseVersion('3.0'):
                if console.in_qtconsole():
                    # 'HTML output is disabled in QtConsole'
                    return None

        if self._info_repr():
            buf = StringIO(u(""))
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace('<', r'&lt;', 1)
            val = val.replace('>', r'&gt;', 1)
            return '<pre>' + val + '</pre>'

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            return self.to_html(max_rows=max_rows, max_cols=max_cols,
                                show_dimensions=show_dimensions, notebook=True)
        else:
            return None

    @Substitution(header='Write out the column names. If a list of strings '
                         'is given, it is assumed to be aliases for the '
                         'column names')
    @Substitution(shared_params=fmt.common_docstring,
                  returns=fmt.return_docstring)
    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, justify=None,
                  max_rows=None, max_cols=None, show_dimensions=False,
                  decimal='.', line_width=None):
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
                                           col_space=col_space, na_rep=na_rep,
                                           formatters=formatters,
                                           float_format=float_format,
                                           sparsify=sparsify, justify=justify,
                                           index_names=index_names,
                                           header=header, index=index,
                                           max_rows=max_rows,
                                           max_cols=max_cols,
                                           show_dimensions=show_dimensions,
                                           decimal=decimal,
                                           line_width=line_width)
        formatter.to_string()

        if buf is None:
            result = formatter.buf.getvalue()
            return result

    # ----------------------------------------------------------------------

    @property
    def style(self):
        """
        Property returning a Styler object containing methods for
        building a styled HTML representation of the DataFrame.

        See Also
        --------
        pandas.io.formats.style.Styler
        """
        from pandas.io.formats.style import Styler
        return Styler(self)

    def iteritems(self):
        r"""
        Iterator over (column name, Series) pairs.

        Iterates over the DataFrame columns, returning a tuple with
        the column name and the content as a Series.

        Yields
        ------
        label : object
            The column names for the DataFrame being iterated over.
        content : Series
            The column entries belonging to each label, as a Series.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as
            (index, Series) pairs.
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
            of the values.

        Examples
        --------
        >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
        ...                    'population': [1864, 22000, 80000]},
        ...                   index=['panda', 'polar', 'koala'])
        >>> df
                 species  population
        panda       bear        1864
        polar       bear       22000
        koala  marsupial       80000
        >>> for label, content in df.iteritems():
        ...     print('label:', label)
        ...     print('content:', content, sep='\n')
        ...
        label: species
        content:
        panda         bear
        polar         bear
        koala    marsupial
        Name: species, dtype: object
        label: population
        content:
        panda     1864
        polar    22000
        koala    80000
        Name: population, dtype: int64
        """
        if self.columns.is_unique and hasattr(self, '_item_cache'):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)
    def iterrows(self):
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        it : generator
            A generator that iterates over the rows of the frame.

        See Also
        --------
        itertuples : Iterate over DataFrame rows as namedtuples of the values.
        iteritems : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k)
            yield k, s
    def itertuples(self, index=True, name="Pandas"):
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Yields
        ------
        collections.namedtuple
            Yields a namedtuple for each row in the DataFrame with the first
            field possibly being the index and following fields being the
            column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.iteritems : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.
        With a large number of columns (>255), regular tuples are returned.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        # Python 3 supports at most 255 arguments to constructor, and
        # things get slow with this many fields in Python 2
        if name is not None and len(self.columns) + index < 256:
            # `rename` is unsupported in Python 2.6
            try:
                itertuple = collections.namedtuple(name, fields, rename=True)
                return map(itertuple._make, zip(*arrays))
            except Exception:
                pass

        # fallback to regular tuples
        return zip(*arrays)
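    # Editorial sketch (not part of pandas): demonstrates the tuple fallback
    # documented above. With more than 255 fields (here 300 columns and no
    # Index field), namedtuple construction is skipped and plain tuples come
    # back. `_demo_itertuples_fallback` is a hypothetical helper added only
    # for illustration.
    @staticmethod
    def _demo_itertuples_fallback():
        import pandas as pd
        wide = pd.DataFrame({'c%d' % i: [i] for i in range(300)})
        first = next(wide.itertuples(index=False))
        assert type(first) is tuple  # a plain tuple, not a namedtuple
        return first[:3]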
    items = iteritems

    def __len__(self):
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)
    def dot(self, other):
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other`` in Python >= 3.5.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other in a DataFrame or a np.array.

        See Also
        --------
        Series.dot: Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as @

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is a np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if (len(common) > len(self.columns) or
                    len(common) > len(other.index)):
                raise ValueError('matrices are not aligned')

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right.values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError('Dot product shape mismatch, '
                                 '{s} vs {r}'.format(s=lvals.shape,
                                                     r=rvals.shape))

        if isinstance(other, DataFrame):
            return self._constructor(np.dot(lvals, rvals), index=left.index,
                                     columns=other.columns)
        elif isinstance(other, Series):
            return Series(np.dot(lvals, rvals), index=left.index)
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index)
            else:
                return Series(result, index=left.index)
        else:  # pragma: no cover
            raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
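    # Editorial sketch (not part of pandas): `dot` aligns on labels, not on
    # positions; self's columns are matched against other's index before the
    # product is taken. `_demo_dot_alignment` is a hypothetical helper added
    # only for illustration.
    @staticmethod
    def _demo_dot_alignment():
        import pandas as pd
        df = pd.DataFrame([[1, 2]], columns=['a', 'b'])
        s = pd.Series({'b': 10, 'a': 1})  # deliberately out of order
        # both operands are reindexed to the common labels {'a', 'b'},
        # so the result is 1 * 1 + 2 * 10 = 21, not 1 * 10 + 2 * 1 = 12
        return df.dot(s)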
    def __matmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.dot(other)

    def __rmatmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.T.dot(np.transpose(other)).T
    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(cls, data, orient='columns', dtype=None, columns=None):
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
        dtype : dtype, default None
            Data type to force, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'``.

            .. versionadded:: 0.23.0

        Returns
        -------
        pandas.DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from ndarray (structured
            dtype), list of tuples, dict, or DataFrame.
        DataFrame : DataFrame object creation using constructor.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d
        """
        index = None
        orient = orient.lower()
        if orient == 'index':
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    data, index = list(data.values()), list(data.keys())
        elif orient == 'columns':
            if columns is not None:
                raise ValueError("cannot use columns parameter with "
                                 "orient='columns'")
        else:  # pragma: no cover
            raise ValueError('only recognize index or columns for orient')

        return cls(data, index=index, columns=columns, dtype=dtype)
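    # Editorial sketch (not part of pandas): the `columns` argument is only
    # meaningful for ``orient='index'``; with the default orientation the dict
    # keys already are the columns, so passing both raises, per the guard
    # above. `_demo_from_dict_columns_guard` is a hypothetical helper.
    @staticmethod
    def _demo_from_dict_columns_guard():
        import pandas as pd
        try:
            pd.DataFrame.from_dict({'a': [1, 2]}, columns=['a'])
        except ValueError as err:
            return str(err)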
    def to_numpy(self, dtype=None, copy=False):
        """
        Convert the DataFrame to a NumPy array.

        .. versionadded:: 0.24.0

        By default, the dtype of the returned array will be the common NumPy
        dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
        This may require copying data and coercing values, which may be
        expensive.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.

        Returns
        -------
        array : numpy.ndarray

        See Also
        --------
        Series.to_numpy : Similar method for Series.

        Examples
        --------
        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
        array([[1, 3],
               [2, 4]])

        With heterogeneous data, the lowest common type will have to
        be used.

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
        >>> df.to_numpy()
        array([[1. , 3. ],
               [2. , 4.5]])

        For a mix of numeric and non-numeric types, the output array will
        have object dtype.

        >>> df['C'] = pd.date_range('2000', periods=2)
        >>> df.to_numpy()
        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        result = np.array(self.values, dtype=dtype, copy=copy)
        return result
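    # Editorial sketch (not part of pandas): mixed numeric dtypes are upcast
    # to the common NumPy dtype, as described in the docstring above.
    # `_demo_to_numpy_common_dtype` is a hypothetical helper added only for
    # illustration.
    @staticmethod
    def _demo_to_numpy_common_dtype():
        import numpy as np
        import pandas as pd
        df = pd.DataFrame({'i': [1, 2], 'f': [0.5, 0.75]})
        out = df.to_numpy()
        assert out.dtype == np.dtype('float64')  # int64 column upcast
        return out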
    def to_dict(self, orient='dict', into=dict):
        """
        Convert the DataFrame to a dictionary.

        The type of the key-value pairs can be customized with the parameters
        (see below).

        Parameters
        ----------
        orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
            Determines the type of the values of the dictionary.

            - 'dict' (default) : dict like {column -> {index -> value}}
            - 'list' : dict like {column -> [values]}
            - 'series' : dict like {column -> Series(values)}
            - 'split' : dict like
              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
            - 'records' : list like
              [{column -> value}, ... , {column -> value}]
            - 'index' : dict like {index -> {column -> value}}

            Abbreviations are allowed. `s` indicates `series` and `sp`
            indicates `split`.

        into : class, default dict
            The collections.Mapping subclass used for all Mappings
            in the return value. Can be the actual class or an empty
            instance of the mapping type you want. If you want a
            collections.defaultdict, you must pass it initialized.

            .. versionadded:: 0.21.0

        Returns
        -------
        dict, list or collections.Mapping
            Return a collections.Mapping object representing the DataFrame.
            The resulting transformation depends on the `orient` parameter.

        See Also
        --------
        DataFrame.from_dict: Create a DataFrame from a dictionary.
        DataFrame.to_json: Convert a DataFrame to JSON format.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2],
        ...                    'col2': [0.5, 0.75]},
        ...                   index=['row1', 'row2'])
        >>> df
              col1  col2
        row1     1  0.50
        row2     2  0.75
        >>> df.to_dict()
        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

        You can specify the return orientation.

        >>> df.to_dict('series')
        {'col1': row1    1
                 row2    2
        Name: col1, dtype: int64,
        'col2': row1    0.50
                row2    0.75
        Name: col2, dtype: float64}

        >>> df.to_dict('split')
        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
         'data': [[1, 0.5], [2, 0.75]]}

        >>> df.to_dict('records')
        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]

        >>> df.to_dict('index')
        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}

        You can also specify the mapping type.

        >>> from collections import OrderedDict, defaultdict
        >>> df.to_dict(into=OrderedDict)
        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])

        If you want a `defaultdict`, you need to initialize it:

        >>> dd = defaultdict(list)
        >>> df.to_dict('records', into=dd)
        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
        """
        if not self.columns.is_unique:
            warnings.warn("DataFrame columns are not unique, some "
                          "columns will be omitted.", UserWarning,
                          stacklevel=2)
        # GH16122
        into_c = com.standardize_mapping(into)
        if orient.lower().startswith('d'):
            return into_c(
                (k, v.to_dict(into)) for k, v in compat.iteritems(self))
        elif orient.lower().startswith('l'):
            return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
        elif orient.lower().startswith('sp'):
            return into_c((('index', self.index.tolist()),
                           ('columns', self.columns.tolist()),
                           ('data', [
                               list(map(com.maybe_box_datetimelike, t))
                               for t in self.itertuples(index=False, name=None)
                           ])))
        elif orient.lower().startswith('s'):
            return into_c((k, com.maybe_box_datetimelike(v))
                          for k, v in compat.iteritems(self))
        elif orient.lower().startswith('r'):
            columns = self.columns.tolist()
            rows = (dict(zip(columns, row))
                    for row in self.itertuples(index=False, name=None))
            return [
                into_c((k, com.maybe_box_datetimelike(v))
                       for k, v in compat.iteritems(row))
                for row in rows]
        elif orient.lower().startswith('i'):
            if not self.index.is_unique:
                raise ValueError(
                    "DataFrame index must be unique for orient='index'."
                )
            return into_c((t[0], dict(zip(self.columns, t[1:])))
                          for t in self.itertuples(name=None))
        else:
            raise ValueError("orient '{o}' not understood".format(o=orient))
    def to_gbq(self, destination_table, project_id=None, chunksize=None,
               reauth=False, if_exists='fail', auth_local_webserver=False,
               table_schema=None, location=None, progress_bar=True,
               credentials=None, verbose=None, private_key=None):
        """
        Write a DataFrame to a Google BigQuery table.

        This function requires the `pandas-gbq package
        <https://pandas-gbq.readthedocs.io>`__.

        See the `How to authenticate with Google BigQuery
        <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
        guide for authentication instructions.

        Parameters
        ----------
        destination_table : str
            Name of table to be written, in the form ``dataset.tablename``.
        project_id : str, optional
            Google BigQuery Account project ID. Optional when available from
            the environment.
        chunksize : int, optional
            Number of rows to be inserted in each chunk from the dataframe.
            Set to ``None`` to load the whole dataframe at once.
        reauth : bool, default False
            Force Google BigQuery to re-authenticate the user. This is useful
            if multiple accounts are used.
        if_exists : str, default 'fail'
            Behavior when the destination table exists. Value can be one of:

            ``'fail'``
                If table exists, do nothing.
            ``'replace'``
                If table exists, drop it, recreate it, and insert data.
            ``'append'``
                If table exists, insert data. Create if does not exist.
        auth_local_webserver : bool, default False
            Use the `local webserver flow`_ instead of the `console flow`_
            when getting user credentials.

            .. _local webserver flow:
                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
            .. _console flow:
                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

            *New in version 0.2.0 of pandas-gbq*.
        table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame columns
            conform, e.g. ``[{'name': 'col1', 'type':
            'STRING'},...]``. If schema is not provided, it will be
            generated according to dtypes of DataFrame columns. See
            BigQuery API documentation on available names of a field.

            *New in version 0.3.1 of pandas-gbq*.
        location : str, optional
            Location where the load job should run. See the `BigQuery locations
            documentation
            <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
            list of available locations. The location must match that of the
            target dataset.

            *New in version 0.5.0 of pandas-gbq*.
        progress_bar : bool, default True
            Use the library `tqdm` to show the progress bar for the upload,
            chunk by chunk.

            *New in version 0.5.0 of pandas-gbq*.
        credentials : google.auth.credentials.Credentials, optional
            Credentials for accessing Google APIs. Use this parameter to
            override default credentials, such as to use Compute Engine
            :class:`google.auth.compute_engine.Credentials` or Service
            Account :class:`google.oauth2.service_account.Credentials`
            directly.

            *New in version 0.8.0 of pandas-gbq*.

            .. versionadded:: 0.24.0
        verbose : bool, deprecated
            Deprecated in pandas-gbq version 0.4.0. Use the `logging module
            to adjust verbosity instead
            <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
        private_key : str, deprecated
            Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
            parameter and
            :func:`google.oauth2.service_account.Credentials.from_service_account_info`
            or
            :func:`google.oauth2.service_account.Credentials.from_service_account_file`
            instead.

            Service account private key in JSON format. Can be file path
            or string contents. This is useful for remote server
            authentication (eg. Jupyter/IPython notebook on remote host).

        See Also
        --------
        pandas_gbq.to_gbq : This function in the pandas-gbq library.
        pandas.read_gbq : Read a DataFrame from Google BigQuery.
        """
        from pandas.io import gbq
        return gbq.to_gbq(
            self, destination_table, project_id=project_id,
            chunksize=chunksize, reauth=reauth, if_exists=if_exists,
            auth_local_webserver=auth_local_webserver,
            table_schema=table_schema, location=location,
            progress_bar=progress_bar, credentials=credentials,
            verbose=verbose, private_key=private_key)
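    # Editorial sketch (not part of pandas): a minimal call, assuming the
    # pandas-gbq package is installed and Google credentials are available in
    # the environment. 'my_dataset.new_table' and 'my-project' are placeholder
    # names used only for this sketch.
    @staticmethod
    def _demo_to_gbq():
        import pandas as pd
        df = pd.DataFrame({'num': [1, 2, 3]})
        df.to_gbq('my_dataset.new_table', project_id='my-project',
                  if_exists='append')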
    @classmethod
    def from_records(cls, data, index=None, exclude=None, columns=None,
                     coerce_float=False, nrows=None):
        """
        Convert structured or record ndarray to DataFrame.

        Parameters
        ----------
        data : ndarray (structured dtype), list of tuples, dict, or DataFrame
        index : string, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use
        exclude : sequence, default None
            Columns or fields to exclude
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the columns
            in the result (any names not found in the data will become all-NA
            columns)
        coerce_float : boolean, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets
        nrows : int, default None
            Number of rows to read if data is an iterator

        Returns
        -------
        df : DataFrame
        """
        # Make a copy of the input columns so we can modify it
        if columns is not None:
            columns = ensure_index(columns)

        if is_iterator(data):
            if nrows == 0:
                return cls()

            try:
                first_row = next(data)
            except StopIteration:
                return cls(index=index, columns=columns)

            dtype = None
            if hasattr(first_row, 'dtype') and first_row.dtype.names:
                dtype = first_row.dtype

            values = [first_row]

            if nrows is None:
                values += data
            else:
                values.extend(itertools.islice(data, nrows - 1))

            if dtype is not None:
                data = np.array(values, dtype=dtype)
            else:
                data = values

        if isinstance(data, dict):
            if columns is None:
                columns = arr_columns = ensure_index(sorted(data))
                arrays = [data[k] for k in columns]
            else:
                arrays = []
                arr_columns = []
                for k, v in compat.iteritems(data):
                    if k in columns:
                        arr_columns.append(k)
                        arrays.append(v)

                arrays, arr_columns = reorder_arrays(arrays, arr_columns,
                                                     columns)

        elif isinstance(data, (np.ndarray, DataFrame)):
            arrays, columns = to_arrays(data, columns)
            if columns is not None:
                columns = ensure_index(columns)
            arr_columns = columns
        else:
            arrays, arr_columns = to_arrays(data, columns,
                                            coerce_float=coerce_float)

            arr_columns = ensure_index(arr_columns)
            if columns is not None:
                columns = ensure_index(columns)
            else:
                columns = arr_columns

        if exclude is None:
            exclude = set()
        else:
            exclude = set(exclude)

        result_index = None
        if index is not None:
            if (isinstance(index, compat.string_types) or
                    not hasattr(index, "__iter__")):
                i = columns.get_loc(index)
                exclude.add(index)
                if len(arrays) > 0:
                    result_index = Index(arrays[i], name=index)
                else:
                    result_index = Index([], name=index)
            else:
                try:
                    to_remove = [arr_columns.get_loc(field) for field in index]
                    index_data = [arrays[i] for i in to_remove]
                    result_index = ensure_index_from_sequences(index_data,
                                                               names=index)

                    exclude.update(index)
                except Exception:
                    result_index = index

        if any(exclude):
            arr_exclude = [x for x in exclude if x in arr_columns]
            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]

            arr_columns = arr_columns.drop(arr_exclude)
            columns = columns.drop(exclude)

        mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)

        return cls(mgr)
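    # Editorial sketch (not part of pandas): the docstring above carries no
    # example, so this shows the common structured-array case; the field named
    # by `index` is pulled out of the data and used as the index, and the
    # remaining dtype names become the columns. `_demo_from_records` is a
    # hypothetical helper.
    @staticmethod
    def _demo_from_records():
        import numpy as np
        import pandas as pd
        arr = np.array([(1, 2.0, 'a'), (3, 4.0, 'b')],
                       dtype=[('x', 'i4'), ('y', 'f8'), ('z', 'O')])
        # 'x' becomes the index; columns 'y' and 'z' remain
        return pd.DataFrame.from_records(arr, index='x')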
    def to_records(self, index=True, convert_datetime64=None,
                   column_dtypes=None, index_dtypes=None):
        """
        Convert DataFrame to a NumPy record array.

        Index will be included as the first field of the record array if
        requested.

        Parameters
        ----------
        index : bool, default True
            Include index in resulting record array, stored in 'index'
            field or using the index label, if set.
        convert_datetime64 : bool, default None
            .. deprecated:: 0.23.0

            Whether to convert the index to datetime.datetime if it is a
            DatetimeIndex.
        column_dtypes : str, type, dict, default None
            .. versionadded:: 0.24.0

            If a string or type, the data type to store all columns. If
            a dictionary, a mapping of column names and indices (zero-indexed)
            to specific data types.
        index_dtypes : str, type, dict, default None
            .. versionadded:: 0.24.0

            If a string or type, the data type to store all index levels. If
            a dictionary, a mapping of index level names and indices
            (zero-indexed) to specific data types.

            This mapping is applied only if `index=True`.

        Returns
        -------
        numpy.recarray
            NumPy ndarray with the DataFrame labels as fields and each row
            of the DataFrame as entries.

        See Also
        --------
        DataFrame.from_records: Convert structured or record ndarray
            to DataFrame.
        numpy.recarray: An ndarray that allows field access using
            attributes, analogous to typed columns in a
            spreadsheet.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
        ...                   index=['a', 'b'])
        >>> df
           A     B
        a  1  0.50
        b  2  0.75
        >>> df.to_records()
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])

        If the DataFrame index has no label then the recarray field name
        is set to 'index'. If the index has a label then this is used as the
        field name:

        >>> df.index = df.index.rename("I")
        >>> df.to_records()
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])

        The index can be excluded from the record array:

        >>> df.to_records(index=False)
        rec.array([(1, 0.5 ), (2, 0.75)],
                  dtype=[('A', '<i8'), ('B', '<f8')])

        Data types can be specified for the columns:

        >>> df.to_records(column_dtypes={"A": "int32"})
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])

        As well as for the index:

        >>> df.to_records(index_dtypes="<S2")
        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])

        >>> index_dtypes = "<S{}".format(df.index.str.len().max())
        >>> df.to_records(index_dtypes=index_dtypes)
        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
        """
        if convert_datetime64 is not None:
            warnings.warn("The 'convert_datetime64' parameter is "
                          "deprecated and will be removed in a future "
                          "version",
                          FutureWarning, stacklevel=2)

        if index:
            if is_datetime64_any_dtype(self.index) and convert_datetime64:
                ix_vals = [self.index.to_pydatetime()]
            else:
                if isinstance(self.index, MultiIndex):
                    # array of tuples to numpy cols. copy copy copy
                    ix_vals = lmap(np.array, zip(*self.index.values))
                else:
                    ix_vals = [self.index.values]

            arrays = ix_vals + [self[c].get_values() for c in self.columns]

            count = 0
            index_names = list(self.index.names)

            if isinstance(self.index, MultiIndex):
                for i, n in enumerate(index_names):
                    if n is None:
                        index_names[i] = 'level_%d' % count
                        count += 1
            elif index_names[0] is None:
                index_names = ['index']

            names = (lmap(compat.text_type, index_names) +
                     lmap(compat.text_type, self.columns))
        else:
            arrays = [self[c].get_values() for c in self.columns]
            names = lmap(compat.text_type, self.columns)
            index_names = []

        index_len = len(index_names)
        formats = []

        for i, v in enumerate(arrays):
            index = i

            # When the names and arrays are collected, we
            # first collect those in the DataFrame's index,
            # followed by those in its columns.
            #
            # Thus, the total length of the array is:
            # len(index_names) + len(DataFrame.columns).
            #
            # This check allows us to see whether we are
            # handling a name / array in the index or column.
            if index < index_len:
                dtype_mapping = index_dtypes
                name = index_names[index]
            else:
                index -= index_len
                dtype_mapping = column_dtypes
                name = self.columns[index]

            # We have a dictionary, so we get the data type
            # associated with the index or column (which can
            # be denoted by its name in the DataFrame or its
            # position in DataFrame's array of indices or
            # columns, whichever is applicable).
            if is_dict_like(dtype_mapping):
                if name in dtype_mapping:
                    dtype_mapping = dtype_mapping[name]
                elif index in dtype_mapping:
                    dtype_mapping = dtype_mapping[index]
                else:
                    dtype_mapping = None

            # If no mapping can be found, use the array's
            # dtype attribute for formatting.
            #
            # A valid dtype must either be a type or
            # string naming a type.
            if dtype_mapping is None:
                formats.append(v.dtype)
            elif isinstance(dtype_mapping, (type, compat.string_types)):
                formats.append(dtype_mapping)
            else:
                element = "row" if i < index_len else "column"
                msg = ("Invalid dtype {dtype} specified for "
                       "{element} {name}").format(dtype=dtype_mapping,
                                                  element=element, name=name)
                raise ValueError(msg)

        return np.rec.fromarrays(
            arrays,
            dtype={'names': names, 'formats': formats}
        )
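    # Editorial sketch (not part of pandas): `column_dtypes` also accepts
    # zero-indexed positions, per the dict lookup above that falls back from
    # name to position. `_demo_to_records_positional` is a hypothetical
    # helper added only for illustration.
    @staticmethod
    def _demo_to_records_positional():
        import pandas as pd
        df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]})
        # column 0 is 'A'; its dtype is narrowed to 16-bit integers
        return df.to_records(index=False, column_dtypes={0: '<i2'})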
    @classmethod
    def from_items(cls, items, columns=None, orient='columns'):
        """
        Construct a DataFrame from a list of tuples.

        .. deprecated:: 0.23.0
            `from_items` is deprecated and will be removed in a future version.
            Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
            instead.
            :meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
            may be used to preserve the key order.

        Convert (key, value) pairs to DataFrame. The keys will be the axis
        index (usually the columns, but depends on the specified
        orientation). The values should be arrays or Series.

        Parameters
        ----------
        items : sequence of (key, value) pairs
            Values should be arrays or Series.
        columns : sequence of column labels, optional
            Must be passed if orient='index'.
        orient : {'columns', 'index'}, default 'columns'
            The "orientation" of the data. If the keys of the
            input correspond to column labels, pass 'columns'
            (default). Otherwise if the keys correspond to the index,
            pass 'index'.

        Returns
        -------
        frame : DataFrame
        """

        warnings.warn("from_items is deprecated. Please use "
                      "DataFrame.from_dict(dict(items), ...) instead. "
                      "DataFrame.from_dict(OrderedDict(items)) may be used to "
                      "preserve the key order.",
                      FutureWarning, stacklevel=2)

        keys, values = lzip(*items)

        if orient == 'columns':
            if columns is not None:
                columns = ensure_index(columns)

                idict = dict(items)
                if len(idict) < len(items):
                    if not columns.equals(ensure_index(keys)):
                        raise ValueError('With non-unique item names, passed '
                                         'columns must be identical')
                    arrays = values
                else:
                    arrays = [idict[k] for k in columns if k in idict]
            else:
                columns = ensure_index(keys)
                arrays = values

            # GH 17312
            # Provide more informative error msg when scalar values passed
            try:
                return cls._from_arrays(arrays, columns, None)

            except ValueError:
                if not is_nested_list_like(values):
                    raise ValueError('The value in each (key, value) pair '
                                     'must be an array, Series, or dict')

        elif orient == 'index':
            if columns is None:
                raise TypeError("Must pass columns with orient='index'")

            keys = ensure_index(keys)

            # GH 17312
            # Provide more informative error msg when scalar values passed
            try:
                arr = np.array(values, dtype=object).T
                data = [lib.maybe_convert_objects(v) for v in arr]
                return cls._from_arrays(data, columns, keys)

            except TypeError:
                if not is_nested_list_like(values):
                    raise ValueError('The value in each (key, value) pair '
                                     'must be an array, Series, or dict')

        else:  # pragma: no cover
            raise ValueError("'orient' must be either 'columns' or 'index'")

    @classmethod
    def _from_arrays(cls, arrays, columns, index, dtype=None):
        mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
        return cls(mgr)
    @classmethod
    def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,
                 encoding=None, tupleize_cols=None,
                 infer_datetime_format=False):
        """
        Read CSV file.

        .. deprecated:: 0.21.0
            Use :func:`pandas.read_csv` instead.

        It is preferable to use the more powerful :func:`pandas.read_csv`
        for most general purposes, but ``from_csv`` makes for an easy
        roundtrip to and from a file (the exact counterpart of
        ``to_csv``), especially with a DataFrame of time series data.

        This method only differs from the preferred :func:`pandas.read_csv`
        in some defaults:

        - `index_col` is ``0`` instead of ``None`` (take first column as index
          by default)
        - `parse_dates` is ``True`` instead of ``False`` (try parsing the index
          as datetime by default)

        So a ``pd.DataFrame.from_csv(path)`` can be replaced by
        ``pd.read_csv(path, index_col=0, parse_dates=True)``.

        Parameters
        ----------
        path : string file path or file handle / StringIO
        header : int, default 0
            Row to use as header (skip prior rows)
        sep : string, default ','
            Field delimiter
        index_col : int or sequence, default 0
            Column to use for index. If a sequence is given, a MultiIndex
            is used. Different default from read_table
        parse_dates : boolean, default True
            Parse dates. Different default from read_table
        tupleize_cols : boolean, default False
            Write multi_index columns as a list of tuples (if True) or in
            the new, expanded format (if False)
        infer_datetime_format : boolean, default False
            If True and `parse_dates` is True for a column, try to infer the
            datetime format based on the first datetime string. If the format
            can be inferred, there often will be a large parsing speed-up.

        Returns
        -------
        y : DataFrame

        See Also
        --------
        pandas.read_csv
        """

        warnings.warn("from_csv is deprecated. Please use read_csv(...) "
                      "instead. Note that some of the default arguments are "
                      "different, so please refer to the documentation "
                      "for from_csv when changing your function calls",
                      FutureWarning, stacklevel=2)

        from pandas.io.parsers import read_csv
        return read_csv(path, header=header, sep=sep,
                        parse_dates=parse_dates, index_col=index_col,
                        encoding=encoding, tupleize_cols=tupleize_cols,
                        infer_datetime_format=infer_datetime_format)
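    # Editorial sketch (not part of pandas): the drop-in replacement spelled
    # out in the deprecation note above. `path` is any CSV file path;
    # `_demo_read_csv_equivalent` is a hypothetical helper.
    @staticmethod
    def _demo_read_csv_equivalent(path):
        import pandas as pd
        # same defaults from_csv used: first column as index, dates parsed
        return pd.read_csv(path, index_col=0, parse_dates=True)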
    def to_sparse(self, fill_value=None, kind='block'):
        """
        Convert to SparseDataFrame.

        Implement the sparse version of the DataFrame meaning that any data
        matching a specific value is omitted in the representation.
        The sparse DataFrame allows for more efficient storage.

        Parameters
        ----------
        fill_value : float, default None
            The specific value that should be omitted in the representation.
        kind : {'block', 'integer'}, default 'block'
            The kind of the SparseIndex tracking where data is not equal to
            the fill value:

            - 'block' tracks only the locations and sizes of blocks of data.
            - 'integer' keeps an array with all the locations of the data.

            In most cases 'block' is recommended, since it's more memory
            efficient.

        Returns
        -------
        SparseDataFrame
            The sparse representation of the DataFrame.

        See Also
        --------
        DataFrame.to_dense :
            Converts the DataFrame back to its dense form.

        Examples
        --------
        >>> df = pd.DataFrame([(np.nan, np.nan),
        ...                    (1., np.nan),
        ...                    (np.nan, 1.)])
        >>> df
             0    1
        0  NaN  NaN
        1  1.0  NaN
        2  NaN  1.0
        >>> type(df)
        <class 'pandas.core.frame.DataFrame'>

        >>> sdf = df.to_sparse()
        >>> sdf
             0    1
        0  NaN  NaN
        1  1.0  NaN
        2  NaN  1.0
        >>> type(sdf)
        <class 'pandas.core.sparse.frame.SparseDataFrame'>
        """
        from pandas.core.sparse.api import SparseDataFrame
        return SparseDataFrame(self._series, index=self.index,
                               columns=self.columns, default_kind=kind,
                               default_fill_value=fill_value)
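    # Editorial sketch (not part of pandas): only values different from
    # `fill_value` (NaN by default) are stored, which the `density` attribute
    # of the resulting SparseDataFrame makes visible.
    # `_demo_to_sparse_density` is a hypothetical helper.
    @staticmethod
    def _demo_to_sparse_density():
        import numpy as np
        import pandas as pd
        df = pd.DataFrame({'a': [np.nan, 1.0, np.nan, np.nan]})
        sdf = df.to_sparse()
        return sdf.density  # 0.25: one stored value out of four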
    def to_panel(self):
        """
        Transform long (stacked) format (DataFrame) into wide (3D, Panel)
        format.

        .. deprecated:: 0.20.0

        Currently the index of the DataFrame must be a 2-level MultiIndex. This
        may be generalized later.

        Returns
        -------
        panel : Panel
        """
        # only support this kind for now
        if (not isinstance(self.index, MultiIndex) or  # pragma: no cover
                len(self.index.levels) != 2):
            raise NotImplementedError('Only 2-level MultiIndex are supported.')

        if not self.index.is_unique:
            raise ValueError("Can't convert non-uniquely indexed "
                             "DataFrame to Panel")

        self._consolidate_inplace()

        # minor axis must be sorted
        if self.index.lexsort_depth < 2:
            selfsorted = self.sort_index(level=0)
        else:
            selfsorted = self

        major_axis, minor_axis = selfsorted.index.levels
        major_codes, minor_codes = selfsorted.index.codes
        shape = len(major_axis), len(minor_axis)

        # preserve names, if any
        major_axis = major_axis.copy()
        major_axis.name = self.index.names[0]

        minor_axis = minor_axis.copy()
        minor_axis.name = self.index.names[1]

        # create new axes
        new_axes = [selfsorted.columns, major_axis, minor_axis]

        # create new manager
        new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
                                              labels=[major_codes,
                                                      minor_codes],
                                              shape=shape,
                                              ref_items=selfsorted.columns)

        return self._constructor_expanddim(new_mgr)
    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
    def to_stata(self, fname, convert_dates=None, write_index=True,
                 encoding="latin-1", byteorder=None, time_stamp=None,
                 data_label=None, variable_labels=None, version=114,
                 convert_strl=None):
        """
        Export DataFrame object to Stata dta format.

        Writes the DataFrame to a Stata dataset file.
        "dta" files contain a Stata dataset.

        Parameters
        ----------
        fname : str, buffer or path object
            String, path object (pathlib.Path or py._path.local.LocalPath) or
            object implementing a binary write() function. If using a buffer
            then the buffer will not be automatically closed after the file
            data has been written.
        convert_dates : dict
            Dictionary mapping columns containing datetime types to stata
            internal format to use when writing the dates. Options are 'tc',
            'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
            or a name. Datetime columns that do not have a conversion type
            specified will be converted to 'tc'. Raises NotImplementedError if
            a datetime column has timezone information.
        write_index : bool
            Write the index to Stata dataset.
        encoding : str
            Default is latin-1. Unicode is not supported.
        byteorder : str
            Can be ">", "<", "little", or "big". default is `sys.byteorder`.
        time_stamp : datetime
            A datetime to use as file creation date. Default is the current
            time.
        data_label : str, optional
            A label for the data set. Must be 80 characters or smaller.
        variable_labels : dict
            Dictionary containing columns as keys and variable labels as
            values. Each label must be 80 characters or smaller.

            .. versionadded:: 0.19.0
        version : {114, 117}, default 114
            Version to use in the output dta file. Version 114 can be read
            by Stata 10 and later. Version 117 can be read by Stata 13 or
            later. Version 114 limits string variables to 244 characters or
            fewer while 117 allows strings with lengths up to 2,000,000
            characters.

            .. versionadded:: 0.23.0
        convert_strl : list, optional
            List of column names to convert to string columns to Stata StrL
            format. Only available if version is 117. Storing strings in the
            StrL format can produce smaller dta files if strings have more than
            8 characters and values are repeated.

            .. versionadded:: 0.23.0

        Raises
        ------
        NotImplementedError
            * If datetimes contain timezone information
            * Column dtype is not representable in Stata
        ValueError
            * Columns listed in convert_dates are neither datetime64[ns]
              nor datetime.datetime
            * Column listed in convert_dates is not in DataFrame
            * Categorical label contains more than 32,000 characters

            .. versionadded:: 0.19.0

        See Also
        --------
        read_stata : Import Stata data files.
        io.stata.StataWriter : Low-level writer for Stata data files.
        io.stata.StataWriter117 : Low-level writer for version 117 files.

        Examples
        --------
        >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
        ...                               'parrot'],
        ...                    'speed': [350, 18, 361, 15]})
        >>> df.to_stata('animals.dta')  # doctest: +SKIP
        """
        kwargs = {}
        if version not in (114, 117):
            raise ValueError('Only formats 114 and 117 supported.')
        if version == 114:
            if convert_strl is not None:
                raise ValueError('strl support is only available when using '
                                 'format 117')
            from pandas.io.stata import StataWriter as statawriter
        else:
            from pandas.io.stata import StataWriter117 as statawriter
            kwargs['convert_strl'] = convert_strl

        writer = statawriter(fname, self, convert_dates=convert_dates,
                             byteorder=byteorder, time_stamp=time_stamp,
                             data_label=data_label, write_index=write_index,
                             variable_labels=variable_labels, **kwargs)
        writer.write_file()
    def to_feather(self, fname):
        """
        Write out the binary feather-format for DataFrames.

        .. versionadded:: 0.20.0

        Parameters
        ----------
        fname : str
            string file path
        """
        from pandas.io.feather_format import to_feather
        to_feather(self, fname)
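    # Editorial sketch (not part of pandas): the docstring above has no
    # example; a round trip assumes the pyarrow-backed feather support is
    # installed. 'demo.feather' is a placeholder path and
    # `_demo_feather_roundtrip` is a hypothetical helper.
    @staticmethod
    def _demo_feather_roundtrip():
        import pandas as pd
        df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
        df.to_feather('demo.feather')
        return pd.read_feather('demo.feather')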
    def to_parquet(self, fname, engine='auto', compression='snappy',
                   index=None, partition_cols=None, **kwargs):
        """
        Write a DataFrame to the binary parquet format.

        .. versionadded:: 0.21.0

        This function writes the dataframe as a `parquet file
        <https://parquet.apache.org/>`_. You can choose different parquet
        backends, and have the option of compression. See
        :ref:`the user guide <io.parquet>` for more details.

        Parameters
        ----------
        fname : str
            File path or Root Directory path. Will be used as Root Directory
            path while writing a partitioned dataset.

            .. versionchanged:: 0.24.0
        engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
            Parquet library to use. If 'auto', then the option
            ``io.parquet.engine`` is used. The default ``io.parquet.engine``
            behavior is to try 'pyarrow', falling back to 'fastparquet' if
            'pyarrow' is unavailable.
        compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
            Name of the compression to use. Use ``None`` for no compression.
        index : bool, default None
            If ``True``, include the dataframe's index(es) in the file output.
            If ``False``, they will not be written to the file. If ``None``,
            the behavior depends on the chosen engine.

            .. versionadded:: 0.24.0
        partition_cols : list, optional, default None
            Column names by which to partition the dataset.
            Columns are partitioned in the order they are given.

            .. versionadded:: 0.24.0
        **kwargs
            Additional arguments passed to the parquet library. See
            :ref:`pandas io <io.parquet>` for more details.

        See Also
        --------
        read_parquet : Read a parquet file.
        DataFrame.to_csv : Write a csv file.
        DataFrame.to_sql : Write to a sql table.
        DataFrame.to_hdf : Write to hdf.

        Notes
        -----
        This function requires either the `fastparquet
        <https://pypi.org/project/fastparquet>`_ or `pyarrow
        <https://arrow.apache.org/docs/python/>`_ library.

        Examples
        --------
        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
        >>> df.to_parquet('df.parquet.gzip',
        ...               compression='gzip')  # doctest: +SKIP
        >>> pd.read_parquet('df.parquet.gzip')  # doctest: +SKIP
           col1  col2
        0     1     3
        1     2     4
        """
        from pandas.io.parquet import to_parquet
        to_parquet(self, fname, engine,
                   compression=compression, index=index,
                   partition_cols=partition_cols, **kwargs)
    @Substitution(header='Whether to print column labels, default True')
    @Substitution(shared_params=fmt.common_docstring,
                  returns=fmt.return_docstring)
    def to_html(self, buf=None, columns=None, col_space=None, header=True,
                index=True, na_rep='NaN', formatters=None, float_format=None,
                sparsify=None, index_names=True, justify=None, max_rows=None,
                max_cols=None, show_dimensions=False, decimal='.',
                bold_rows=True, classes=None, escape=True, notebook=False,
                border=None, table_id=None, render_links=False):
        """
        Render a DataFrame as an HTML table.
        %(shared_params)s
        bold_rows : bool, default True
            Make the row labels bold in the output.
        classes : str or list or tuple, default None
            CSS class(es) to apply to the resulting html table.
        escape : bool, default True
            Convert the characters <, >, and & to HTML-safe sequences.
        notebook : {True, False}, default False
            Whether the generated HTML is for IPython Notebook.
        border : int
            A ``border=border`` attribute is included in the opening
            `<table>` tag. Default ``pd.options.html.border``.

            .. versionadded:: 0.19.0
        table_id : str, optional
            A css id is included in the opening `<table>` tag if specified.

            .. versionadded:: 0.23.0
        render_links : bool, default False
            Convert URLs to HTML links.

            .. versionadded:: 0.24.0
        %(returns)s
        See Also
        --------
        to_string : Convert DataFrame to a string.
        """
        if (justify is not None and
                justify not in fmt._VALID_JUSTIFY_PARAMETERS):
            raise ValueError("Invalid value for justify parameter")

        formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
                                           col_space=col_space, na_rep=na_rep,
                                           formatters=formatters,
                                           float_format=float_format,
                                           sparsify=sparsify, justify=justify,
                                           index_names=index_names,
                                           header=header, index=index,
                                           bold_rows=bold_rows, escape=escape,
                                           max_rows=max_rows,
                                           max_cols=max_cols,
                                           show_dimensions=show_dimensions,
                                           decimal=decimal, table_id=table_id,
                                           render_links=render_links)
        # TODO: a generic formatter would be in DataFrameFormatter
        formatter.to_html(classes=classes, notebook=notebook, border=border)

        if buf is None:
            return formatter.buf.getvalue()
  1915. # ----------------------------------------------------------------------
  1916. def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
  1917. null_counts=None):
  1918. """
  1919. Print a concise summary of a DataFrame.
  1920. This method prints information about a DataFrame including
  1921. the index dtype and column dtypes, non-null values and memory usage.
  1922. Parameters
  1923. ----------
  1924. verbose : bool, optional
  1925. Whether to print the full summary. By default, the setting in
  1926. ``pandas.options.display.max_info_columns`` is followed.
  1927. buf : writable buffer, defaults to sys.stdout
  1928. Where to send the output. By default, the output is printed to
  1929. sys.stdout. Pass a writable buffer if you need to further process
  1930. the output.
  1931. max_cols : int, optional
  1932. When to switch from the verbose to the truncated output. If the
  1933. DataFrame has more than `max_cols` columns, the truncated output
  1934. is used. By default, the setting in
  1935. ``pandas.options.display.max_info_columns`` is used.
  1936. memory_usage : bool, str, optional
  1937. Specifies whether total memory usage of the DataFrame
  1938. elements (including the index) should be displayed. By default,
  1939. this follows the ``pandas.options.display.memory_usage`` setting.
  1940. True always show memory usage. False never shows memory usage.
  1941. A value of 'deep' is equivalent to "True with deep introspection".
  1942. Memory usage is shown in human-readable units (base-2
  1943. representation). Without deep introspection a memory estimation is
  1944. made based in column dtype and number of rows assuming values
  1945. consume the same memory amount for corresponding dtypes. With deep
  1946. memory introspection, a real memory usage calculation is performed
  1947. at the cost of computational resources.
  1948. null_counts : bool, optional
  1949. Whether to show the non-null counts. By default, this is shown
  1950. only if the frame is smaller than
  1951. ``pandas.options.display.max_info_rows`` and
  1952. ``pandas.options.display.max_info_columns``. A value of True always
  1953. shows the counts, and False never shows the counts.
  1954. Returns
  1955. -------
  1956. None
  1957. This method prints a summary of a DataFrame and returns None.
  1958. See Also
  1959. --------
  1960. DataFrame.describe: Generate descriptive statistics of DataFrame
  1961. columns.
  1962. DataFrame.memory_usage: Memory usage of DataFrame columns.
  1963. Examples
  1964. --------
  1965. >>> int_values = [1, 2, 3, 4, 5]
  1966. >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
  1967. >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
  1968. >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
  1969. ... "float_col": float_values})
  1970. >>> df
  1971. int_col text_col float_col
  1972. 0 1 alpha 0.00
  1973. 1 2 beta 0.25
  1974. 2 3 gamma 0.50
  1975. 3 4 delta 0.75
  1976. 4 5 epsilon 1.00
  1977. Prints information of all columns:
  1978. >>> df.info(verbose=True)
  1979. <class 'pandas.core.frame.DataFrame'>
  1980. RangeIndex: 5 entries, 0 to 4
  1981. Data columns (total 3 columns):
  1982. int_col 5 non-null int64
  1983. text_col 5 non-null object
  1984. float_col 5 non-null float64
  1985. dtypes: float64(1), int64(1), object(1)
  1986. memory usage: 200.0+ bytes
  1987. Prints a summary of columns count and its dtypes but not per column
  1988. information:
  1989. >>> df.info(verbose=False)
  1990. <class 'pandas.core.frame.DataFrame'>
  1991. RangeIndex: 5 entries, 0 to 4
  1992. Columns: 3 entries, int_col to float_col
  1993. dtypes: float64(1), int64(1), object(1)
  1994. memory usage: 200.0+ bytes
  1995. Pipe output of DataFrame.info to buffer instead of sys.stdout, get
  1996. buffer content and writes to a text file:
  1997. >>> import io
  1998. >>> buffer = io.StringIO()
  1999. >>> df.info(buf=buffer)
  2000. >>> s = buffer.getvalue()
  2001. >>> with open("df_info.txt", "w",
  2002. ... encoding="utf-8") as f: # doctest: +SKIP
  2003. ... f.write(s)
  2004. 260
  2005. The `memory_usage` parameter allows deep introspection mode, specially
  2006. useful for big DataFrames and fine-tune memory optimization:
  2007. >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
  2008. >>> df = pd.DataFrame({
  2009. ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  2010. ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  2011. ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
  2012. ... })
  2013. >>> df.info()
  2014. <class 'pandas.core.frame.DataFrame'>
  2015. RangeIndex: 1000000 entries, 0 to 999999
  2016. Data columns (total 3 columns):
  2017. column_1 1000000 non-null object
  2018. column_2 1000000 non-null object
  2019. column_3 1000000 non-null object
  2020. dtypes: object(3)
  2021. memory usage: 22.9+ MB
  2022. >>> df.info(memory_usage='deep')
  2023. <class 'pandas.core.frame.DataFrame'>
  2024. RangeIndex: 1000000 entries, 0 to 999999
  2025. Data columns (total 3 columns):
  2026. column_1 1000000 non-null object
  2027. column_2 1000000 non-null object
  2028. column_3 1000000 non-null object
  2029. dtypes: object(3)
  2030. memory usage: 188.8 MB
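Per-column non-null counts can also be suppressed explicitly with
``null_counts``, which can help on very wide frames (a minimal sketch;
output omitted):

>>> df.info(null_counts=False)  # doctest: +SKIP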
  2031. """
  2032. if buf is None: # pragma: no cover
  2033. buf = sys.stdout
  2034. lines = []
  2035. lines.append(str(type(self)))
  2036. lines.append(self.index._summary())
  2037. if len(self.columns) == 0:
  2038. lines.append('Empty {name}'.format(name=type(self).__name__))
  2039. fmt.buffer_put_lines(buf, lines)
  2040. return
  2041. cols = self.columns
2042. # hack: fall back to the display options when max_cols is not given
  2043. if max_cols is None:
  2044. max_cols = get_option('display.max_info_columns',
  2045. len(self.columns) + 1)
  2046. max_rows = get_option('display.max_info_rows', len(self) + 1)
  2047. if null_counts is None:
  2048. show_counts = ((len(self.columns) <= max_cols) and
  2049. (len(self) < max_rows))
  2050. else:
  2051. show_counts = null_counts
  2052. exceeds_info_cols = len(self.columns) > max_cols
  2053. def _verbose_repr():
  2054. lines.append('Data columns (total %d columns):' %
  2055. len(self.columns))
  2056. space = max(len(pprint_thing(k)) for k in self.columns) + 4
  2057. counts = None
  2058. tmpl = "{count}{dtype}"
  2059. if show_counts:
  2060. counts = self.count()
  2061. if len(cols) != len(counts): # pragma: no cover
  2062. raise AssertionError(
  2063. 'Columns must equal counts '
  2064. '({cols:d} != {counts:d})'.format(
  2065. cols=len(cols), counts=len(counts)))
  2066. tmpl = "{count} non-null {dtype}"
  2067. dtypes = self.dtypes
  2068. for i, col in enumerate(self.columns):
  2069. dtype = dtypes.iloc[i]
  2070. col = pprint_thing(col)
  2071. count = ""
  2072. if show_counts:
  2073. count = counts.iloc[i]
  2074. lines.append(_put_str(col, space) + tmpl.format(count=count,
  2075. dtype=dtype))
  2076. def _non_verbose_repr():
  2077. lines.append(self.columns._summary(name='Columns'))
2078. def _sizeof_fmt(num, size_qualifier):
2079. # returns size in human-readable format, e.g. 1048576 -> '1.0 MB'
  2080. for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
  2081. if num < 1024.0:
  2082. return ("{num:3.1f}{size_q} "
  2083. "{x}".format(num=num, size_q=size_qualifier, x=x))
  2084. num /= 1024.0
  2085. return "{num:3.1f}{size_q} {pb}".format(num=num,
  2086. size_q=size_qualifier,
  2087. pb='PB')
  2088. if verbose:
  2089. _verbose_repr()
2090. elif verbose is False: # specifically set to False, not necessarily None
  2091. _non_verbose_repr()
  2092. else:
  2093. if exceeds_info_cols:
  2094. _non_verbose_repr()
  2095. else:
  2096. _verbose_repr()
  2097. counts = self.get_dtype_counts()
  2098. dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
  2099. in sorted(compat.iteritems(counts))]
  2100. lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
  2101. if memory_usage is None:
  2102. memory_usage = get_option('display.memory_usage')
  2103. if memory_usage:
  2104. # append memory usage of df to display
  2105. size_qualifier = ''
  2106. if memory_usage == 'deep':
  2107. deep = True
  2108. else:
  2109. # size_qualifier is just a best effort; not guaranteed to catch
  2110. # all cases (e.g., it misses categorical data even with object
  2111. # categories)
  2112. deep = False
  2113. if ('object' in counts or
  2114. self.index._is_memory_usage_qualified()):
  2115. size_qualifier = '+'
  2116. mem_usage = self.memory_usage(index=True, deep=deep).sum()
  2117. lines.append("memory usage: {mem}\n".format(
  2118. mem=_sizeof_fmt(mem_usage, size_qualifier)))
  2119. fmt.buffer_put_lines(buf, lines)
  2120. def memory_usage(self, index=True, deep=False):
  2121. """
  2122. Return the memory usage of each column in bytes.
  2123. The memory usage can optionally include the contribution of
  2124. the index and elements of `object` dtype.
  2125. This value is displayed in `DataFrame.info` by default. This can be
  2126. suppressed by setting ``pandas.options.display.memory_usage`` to False.
  2127. Parameters
  2128. ----------
  2129. index : bool, default True
  2130. Specifies whether to include the memory usage of the DataFrame's
2131. index in the returned Series. If ``index=True``, the memory usage of
2132. the index is the first item in the output.
  2133. deep : bool, default False
  2134. If True, introspect the data deeply by interrogating
  2135. `object` dtypes for system-level memory consumption, and include
  2136. it in the returned values.
  2137. Returns
  2138. -------
  2139. sizes : Series
  2140. A Series whose index is the original column names and whose values
2141. are the memory usage of each column in bytes.
  2142. See Also
  2143. --------
  2144. numpy.ndarray.nbytes : Total bytes consumed by the elements of an
  2145. ndarray.
  2146. Series.memory_usage : Bytes consumed by a Series.
  2147. pandas.Categorical : Memory-efficient array for string values with
  2148. many repeated values.
  2149. DataFrame.info : Concise summary of a DataFrame.
  2150. Examples
  2151. --------
  2152. >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
  2153. >>> data = dict([(t, np.ones(shape=5000).astype(t))
  2154. ... for t in dtypes])
  2155. >>> df = pd.DataFrame(data)
  2156. >>> df.head()
  2157. int64 float64 complex128 object bool
  2158. 0 1 1.0 (1+0j) 1 True
  2159. 1 1 1.0 (1+0j) 1 True
  2160. 2 1 1.0 (1+0j) 1 True
  2161. 3 1 1.0 (1+0j) 1 True
  2162. 4 1 1.0 (1+0j) 1 True
  2163. >>> df.memory_usage()
  2164. Index 80
  2165. int64 40000
  2166. float64 40000
  2167. complex128 80000
  2168. object 40000
  2169. bool 5000
  2170. dtype: int64
  2171. >>> df.memory_usage(index=False)
  2172. int64 40000
  2173. float64 40000
  2174. complex128 80000
  2175. object 40000
  2176. bool 5000
  2177. dtype: int64
  2178. The memory footprint of `object` dtype columns is ignored by default:
  2179. >>> df.memory_usage(deep=True)
  2180. Index 80
  2181. int64 40000
  2182. float64 40000
  2183. complex128 80000
  2184. object 160000
  2185. bool 5000
  2186. dtype: int64
  2187. Use a Categorical for efficient storage of an object-dtype column with
  2188. many repeated values.
  2189. >>> df['object'].astype('category').memory_usage(deep=True)
  2190. 5168
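The total footprint is the sum over all columns (the values here simply
sum the deep-introspection example shown above):

>>> df.memory_usage(deep=True).sum()
325080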
  2191. """
  2192. result = Series([c.memory_usage(index=False, deep=deep)
  2193. for col, c in self.iteritems()], index=self.columns)
  2194. if index:
  2195. result = Series(self.index.memory_usage(deep=deep),
  2196. index=['Index']).append(result)
  2197. return result
  2198. def transpose(self, *args, **kwargs):
  2199. """
  2200. Transpose index and columns.
  2201. Reflect the DataFrame over its main diagonal by writing rows as columns
  2202. and vice-versa. The property :attr:`.T` is an accessor to the method
  2203. :meth:`transpose`.
  2204. Parameters
  2205. ----------
  2206. copy : bool, default False
  2207. If True, the underlying data is copied. Otherwise (default), no
  2208. copy is made if possible.
  2209. *args, **kwargs
  2210. Additional keywords have no effect but might be accepted for
  2211. compatibility with numpy.
  2212. Returns
  2213. -------
  2214. DataFrame
  2215. The transposed DataFrame.
  2216. See Also
  2217. --------
  2218. numpy.transpose : Permute the dimensions of a given array.
  2219. Notes
  2220. -----
  2221. Transposing a DataFrame with mixed dtypes will result in a homogeneous
  2222. DataFrame with the `object` dtype. In such a case, a copy of the data
  2223. is always made.
  2224. Examples
  2225. --------
  2226. **Square DataFrame with homogeneous dtype**
  2227. >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
  2228. >>> df1 = pd.DataFrame(data=d1)
  2229. >>> df1
  2230. col1 col2
  2231. 0 1 3
  2232. 1 2 4
  2233. >>> df1_transposed = df1.T # or df1.transpose()
  2234. >>> df1_transposed
  2235. 0 1
  2236. col1 1 2
  2237. col2 3 4
  2238. When the dtype is homogeneous in the original DataFrame, we get a
  2239. transposed DataFrame with the same dtype:
  2240. >>> df1.dtypes
  2241. col1 int64
  2242. col2 int64
  2243. dtype: object
  2244. >>> df1_transposed.dtypes
  2245. 0 int64
  2246. 1 int64
  2247. dtype: object
  2248. **Non-square DataFrame with mixed dtypes**
  2249. >>> d2 = {'name': ['Alice', 'Bob'],
  2250. ... 'score': [9.5, 8],
  2251. ... 'employed': [False, True],
  2252. ... 'kids': [0, 0]}
  2253. >>> df2 = pd.DataFrame(data=d2)
  2254. >>> df2
  2255. name score employed kids
  2256. 0 Alice 9.5 False 0
  2257. 1 Bob 8.0 True 0
  2258. >>> df2_transposed = df2.T # or df2.transpose()
  2259. >>> df2_transposed
  2260. 0 1
  2261. name Alice Bob
  2262. score 9.5 8
  2263. employed False True
  2264. kids 0 0
  2265. When the DataFrame has mixed dtypes, we get a transposed DataFrame with
  2266. the `object` dtype:
  2267. >>> df2.dtypes
  2268. name object
  2269. score float64
  2270. employed bool
  2271. kids int64
  2272. dtype: object
  2273. >>> df2_transposed.dtypes
  2274. 0 object
  2275. 1 object
  2276. dtype: object
  2277. """
  2278. nv.validate_transpose(args, dict())
  2279. return super(DataFrame, self).transpose(1, 0, **kwargs)
  2280. T = property(transpose)
  2281. # ----------------------------------------------------------------------
  2282. # Picklability
  2283. # legacy pickle formats
  2284. def _unpickle_frame_compat(self, state): # pragma: no cover
  2285. if len(state) == 2: # pragma: no cover
  2286. series, idx = state
  2287. columns = sorted(series)
  2288. else:
  2289. series, cols, idx = state
  2290. columns = com._unpickle_array(cols)
  2291. index = com._unpickle_array(idx)
  2292. self._data = self._init_dict(series, index, columns, None)
  2293. def _unpickle_matrix_compat(self, state): # pragma: no cover
  2294. # old unpickling
  2295. (vals, idx, cols), object_state = state
  2296. index = com._unpickle_array(idx)
  2297. dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
  2298. copy=False)
  2299. if object_state is not None:
  2300. ovals, _, ocols = object_state
  2301. objects = DataFrame(ovals, index=index,
  2302. columns=com._unpickle_array(ocols), copy=False)
  2303. dm = dm.join(objects)
  2304. self._data = dm._data
  2305. # ----------------------------------------------------------------------
  2306. # Getting and setting elements
  2307. def get_value(self, index, col, takeable=False):
  2308. """
  2309. Quickly retrieve single value at passed column and index.
  2310. .. deprecated:: 0.21.0
  2311. Use .at[] or .iat[] accessors instead.
  2312. Parameters
  2313. ----------
  2314. index : row label
  2315. col : column label
2316. takeable : bool, default False; interpret the index/col as indexers
  2317. Returns
  2318. -------
  2319. value : scalar value
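Examples
--------
Prefer the ``.at`` accessor instead (a minimal sketch with hypothetical
data):

>>> df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
>>> df.at['y', 'A']
2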
  2320. """
  2321. warnings.warn("get_value is deprecated and will be removed "
  2322. "in a future release. Please use "
  2323. ".at[] or .iat[] accessors instead", FutureWarning,
  2324. stacklevel=2)
  2325. return self._get_value(index, col, takeable=takeable)
  2326. def _get_value(self, index, col, takeable=False):
  2327. if takeable:
  2328. series = self._iget_item_cache(col)
  2329. return com.maybe_box_datetimelike(series._values[index])
  2330. series = self._get_item_cache(col)
  2331. engine = self.index._engine
  2332. try:
  2333. return engine.get_value(series._values, index)
  2334. except (TypeError, ValueError):
  2335. # we cannot handle direct indexing
  2336. # use positional
  2337. col = self.columns.get_loc(col)
  2338. index = self.index.get_loc(index)
  2339. return self._get_value(index, col, takeable=True)
  2340. _get_value.__doc__ = get_value.__doc__
  2341. def set_value(self, index, col, value, takeable=False):
  2342. """
  2343. Put single value at passed column and index.
  2344. .. deprecated:: 0.21.0
  2345. Use .at[] or .iat[] accessors instead.
  2346. Parameters
  2347. ----------
  2348. index : row label
  2349. col : column label
  2350. value : scalar value
2351. takeable : bool, default False; interpret the index/col as indexers
  2352. Returns
  2353. -------
  2354. frame : DataFrame
  2355. If label pair is contained, will be reference to calling DataFrame,
  2356. otherwise a new object
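Examples
--------
Prefer the ``.at`` accessor instead (a minimal sketch with hypothetical
data):

>>> df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
>>> df.at['y', 'A'] = 10
>>> df.at['y', 'A']
10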
  2357. """
  2358. warnings.warn("set_value is deprecated and will be removed "
  2359. "in a future release. Please use "
  2360. ".at[] or .iat[] accessors instead", FutureWarning,
  2361. stacklevel=2)
  2362. return self._set_value(index, col, value, takeable=takeable)
  2363. def _set_value(self, index, col, value, takeable=False):
  2364. try:
  2365. if takeable is True:
  2366. series = self._iget_item_cache(col)
  2367. return series._set_value(index, value, takeable=True)
  2368. series = self._get_item_cache(col)
  2369. engine = self.index._engine
  2370. engine.set_value(series._values, index, value)
  2371. return self
  2372. except (KeyError, TypeError):
  2373. # set using a non-recursive method & reset the cache
  2374. if takeable:
  2375. self.iloc[index, col] = value
  2376. else:
  2377. self.loc[index, col] = value
  2378. self._item_cache.pop(col, None)
  2379. return self
  2380. _set_value.__doc__ = set_value.__doc__
  2381. def _ixs(self, i, axis=0):
  2382. """
  2383. Parameters
  2384. ----------
  2385. i : int, slice, or sequence of integers
  2386. axis : int
  2387. Notes
  2388. -----
  2389. If slice passed, the resulting data will be a view.
  2390. """
  2391. # irow
  2392. if axis == 0:
  2393. if isinstance(i, slice):
  2394. return self[i]
  2395. else:
  2396. label = self.index[i]
  2397. if isinstance(label, Index):
  2398. # a location index by definition
  2399. result = self.take(i, axis=axis)
  2400. copy = True
  2401. else:
  2402. new_values = self._data.fast_xs(i)
  2403. if is_scalar(new_values):
  2404. return new_values
  2405. # if we are a copy, mark as such
  2406. copy = (isinstance(new_values, np.ndarray) and
  2407. new_values.base is None)
  2408. result = self._constructor_sliced(new_values,
  2409. index=self.columns,
  2410. name=self.index[i],
  2411. dtype=new_values.dtype)
  2412. result._set_is_copy(self, copy=copy)
  2413. return result
  2414. # icol
  2415. else:
  2416. label = self.columns[i]
  2417. if isinstance(i, slice):
  2418. # need to return view
  2419. lab_slice = slice(label[0], label[-1])
  2420. return self.loc[:, lab_slice]
  2421. else:
  2422. if isinstance(label, Index):
  2423. return self._take(i, axis=1)
  2424. index_len = len(self.index)
  2425. # if the values returned are not the same length
  2426. # as the index (iow a not found value), iget returns
  2427. # a 0-len ndarray. This is effectively catching
  2428. # a numpy error (as numpy should really raise)
  2429. values = self._data.iget(i)
  2430. if index_len and not len(values):
  2431. values = np.array([np.nan] * index_len, dtype=object)
  2432. result = self._box_col_values(values, label)
  2433. # this is a cached value, mark it so
  2434. result._set_as_cached(label, self)
  2435. return result
  2436. def __getitem__(self, key):
  2437. key = com.apply_if_callable(key, self)
  2438. # shortcut if the key is in columns
  2439. try:
  2440. if self.columns.is_unique and key in self.columns:
  2441. if self.columns.nlevels > 1:
  2442. return self._getitem_multilevel(key)
  2443. return self._get_item_cache(key)
  2444. except (TypeError, ValueError):
  2445. # The TypeError correctly catches non hashable "key" (e.g. list)
  2446. # The ValueError can be removed once GH #21729 is fixed
  2447. pass
  2448. # Do we have a slicer (on rows)?
  2449. indexer = convert_to_index_sliceable(self, key)
  2450. if indexer is not None:
  2451. return self._slice(indexer, axis=0)
  2452. # Do we have a (boolean) DataFrame?
  2453. if isinstance(key, DataFrame):
  2454. return self._getitem_frame(key)
  2455. # Do we have a (boolean) 1d indexer?
  2456. if com.is_bool_indexer(key):
  2457. return self._getitem_bool_array(key)
  2458. # We are left with two options: a single key, and a collection of keys,
  2459. # We interpret tuples as collections only for non-MultiIndex
  2460. is_single_key = isinstance(key, tuple) or not is_list_like(key)
  2461. if is_single_key:
  2462. if self.columns.nlevels > 1:
  2463. return self._getitem_multilevel(key)
  2464. indexer = self.columns.get_loc(key)
  2465. if is_integer(indexer):
  2466. indexer = [indexer]
  2467. else:
  2468. if is_iterator(key):
  2469. key = list(key)
  2470. indexer = self.loc._convert_to_indexer(key, axis=1,
  2471. raise_missing=True)
  2472. # take() does not accept boolean indexers
  2473. if getattr(indexer, "dtype", None) == bool:
  2474. indexer = np.where(indexer)[0]
  2475. data = self._take(indexer, axis=1)
  2476. if is_single_key:
  2477. # What does looking for a single key in a non-unique index return?
  2478. # The behavior is inconsistent. It returns a Series, except when
  2479. # - the key itself is repeated (test on data.shape, #9519), or
  2480. # - we have a MultiIndex on columns (test on self.columns, #21309)
  2481. if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
  2482. data = data[key]
  2483. return data
  2484. def _getitem_bool_array(self, key):
  2485. # also raises Exception if object array with NA values
  2486. # warning here just in case -- previously __setitem__ was
  2487. # reindexing but __getitem__ was not; it seems more reasonable to
  2488. # go with the __setitem__ behavior since that is more consistent
  2489. # with all other indexing behavior
  2490. if isinstance(key, Series) and not key.index.equals(self.index):
  2491. warnings.warn("Boolean Series key will be reindexed to match "
  2492. "DataFrame index.", UserWarning, stacklevel=3)
  2493. elif len(key) != len(self.index):
  2494. raise ValueError('Item wrong length %d instead of %d.' %
  2495. (len(key), len(self.index)))
  2496. # check_bool_indexer will throw exception if Series key cannot
  2497. # be reindexed to match DataFrame rows
  2498. key = check_bool_indexer(self.index, key)
  2499. indexer = key.nonzero()[0]
  2500. return self._take(indexer, axis=0)
  2501. def _getitem_multilevel(self, key):
  2502. loc = self.columns.get_loc(key)
  2503. if isinstance(loc, (slice, Series, np.ndarray, Index)):
  2504. new_columns = self.columns[loc]
  2505. result_columns = maybe_droplevels(new_columns, key)
  2506. if self._is_mixed_type:
  2507. result = self.reindex(columns=new_columns)
  2508. result.columns = result_columns
  2509. else:
  2510. new_values = self.values[:, loc]
  2511. result = self._constructor(new_values, index=self.index,
  2512. columns=result_columns)
  2513. result = result.__finalize__(self)
  2514. # If there is only one column being returned, and its name is
  2515. # either an empty string, or a tuple with an empty string as its
  2516. # first element, then treat the empty string as a placeholder
  2517. # and return the column as if the user had provided that empty
  2518. # string in the key. If the result is a Series, exclude the
  2519. # implied empty string from its name.
  2520. if len(result.columns) == 1:
  2521. top = result.columns[0]
  2522. if isinstance(top, tuple):
  2523. top = top[0]
  2524. if top == '':
  2525. result = result['']
  2526. if isinstance(result, Series):
  2527. result = self._constructor_sliced(result,
  2528. index=self.index,
  2529. name=key)
  2530. result._set_is_copy(self)
  2531. return result
  2532. else:
  2533. return self._get_item_cache(key)
  2534. def _getitem_frame(self, key):
  2535. if key.values.size and not is_bool_dtype(key.values):
  2536. raise ValueError('Must pass DataFrame with boolean values only')
  2537. return self.where(key)
  2538. def query(self, expr, inplace=False, **kwargs):
  2539. """
  2540. Query the columns of a DataFrame with a boolean expression.
  2541. Parameters
  2542. ----------
  2543. expr : string
  2544. The query string to evaluate. You can refer to variables
  2545. in the environment by prefixing them with an '@' character like
  2546. ``@a + b``.
  2547. inplace : bool
  2548. Whether the query should modify the data in place or return
2549. a modified copy.
  2550. .. versionadded:: 0.18.0
  2551. kwargs : dict
  2552. See the documentation for :func:`pandas.eval` for complete details
  2553. on the keyword arguments accepted by :meth:`DataFrame.query`.
  2554. Returns
  2555. -------
  2556. q : DataFrame
  2557. See Also
  2558. --------
  2559. pandas.eval
  2560. DataFrame.eval
  2561. Notes
  2562. -----
  2563. The result of the evaluation of this expression is first passed to
  2564. :attr:`DataFrame.loc` and if that fails because of a
  2565. multidimensional key (e.g., a DataFrame) then the result will be passed
  2566. to :meth:`DataFrame.__getitem__`.
  2567. This method uses the top-level :func:`pandas.eval` function to
  2568. evaluate the passed query.
  2569. The :meth:`~pandas.DataFrame.query` method uses a slightly
  2570. modified Python syntax by default. For example, the ``&`` and ``|``
  2571. (bitwise) operators have the precedence of their boolean cousins,
  2572. :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
2573. however, the semantics are different.
  2574. You can change the semantics of the expression by passing the keyword
  2575. argument ``parser='python'``. This enforces the same semantics as
  2576. evaluation in Python space. Likewise, you can pass ``engine='python'``
  2577. to evaluate an expression using Python itself as a backend. This is not
  2578. recommended as it is inefficient compared to using ``numexpr`` as the
  2579. engine.
  2580. The :attr:`DataFrame.index` and
  2581. :attr:`DataFrame.columns` attributes of the
  2582. :class:`~pandas.DataFrame` instance are placed in the query namespace
  2583. by default, which allows you to treat both the index and columns of the
  2584. frame as a column in the frame.
  2585. The identifier ``index`` is used for the frame index; you can also
  2586. use the name of the index to identify it in a query. Please note that
  2587. Python keywords may not be used as identifiers.
  2588. For further details and examples see the ``query`` documentation in
  2589. :ref:`indexing <indexing.query>`.
  2590. Examples
  2591. --------
  2592. >>> df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab'))
  2593. >>> df.query('a > b')
  2594. >>> df[df.a > df.b] # same result as the previous expression
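A deterministic sketch (hypothetical data, not part of the original
example) showing an environment variable referenced with ``@``:

>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 2, 1]})
>>> threshold = 1
>>> df.query('a > b and a > @threshold')
   a  b
2  3  1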
  2595. """
  2596. inplace = validate_bool_kwarg(inplace, 'inplace')
  2597. if not isinstance(expr, compat.string_types):
  2598. msg = "expr must be a string to be evaluated, {0} given"
  2599. raise ValueError(msg.format(type(expr)))
  2600. kwargs['level'] = kwargs.pop('level', 0) + 1
  2601. kwargs['target'] = None
  2602. res = self.eval(expr, **kwargs)
  2603. try:
  2604. new_data = self.loc[res]
  2605. except ValueError:
  2606. # when res is multi-dimensional loc raises, but this is sometimes a
  2607. # valid query
  2608. new_data = self[res]
  2609. if inplace:
  2610. self._update_inplace(new_data)
  2611. else:
  2612. return new_data
  2613. def eval(self, expr, inplace=False, **kwargs):
  2614. """
  2615. Evaluate a string describing operations on DataFrame columns.
  2616. Operates on columns only, not specific rows or elements. This allows
  2617. `eval` to run arbitrary code, which can make you vulnerable to code
  2618. injection if you pass user input to this function.
  2619. Parameters
  2620. ----------
  2621. expr : str
  2622. The expression string to evaluate.
  2623. inplace : bool, default False
  2624. If the expression contains an assignment, whether to perform the
  2625. operation inplace and mutate the existing DataFrame. Otherwise,
  2626. a new DataFrame is returned.
2627. .. versionadded:: 0.18.0
  2628. kwargs : dict
  2629. See the documentation for :func:`~pandas.eval` for complete details
  2630. on the keyword arguments accepted by
  2631. :meth:`~pandas.DataFrame.query`.
  2632. Returns
  2633. -------
  2634. ndarray, scalar, or pandas object
  2635. The result of the evaluation.
  2636. See Also
  2637. --------
  2638. DataFrame.query : Evaluates a boolean expression to query the columns
  2639. of a frame.
  2640. DataFrame.assign : Can evaluate an expression or function to create new
  2641. values for a column.
  2642. pandas.eval : Evaluate a Python expression as a string using various
  2643. backends.
  2644. Notes
  2645. -----
  2646. For more details see the API documentation for :func:`~pandas.eval`.
  2647. For detailed examples see :ref:`enhancing performance with eval
  2648. <enhancingperf.eval>`.
  2649. Examples
  2650. --------
  2651. >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
  2652. >>> df
  2653. A B
  2654. 0 1 10
  2655. 1 2 8
  2656. 2 3 6
  2657. 3 4 4
  2658. 4 5 2
  2659. >>> df.eval('A + B')
  2660. 0 11
  2661. 1 10
  2662. 2 9
  2663. 3 8
  2664. 4 7
  2665. dtype: int64
  2666. Assignment is allowed though by default the original DataFrame is not
  2667. modified.
  2668. >>> df.eval('C = A + B')
  2669. A B C
  2670. 0 1 10 11
  2671. 1 2 8 10
  2672. 2 3 6 9
  2673. 3 4 4 8
  2674. 4 5 2 7
  2675. >>> df
  2676. A B
  2677. 0 1 10
  2678. 1 2 8
  2679. 2 3 6
  2680. 3 4 4
  2681. 4 5 2
  2682. Use ``inplace=True`` to modify the original DataFrame.
  2683. >>> df.eval('C = A + B', inplace=True)
  2684. >>> df
  2685. A B C
  2686. 0 1 10 11
  2687. 1 2 8 10
  2688. 2 3 6 9
  2689. 3 4 4 8
  2690. 4 5 2 7
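As with :meth:`DataFrame.query`, local variables can be referenced by
prefixing them with ``@`` (a sketch continuing the frame above):

>>> bonus = 100
>>> df.eval('D = C + @bonus')
   A   B   C    D
0  1  10  11  111
1  2   8  10  110
2  3   6   9  109
3  4   4   8  108
4  5   2   7  107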
  2691. """
  2692. from pandas.core.computation.eval import eval as _eval
  2693. inplace = validate_bool_kwarg(inplace, 'inplace')
  2694. resolvers = kwargs.pop('resolvers', None)
  2695. kwargs['level'] = kwargs.pop('level', 0) + 1
  2696. if resolvers is None:
  2697. index_resolvers = self._get_index_resolvers()
  2698. resolvers = dict(self.iteritems()), index_resolvers
  2699. if 'target' not in kwargs:
  2700. kwargs['target'] = self
  2701. kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
  2702. return _eval(expr, inplace=inplace, **kwargs)
  2703. def select_dtypes(self, include=None, exclude=None):
  2704. """
  2705. Return a subset of the DataFrame's columns based on the column dtypes.
  2706. Parameters
  2707. ----------
  2708. include, exclude : scalar or list-like
  2709. A selection of dtypes or strings to be included/excluded. At least
  2710. one of these parameters must be supplied.
  2711. Returns
  2712. -------
  2713. subset : DataFrame
  2714. The subset of the frame including the dtypes in ``include`` and
  2715. excluding the dtypes in ``exclude``.
  2716. Raises
  2717. ------
  2718. ValueError
  2719. * If both of ``include`` and ``exclude`` are empty
  2720. * If ``include`` and ``exclude`` have overlapping elements
  2721. * If any kind of string dtype is passed in.
  2722. Notes
  2723. -----
  2724. * To select all *numeric* types, use ``np.number`` or ``'number'``
  2725. * To select strings you must use the ``object`` dtype, but note that
  2726. this will return *all* object dtype columns
  2727. * See the `numpy dtype hierarchy
  2728. <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
  2729. * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
  2730. ``'datetime64'``
  2731. * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
  2732. ``'timedelta64'``
  2733. * To select Pandas categorical dtypes, use ``'category'``
  2734. * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
  2735. 0.20.0) or ``'datetime64[ns, tz]'``
  2736. Examples
  2737. --------
  2738. >>> df = pd.DataFrame({'a': [1, 2] * 3,
  2739. ... 'b': [True, False] * 3,
  2740. ... 'c': [1.0, 2.0] * 3})
  2741. >>> df
  2742. a b c
  2743. 0 1 True 1.0
  2744. 1 2 False 2.0
  2745. 2 1 True 1.0
  2746. 3 2 False 2.0
  2747. 4 1 True 1.0
  2748. 5 2 False 2.0
  2749. >>> df.select_dtypes(include='bool')
  2750. b
  2751. 0 True
  2752. 1 False
  2753. 2 True
  2754. 3 False
  2755. 4 True
  2756. 5 False
  2757. >>> df.select_dtypes(include=['float64'])
  2758. c
  2759. 0 1.0
  2760. 1 2.0
  2761. 2 1.0
  2762. 3 2.0
  2763. 4 1.0
  2764. 5 2.0
  2765. >>> df.select_dtypes(exclude=['int'])
  2766. b c
  2767. 0 True 1.0
  2768. 1 False 2.0
  2769. 2 True 1.0
  2770. 3 False 2.0
  2771. 4 True 1.0
  2772. 5 False 2.0
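Datetime-like columns can be selected with ``'datetime'`` (a sketch
with hypothetical data):

>>> df2 = pd.DataFrame({'dates': pd.date_range('2018-01-01', periods=2),
...                     'vals': [1, 2]})
>>> df2.select_dtypes(include='datetime')
       dates
0 2018-01-01
1 2018-01-02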
  2773. """
  2774. def _get_info_slice(obj, indexer):
  2775. """Slice the info axis of `obj` with `indexer`."""
  2776. if not hasattr(obj, '_info_axis_number'):
  2777. msg = 'object of type {typ!r} has no info axis'
  2778. raise TypeError(msg.format(typ=type(obj).__name__))
  2779. slices = [slice(None)] * obj.ndim
  2780. slices[obj._info_axis_number] = indexer
  2781. return tuple(slices)
  2782. if not is_list_like(include):
  2783. include = (include,) if include is not None else ()
  2784. if not is_list_like(exclude):
  2785. exclude = (exclude,) if exclude is not None else ()
  2786. selection = tuple(map(frozenset, (include, exclude)))
  2787. if not any(selection):
  2788. raise ValueError('at least one of include or exclude must be '
  2789. 'nonempty')
  2790. # convert the myriad valid dtypes object to a single representation
  2791. include, exclude = map(
  2792. lambda x: frozenset(map(infer_dtype_from_object, x)), selection)
  2793. for dtypes in (include, exclude):
  2794. invalidate_string_dtypes(dtypes)
  2795. # can't both include AND exclude!
  2796. if not include.isdisjoint(exclude):
  2797. raise ValueError('include and exclude overlap on {inc_ex}'.format(
  2798. inc_ex=(include & exclude)))
  2799. # empty include/exclude -> defaults to True
  2800. # three cases (we've already raised if both are empty)
  2801. # case 1: empty include, nonempty exclude
  2802. # we have True, True, ... True for include, same for exclude
  2803. # in the loop below we get the excluded
  2804. # and when we call '&' below we get only the excluded
  2805. # case 2: nonempty include, empty exclude
  2806. # same as case 1, but with include
  2807. # case 3: both nonempty
  2808. # the "union" of the logic of case 1 and case 2:
  2809. # we get the included and excluded, and return their logical and
  2810. include_these = Series(not bool(include), index=self.columns)
  2811. exclude_these = Series(not bool(exclude), index=self.columns)
  2812. def is_dtype_instance_mapper(idx, dtype):
  2813. return idx, functools.partial(issubclass, dtype.type)
  2814. for idx, f in itertools.starmap(is_dtype_instance_mapper,
  2815. enumerate(self.dtypes)):
  2816. if include: # checks for the case of empty include or exclude
  2817. include_these.iloc[idx] = any(map(f, include))
  2818. if exclude:
  2819. exclude_these.iloc[idx] = not any(map(f, exclude))
  2820. dtype_indexer = include_these & exclude_these
  2821. return self.loc[_get_info_slice(self, dtype_indexer)]
  2822. def _box_item_values(self, key, values):
  2823. items = self.columns[self.columns.get_loc(key)]
  2824. if values.ndim == 2:
  2825. return self._constructor(values.T, columns=items, index=self.index)
  2826. else:
  2827. return self._box_col_values(values, items)
  2828. def _box_col_values(self, values, items):
  2829. """
  2830. Provide boxed values for a column.
  2831. """
  2832. klass = self._constructor_sliced
  2833. return klass(values, index=self.index, name=items, fastpath=True)
  2834. def __setitem__(self, key, value):
  2835. key = com.apply_if_callable(key, self)
  2836. # see if we can slice the rows
  2837. indexer = convert_to_index_sliceable(self, key)
  2838. if indexer is not None:
  2839. return self._setitem_slice(indexer, value)
  2840. if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
  2841. self._setitem_frame(key, value)
  2842. elif isinstance(key, (Series, np.ndarray, list, Index)):
  2843. self._setitem_array(key, value)
  2844. else:
  2845. # set column
  2846. self._set_item(key, value)
  2847. def _setitem_slice(self, key, value):
  2848. self._check_setitem_copy()
  2849. self.loc._setitem_with_indexer(key, value)
  2850. def _setitem_array(self, key, value):
  2851. # also raises Exception if object array with NA values
  2852. if com.is_bool_indexer(key):
  2853. if len(key) != len(self.index):
  2854. raise ValueError('Item wrong length %d instead of %d!' %
  2855. (len(key), len(self.index)))
  2856. key = check_bool_indexer(self.index, key)
  2857. indexer = key.nonzero()[0]
  2858. self._check_setitem_copy()
  2859. self.loc._setitem_with_indexer(indexer, value)
  2860. else:
  2861. if isinstance(value, DataFrame):
  2862. if len(value.columns) != len(key):
  2863. raise ValueError('Columns must be same length as key')
  2864. for k1, k2 in zip(key, value.columns):
  2865. self[k1] = value[k2]
  2866. else:
  2867. indexer = self.loc._convert_to_indexer(key, axis=1)
  2868. self._check_setitem_copy()
  2869. self.loc._setitem_with_indexer((slice(None), indexer), value)
  2870. def _setitem_frame(self, key, value):
  2871. # support boolean setting with DataFrame input, e.g.
  2872. # df[df > df2] = 0
  2873. if isinstance(key, np.ndarray):
  2874. if key.shape != self.shape:
  2875. raise ValueError(
  2876. 'Array conditional must be same shape as self'
  2877. )
  2878. key = self._constructor(key, **self._construct_axes_dict())
  2879. if key.values.size and not is_bool_dtype(key.values):
  2880. raise TypeError(
  2881. 'Must pass DataFrame or 2-d ndarray with boolean values only'
  2882. )
  2883. self._check_inplace_setting(value)
  2884. self._check_setitem_copy()
  2885. self._where(-key, value, inplace=True)
  2886. def _ensure_valid_index(self, value):
  2887. """
  2888. Ensure that if we don't have an index, that we can create one from the
  2889. passed value.
  2890. """
  2891. # GH5632, make sure that we are a Series convertible
  2892. if not len(self.index) and is_list_like(value):
  2893. try:
  2894. value = Series(value)
  2895. except (ValueError, NotImplementedError, TypeError):
  2896. raise ValueError('Cannot set a frame with no defined index '
  2897. 'and a value that cannot be converted to a '
  2898. 'Series')
  2899. self._data = self._data.reindex_axis(value.index.copy(), axis=1,
  2900. fill_value=np.nan)
  2901. def _set_item(self, key, value):
  2902. """
  2903. Add series to DataFrame in specified column.
2904. If value is a numpy array (not a Series), it must be the
2905. same length as the DataFrame's index or an error will be raised.
2906. A Series will be conformed to the DataFrame's index to
2907. ensure homogeneity.
  2908. """
  2909. self._ensure_valid_index(value)
  2910. value = self._sanitize_column(key, value)
  2911. NDFrame._set_item(self, key, value)
  2912. # check if we are modifying a copy
  2913. # try to set first as we want an invalid
  2914. # value exception to occur first
  2915. if len(self):
  2916. self._check_setitem_copy()
  2917. def insert(self, loc, column, value, allow_duplicates=False):
  2918. """
  2919. Insert column into DataFrame at specified location.
  2920. Raises a ValueError if `column` is already contained in the DataFrame,
  2921. unless `allow_duplicates` is set to True.
  2922. Parameters
  2923. ----------
  2924. loc : int
2925. Insertion index. Must satisfy 0 <= loc <= len(columns).
2926. column : string, number, or hashable object
2927. Label of the inserted column.
  2928. value : int, Series, or array-like
  2929. allow_duplicates : bool, optional
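Examples
--------
A minimal sketch (hypothetical data):

>>> df = pd.DataFrame({'a': [1, 2], 'c': [5, 6]})
>>> df.insert(1, 'b', [3, 4])
>>> df
   a  b  c
0  1  3  5
1  2  4  6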
  2930. """
  2931. self._ensure_valid_index(value)
  2932. value = self._sanitize_column(column, value, broadcast=False)
  2933. self._data.insert(loc, column, value,
  2934. allow_duplicates=allow_duplicates)
  2935. def assign(self, **kwargs):
  2936. r"""
  2937. Assign new columns to a DataFrame.
  2938. Returns a new object with all original columns in addition to new ones.
  2939. Existing columns that are re-assigned will be overwritten.
  2940. Parameters
  2941. ----------
  2942. **kwargs : dict of {str: callable or Series}
  2943. The column names are keywords. If the values are
  2944. callable, they are computed on the DataFrame and
  2945. assigned to the new columns. The callable must not
2946. change the input DataFrame (though pandas doesn't check it).
  2947. If the values are not callable, (e.g. a Series, scalar, or array),
  2948. they are simply assigned.
  2949. Returns
  2950. -------
  2951. DataFrame
  2952. A new DataFrame with the new columns in addition to
  2953. all the existing columns.
  2954. Notes
  2955. -----
  2956. Assigning multiple columns within the same ``assign`` is possible.
  2957. For Python 3.6 and above, later items in '\*\*kwargs' may refer to
  2958. newly created or modified columns in 'df'; items are computed and
  2959. assigned into 'df' in order. For Python 3.5 and below, the order of
2960. keyword arguments is not specified, so you cannot refer to newly created
  2961. or modified columns. All items are computed first, and then assigned
  2962. in alphabetical order.
2963. .. versionchanged:: 0.23.0
  2964. Keyword argument order is maintained for Python 3.6 and later.
  2965. Examples
  2966. --------
  2967. >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
  2968. ... index=['Portland', 'Berkeley'])
  2969. >>> df
  2970. temp_c
  2971. Portland 17.0
  2972. Berkeley 25.0
  2973. Where the value is a callable, evaluated on `df`:
  2974. >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
  2975. temp_c temp_f
  2976. Portland 17.0 62.6
  2977. Berkeley 25.0 77.0
  2978. Alternatively, the same behavior can be achieved by directly
  2979. referencing an existing Series or sequence:
  2980. >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
  2981. temp_c temp_f
  2982. Portland 17.0 62.6
  2983. Berkeley 25.0 77.0
  2984. In Python 3.6+, you can create multiple columns within the same assign
  2985. where one of the columns depends on another one defined within the same
  2986. assign:
  2987. >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
  2988. ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
  2989. temp_c temp_f temp_k
  2990. Portland 17.0 62.6 290.15
  2991. Berkeley 25.0 77.0 298.15
  2992. """
  2993. data = self.copy()
  2994. # >= 3.6 preserve order of kwargs
  2995. if PY36:
  2996. for k, v in kwargs.items():
  2997. data[k] = com.apply_if_callable(v, data)
  2998. else:
  2999. # <= 3.5: do all calculations first...
  3000. results = OrderedDict()
  3001. for k, v in kwargs.items():
  3002. results[k] = com.apply_if_callable(v, data)
3003. # sort keys alphabetically for a deterministic assignment order
  3004. results = sorted(results.items())
  3005. # ... and then assign
  3006. for k, v in results:
  3007. data[k] = v
  3008. return data
  3009. def _sanitize_column(self, key, value, broadcast=True):
  3010. """
  3011. Ensures new columns (which go into the BlockManager as new blocks) are
  3012. always copied and converted into an array.
  3013. Parameters
  3014. ----------
  3015. key : object
  3016. value : scalar, Series, or array-like
  3017. broadcast : bool, default True
  3018. If ``key`` matches multiple duplicate column names in the
  3019. DataFrame, this parameter indicates whether ``value`` should be
  3020. tiled so that the returned array contains a (duplicated) column for
  3021. each occurrence of the key. If False, ``value`` will not be tiled.
  3022. Returns
  3023. -------
  3024. sanitized_column : numpy-array
  3025. """
  3026. def reindexer(value):
  3027. # reindex if necessary
  3028. if value.index.equals(self.index) or not len(self.index):
  3029. value = value._values.copy()
  3030. else:
  3031. # GH 4107
  3032. try:
  3033. value = value.reindex(self.index)._values
  3034. except Exception as e:
  3035. # duplicate axis
  3036. if not value.index.is_unique:
  3037. raise e
  3038. # other
  3039. raise TypeError('incompatible index of inserted column '
  3040. 'with frame index')
  3041. return value
  3042. if isinstance(value, Series):
  3043. value = reindexer(value)
  3044. elif isinstance(value, DataFrame):
  3045. # align right-hand-side columns if self.columns
  3046. # is multi-index and self[key] is a sub-frame
  3047. if isinstance(self.columns, MultiIndex) and key in self.columns:
  3048. loc = self.columns.get_loc(key)
  3049. if isinstance(loc, (slice, Series, np.ndarray, Index)):
  3050. cols = maybe_droplevels(self.columns[loc], key)
  3051. if len(cols) and not cols.equals(value.columns):
  3052. value = value.reindex(cols, axis=1)
  3053. # now align rows
  3054. value = reindexer(value).T
  3055. elif isinstance(value, ExtensionArray):
  3056. # Explicitly copy here, instead of in sanitize_index,
  3057. # as sanitize_index won't copy an EA, even with copy=True
  3058. value = value.copy()
  3059. value = sanitize_index(value, self.index, copy=False)
  3060. elif isinstance(value, Index) or is_sequence(value):
  3061. # turn me into an ndarray
  3062. value = sanitize_index(value, self.index, copy=False)
  3063. if not isinstance(value, (np.ndarray, Index)):
  3064. if isinstance(value, list) and len(value) > 0:
  3065. value = maybe_convert_platform(value)
  3066. else:
  3067. value = com.asarray_tuplesafe(value)
  3068. elif value.ndim == 2:
  3069. value = value.copy().T
  3070. elif isinstance(value, Index):
  3071. value = value.copy(deep=True)
  3072. else:
  3073. value = value.copy()
  3074. # possibly infer to datetimelike
  3075. if is_object_dtype(value.dtype):
  3076. value = maybe_infer_to_datetimelike(value)
  3077. else:
  3078. # cast ignores pandas dtypes. so save the dtype first
  3079. infer_dtype, _ = infer_dtype_from_scalar(
  3080. value, pandas_dtype=True)
  3081. # upcast
  3082. value = cast_scalar_to_array(len(self.index), value)
  3083. value = maybe_cast_to_datetime(value, infer_dtype)
  3084. # return internal types directly
  3085. if is_extension_type(value) or is_extension_array_dtype(value):
  3086. return value
  3087. # broadcast across multiple columns if necessary
  3088. if broadcast and key in self.columns and value.ndim == 1:
  3089. if (not self.columns.is_unique or
  3090. isinstance(self.columns, MultiIndex)):
  3091. existing_piece = self[key]
  3092. if isinstance(existing_piece, DataFrame):
  3093. value = np.tile(value, (len(existing_piece.columns), 1))
  3094. return np.atleast_2d(np.asarray(value))
  3095. @property
  3096. def _series(self):
  3097. return {item: Series(self._data.iget(idx), index=self.index, name=item)
  3098. for idx, item in enumerate(self.columns)}
  3099. def lookup(self, row_labels, col_labels):
  3100. """
  3101. Label-based "fancy indexing" function for DataFrame.
  3102. Given equal-length arrays of row and column labels, return an
  3103. array of the values corresponding to each (row, col) pair.
  3104. Parameters
  3105. ----------
  3106. row_labels : sequence
  3107. The row labels to use for lookup
  3108. col_labels : sequence
  3109. The column labels to use for lookup
  3110. Notes
  3111. -----
  3112. Akin to::
  3113. result = [df.get_value(row, col)
  3114. for row, col in zip(row_labels, col_labels)]
3115. Returns
3116. -------
3117. values : ndarray
3118. The found values.
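Examples
--------
A minimal sketch (hypothetical data):

>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 20, 30]},
...                   index=['x', 'y', 'z'])
>>> df.lookup(['x', 'z'], ['A', 'B'])
array([ 1, 30])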
  3119. """
  3120. n = len(row_labels)
  3121. if n != len(col_labels):
  3122. raise ValueError('Row labels must have same size as column labels')
  3123. thresh = 1000
  3124. if not self._is_mixed_type or n > thresh:
  3125. values = self.values
  3126. ridx = self.index.get_indexer(row_labels)
  3127. cidx = self.columns.get_indexer(col_labels)
  3128. if (ridx == -1).any():
  3129. raise KeyError('One or more row labels was not found')
  3130. if (cidx == -1).any():
  3131. raise KeyError('One or more column labels was not found')
  3132. flat_index = ridx * len(self.columns) + cidx
  3133. result = values.flat[flat_index]
  3134. else:
  3135. result = np.empty(n, dtype='O')
  3136. for i, (r, c) in enumerate(zip(row_labels, col_labels)):
  3137. result[i] = self._get_value(r, c)
  3138. if is_object_dtype(result):
  3139. result = lib.maybe_convert_objects(result)
  3140. return result
  3141. # ----------------------------------------------------------------------
  3142. # Reindexing and alignment
  3143. def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
  3144. copy):
  3145. frame = self
  3146. columns = axes['columns']
  3147. if columns is not None:
  3148. frame = frame._reindex_columns(columns, method, copy, level,
  3149. fill_value, limit, tolerance)
  3150. index = axes['index']
  3151. if index is not None:
  3152. frame = frame._reindex_index(index, method, copy, level,
  3153. fill_value, limit, tolerance)
  3154. return frame
  3155. def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
  3156. limit=None, tolerance=None):
  3157. new_index, indexer = self.index.reindex(new_index, method=method,
  3158. level=level, limit=limit,
  3159. tolerance=tolerance)
  3160. return self._reindex_with_indexers({0: [new_index, indexer]},
  3161. copy=copy, fill_value=fill_value,
  3162. allow_dups=False)
  3163. def _reindex_columns(self, new_columns, method, copy, level,
  3164. fill_value=None, limit=None, tolerance=None):
  3165. new_columns, indexer = self.columns.reindex(new_columns, method=method,
  3166. level=level, limit=limit,
  3167. tolerance=tolerance)
  3168. return self._reindex_with_indexers({1: [new_columns, indexer]},
  3169. copy=copy, fill_value=fill_value,
  3170. allow_dups=False)
  3171. def _reindex_multi(self, axes, copy, fill_value):
  3172. """
  3173. We are guaranteed non-Nones in the axes.
  3174. """
  3175. new_index, row_indexer = self.index.reindex(axes['index'])
  3176. new_columns, col_indexer = self.columns.reindex(axes['columns'])
  3177. if row_indexer is not None and col_indexer is not None:
  3178. indexer = row_indexer, col_indexer
  3179. new_values = algorithms.take_2d_multi(self.values, indexer,
  3180. fill_value=fill_value)
  3181. return self._constructor(new_values, index=new_index,
  3182. columns=new_columns)
  3183. else:
  3184. return self._reindex_with_indexers({0: [new_index, row_indexer],
  3185. 1: [new_columns, col_indexer]},
  3186. copy=copy,
  3187. fill_value=fill_value)
  3188. @Appender(_shared_docs['align'] % _shared_doc_kwargs)
  3189. def align(self, other, join='outer', axis=None, level=None, copy=True,
  3190. fill_value=None, method=None, limit=None, fill_axis=0,
  3191. broadcast_axis=None):
  3192. return super(DataFrame, self).align(other, join=join, axis=axis,
  3193. level=level, copy=copy,
  3194. fill_value=fill_value,
  3195. method=method, limit=limit,
  3196. fill_axis=fill_axis,
  3197. broadcast_axis=broadcast_axis)
  3198. @Substitution(**_shared_doc_kwargs)
  3199. @Appender(NDFrame.reindex.__doc__)
  3200. @rewrite_axis_style_signature('labels', [('method', None),
  3201. ('copy', True),
  3202. ('level', None),
  3203. ('fill_value', np.nan),
  3204. ('limit', None),
  3205. ('tolerance', None)])
  3206. def reindex(self, *args, **kwargs):
  3207. axes = validate_axis_style_args(self, args, kwargs, 'labels',
  3208. 'reindex')
  3209. kwargs.update(axes)
  3210. # Pop these, since the values are in `kwargs` under different names
  3211. kwargs.pop('axis', None)
  3212. kwargs.pop('labels', None)
  3213. return super(DataFrame, self).reindex(**kwargs)
  3214. @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
  3215. def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
  3216. limit=None, fill_value=np.nan):
  3217. return super(DataFrame,
  3218. self).reindex_axis(labels=labels, axis=axis,
  3219. method=method, level=level, copy=copy,
  3220. limit=limit, fill_value=fill_value)
  3221. def drop(self, labels=None, axis=0, index=None, columns=None,
  3222. level=None, inplace=False, errors='raise'):
  3223. """
  3224. Drop specified labels from rows or columns.
  3225. Remove rows or columns by specifying label names and corresponding
  3226. axis, or by specifying directly index or column names. When using a
  3227. multi-index, labels on different levels can be removed by specifying
  3228. the level.
  3229. Parameters
  3230. ----------
  3231. labels : single label or list-like
  3232. Index or column labels to drop.
  3233. axis : {0 or 'index', 1 or 'columns'}, default 0
  3234. Whether to drop labels from the index (0 or 'index') or
  3235. columns (1 or 'columns').
  3236. index, columns : single label or list-like
  3237. Alternative to specifying axis (``labels, axis=1``
  3238. is equivalent to ``columns=labels``).
  3239. .. versionadded:: 0.21.0
  3240. level : int or level name, optional
  3241. For MultiIndex, level from which the labels will be removed.
  3242. inplace : bool, default False
  3243. If True, do operation inplace and return None.
  3244. errors : {'ignore', 'raise'}, default 'raise'
  3245. If 'ignore', suppress error and only existing labels are
  3246. dropped.
  3247. Returns
  3248. -------
  3249. dropped : pandas.DataFrame
  3250. Raises
  3251. ------
  3252. KeyError
  3253. If none of the labels are found in the selected axis
  3254. See Also
  3255. --------
  3256. DataFrame.loc : Label-location based indexer for selection by label.
  3257. DataFrame.dropna : Return DataFrame with labels on given axis omitted
  3258. where (all or any) data are missing.
  3259. DataFrame.drop_duplicates : Return DataFrame with duplicate rows
  3260. removed, optionally only considering certain columns.
  3261. Series.drop : Return Series with specified index labels removed.
  3262. Examples
  3263. --------
  3264. >>> df = pd.DataFrame(np.arange(12).reshape(3,4),
  3265. ... columns=['A', 'B', 'C', 'D'])
  3266. >>> df
  3267. A B C D
  3268. 0 0 1 2 3
  3269. 1 4 5 6 7
  3270. 2 8 9 10 11
  3271. Drop columns
  3272. >>> df.drop(['B', 'C'], axis=1)
  3273. A D
  3274. 0 0 3
  3275. 1 4 7
  3276. 2 8 11
  3277. >>> df.drop(columns=['B', 'C'])
  3278. A D
  3279. 0 0 3
  3280. 1 4 7
  3281. 2 8 11
  3282. Drop a row by index
  3283. >>> df.drop([0, 1])
  3284. A B C D
  3285. 2 8 9 10 11
  3286. Drop columns and/or rows of MultiIndex DataFrame
  3287. >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
  3288. ... ['speed', 'weight', 'length']],
  3289. ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
  3290. ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
  3291. >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
  3292. ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
  3293. ... [250, 150], [1.5, 0.8], [320, 250],
  3294. ... [1, 0.8], [0.3,0.2]])
  3295. >>> df
  3296. big small
  3297. lama speed 45.0 30.0
  3298. weight 200.0 100.0
  3299. length 1.5 1.0
  3300. cow speed 30.0 20.0
  3301. weight 250.0 150.0
  3302. length 1.5 0.8
  3303. falcon speed 320.0 250.0
  3304. weight 1.0 0.8
  3305. length 0.3 0.2
  3306. >>> df.drop(index='cow', columns='small')
  3307. big
  3308. lama speed 45.0
  3309. weight 200.0
  3310. length 1.5
  3311. falcon speed 320.0
  3312. weight 1.0
  3313. length 0.3
  3314. >>> df.drop(index='length', level=1)
  3315. big small
  3316. lama speed 45.0 30.0
  3317. weight 200.0 100.0
  3318. cow speed 30.0 20.0
  3319. weight 250.0 150.0
  3320. falcon speed 320.0 250.0
  3321. weight 1.0 0.8
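With ``errors='ignore'``, labels that are not found are skipped rather
than raising (a sketch continuing the frame above):

>>> df.drop(index='dog', errors='ignore').shape
(9, 2)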
  3322. """
  3323. return super(DataFrame, self).drop(labels=labels, axis=axis,
  3324. index=index, columns=columns,
  3325. level=level, inplace=inplace,
  3326. errors=errors)
  3327. @rewrite_axis_style_signature('mapper', [('copy', True),
  3328. ('inplace', False),
  3329. ('level', None)])
  3330. def rename(self, *args, **kwargs):
  3331. """
  3332. Alter axes labels.
  3333. Function / dict values must be unique (1-to-1). Labels not contained in
  3334. a dict / Series will be left as-is. Extra labels listed don't throw an
  3335. error.
  3336. See the :ref:`user guide <basics.rename>` for more.
  3337. Parameters
  3338. ----------
  3339. mapper, index, columns : dict-like or function, optional
  3340. dict-like or functions transformations to apply to
  3341. that axis' values. Use either ``mapper`` and ``axis`` to
  3342. specify the axis to target with ``mapper``, or ``index`` and
  3343. ``columns``.
  3344. axis : int or str, optional
  3345. Axis to target with ``mapper``. Can be either the axis name
  3346. ('index', 'columns') or number (0, 1). The default is 'index'.
  3347. copy : boolean, default True
  3348. Also copy underlying data
  3349. inplace : boolean, default False
  3350. Whether to return a new DataFrame. If True then value of copy is
  3351. ignored.
  3352. level : int or level name, default None
  3353. In case of a MultiIndex, only rename labels in the specified
  3354. level.
  3355. Returns
  3356. -------
  3357. renamed : DataFrame
  3358. See Also
  3359. --------
  3360. pandas.DataFrame.rename_axis
  3361. Examples
  3362. --------
  3363. ``DataFrame.rename`` supports two calling conventions
  3364. * ``(index=index_mapper, columns=columns_mapper, ...)``
  3365. * ``(mapper, axis={'index', 'columns'}, ...)``
  3366. We *highly* recommend using keyword arguments to clarify your
  3367. intent.
  3368. >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  3369. >>> df.rename(index=str, columns={"A": "a", "B": "c"})
  3370. a c
  3371. 0 1 4
  3372. 1 2 5
  3373. 2 3 6
  3374. >>> df.rename(index=str, columns={"A": "a", "C": "c"})
  3375. a B
  3376. 0 1 4
  3377. 1 2 5
  3378. 2 3 6
  3379. Using axis-style parameters
  3380. >>> df.rename(str.lower, axis='columns')
  3381. a b
  3382. 0 1 4
  3383. 1 2 5
  3384. 2 3 6
  3385. >>> df.rename({1: 2, 2: 4}, axis='index')
  3386. A B
  3387. 0 1 4
  3388. 2 2 5
  3389. 4 3 6
  3390. """
  3391. axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')
  3392. kwargs.update(axes)
  3393. # Pop these, since the values are in `kwargs` under different names
  3394. kwargs.pop('axis', None)
  3395. kwargs.pop('mapper', None)
  3396. return super(DataFrame, self).rename(**kwargs)
  3397. @Substitution(**_shared_doc_kwargs)
  3398. @Appender(NDFrame.fillna.__doc__)
  3399. def fillna(self, value=None, method=None, axis=None, inplace=False,
  3400. limit=None, downcast=None, **kwargs):
  3401. return super(DataFrame,
  3402. self).fillna(value=value, method=method, axis=axis,
  3403. inplace=inplace, limit=limit,
  3404. downcast=downcast, **kwargs)
  3405. @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
  3406. def replace(self, to_replace=None, value=None, inplace=False, limit=None,
  3407. regex=False, method='pad'):
  3408. return super(DataFrame, self).replace(to_replace=to_replace,
  3409. value=value, inplace=inplace,
  3410. limit=limit, regex=regex,
  3411. method=method)
  3412. @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
  3413. def shift(self, periods=1, freq=None, axis=0, fill_value=None):
  3414. return super(DataFrame, self).shift(periods=periods, freq=freq,
  3415. axis=axis, fill_value=fill_value)
    def set_index(self, keys, drop=True, append=False, inplace=False,
                  verify_integrity=False):
        """
        Set the DataFrame index using existing columns.

        Set the DataFrame index (row labels) using one or more existing
        columns or arrays (of the correct length). The index can replace the
        existing index or expand on it.

        Parameters
        ----------
        keys : label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single array of
            the same length as the calling DataFrame, or a list containing an
            arbitrary combination of column keys and arrays. Here, "array"
            encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
        drop : bool, default True
            Delete columns to be used as the new index.
        append : bool, default False
            Whether to append columns to existing index.
        inplace : bool, default False
            Modify the DataFrame in place (do not create a new object).
        verify_integrity : bool, default False
            Check the new index for duplicates. Otherwise defer the check until
            necessary. Setting to False will improve the performance of this
            method.

        Returns
        -------
        DataFrame
            Changed row labels.

        See Also
        --------
        DataFrame.reset_index : Opposite of set_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
        ...                    'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  year  sale
        0      1  2012    55
        1      4  2014    40
        2      7  2013    84
        3     10  2014    31

        Set the index to become the 'month' column:

        >>> df.set_index('month')
               year  sale
        month
        1      2012    55
        4      2014    40
        7      2013    84
        10     2014    31

        Create a MultiIndex using columns 'year' and 'month':

        >>> df.set_index(['year', 'month'])
                    sale
        year month
        2012 1        55
        2014 4        40
        2013 7        84
        2014 10       31

        Create a MultiIndex using an Index and a column:

        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                 month  sale
           year
        1  2012  1      55
        2  2014  4      40
        3  2013  7      84
        4  2014  10     31

        Create a MultiIndex using two Series:

        >>> s = pd.Series([1, 2, 3, 4])
        >>> df.set_index([s, s**2])
              month  year  sale
        1 1       1  2012    55
        2 4       4  2014    40
        3 9       7  2013    84
        4 16     10  2014    31
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')
        if not isinstance(keys, list):
            keys = [keys]

        if inplace:
            frame = self
        else:
            frame = self.copy()

        arrays = []
        names = []
        if append:
            names = [x for x in self.index.names]
            if isinstance(self.index, MultiIndex):
                for i in range(self.index.nlevels):
                    arrays.append(self.index._get_level_values(i))
            else:
                arrays.append(self.index)

        to_remove = []
        for col in keys:
            if isinstance(col, MultiIndex):
                # append all but the last column so we don't have to modify
                # the end of this loop
                for n in range(col.nlevels - 1):
                    arrays.append(col._get_level_values(n))

                level = col._get_level_values(col.nlevels - 1)
                names.extend(col.names)
            elif isinstance(col, Series):
                level = col._values
                names.append(col.name)
            elif isinstance(col, Index):
                level = col
                names.append(col.name)
            elif isinstance(col, (list, np.ndarray, Index)):
                level = col
                names.append(None)
            else:
                level = frame[col]._values
                names.append(col)
                if drop:
                    to_remove.append(col)
            arrays.append(level)

        index = ensure_index_from_sequences(arrays, names)

        if verify_integrity and not index.is_unique:
            duplicates = index[index.duplicated()].unique()
            raise ValueError('Index has duplicate keys: {dup}'.format(
                dup=duplicates))

        # use set to handle duplicate column names gracefully in case of drop
        for c in set(to_remove):
            del frame[c]

        # clear up memory usage
        index._cleanup()

        frame.index = index

        if not inplace:
            return frame
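
    # A hedged sketch (not in the original source): ``verify_integrity=True``
    # makes the duplicate check above eager, so a non-unique key fails fast
    # instead of surfacing later. The frame below is hypothetical.
    #
    # >>> df = pd.DataFrame({'k': [1, 1], 'v': [10, 20]})
    # >>> df.set_index('k', verify_integrity=True)
    # Traceback (most recent call last):
    # ...
    # ValueError: Index has duplicate keys: Int64Index([1], dtype='int64', name='k')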
    def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
                    col_fill=''):
        """
        Reset the index, or a level of it.

        Reset the index of the DataFrame, and use the default one instead.
        If the DataFrame has a MultiIndex, this method can remove one or more
        levels.

        Parameters
        ----------
        level : int, str, tuple, or list, default None
            Only remove the given levels from the index. Removes all levels by
            default.
        drop : bool, default False
            Do not try to insert index into dataframe columns. This resets
            the index to the default integer index.
        inplace : bool, default False
            Modify the DataFrame in place (do not create a new object).
        col_level : int or str, default 0
            If the columns have multiple levels, determines which level the
            labels are inserted into. By default it is inserted into the first
            level.
        col_fill : object, default ''
            If the columns have multiple levels, determines how the other
            levels are named. If None then the index name is repeated.

        Returns
        -------
        DataFrame
            DataFrame with the new index.

        See Also
        --------
        DataFrame.set_index : Opposite of reset_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 389.0),
        ...                    ('bird', 24.0),
        ...                    ('mammal', 80.5),
        ...                    ('mammal', np.nan)],
        ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
        ...                   columns=('class', 'max_speed'))
        >>> df
                 class  max_speed
        falcon    bird      389.0
        parrot    bird       24.0
        lion    mammal       80.5
        monkey  mammal        NaN

        When we reset the index, the old index is added as a column, and a
        new sequential index is used:

        >>> df.reset_index()
            index   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        We can use the `drop` parameter to avoid the old index being added as
        a column:

        >>> df.reset_index(drop=True)
            class  max_speed
        0    bird      389.0
        1    bird       24.0
        2  mammal       80.5
        3  mammal        NaN

        You can also use `reset_index` with `MultiIndex`.

        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
        ...                                    ('bird', 'parrot'),
        ...                                    ('mammal', 'lion'),
        ...                                    ('mammal', 'monkey')],
        ...                                   names=['class', 'name'])
        >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
        ...                                      ('species', 'type')])
        >>> df = pd.DataFrame([(389.0, 'fly'),
        ...                    ( 24.0, 'fly'),
        ...                    ( 80.5, 'run'),
        ...                    (np.nan, 'jump')],
        ...                   index=index,
        ...                   columns=columns)
        >>> df
                       speed species
                         max    type
        class  name
        bird   falcon  389.0     fly
               parrot   24.0     fly
        mammal lion     80.5     run
               monkey    NaN    jump

        If the index has multiple levels, we can reset a subset of them:

        >>> df.reset_index(level='class')
                 class  speed species
                          max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        If we are not dropping the index, by default, it is placed in the top
        level. We can place it in another level:

        >>> df.reset_index(level='class', col_level=1)
                         speed species
                 class     max    type
        name
        falcon    bird   389.0     fly
        parrot    bird    24.0     fly
        lion    mammal    80.5     run
        monkey  mammal     NaN    jump

        When the index is inserted under another level, we can specify under
        which one with the parameter `col_fill`:

        >>> df.reset_index(level='class', col_level=1, col_fill='species')
                      species  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump

        If we specify a nonexistent level for `col_fill`, it is created:

        >>> df.reset_index(level='class', col_level=1, col_fill='genus')
                        genus  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')
        if inplace:
            new_obj = self
        else:
            new_obj = self.copy()

        def _maybe_casted_values(index, labels=None):
            values = index._values
            if not isinstance(index, (PeriodIndex, DatetimeIndex)):
                if values.dtype == np.object_:
                    values = lib.maybe_convert_objects(values)

            # if we have the labels, extract the values with a mask
            if labels is not None:
                mask = labels == -1

                # we can have situations where the whole mask is -1,
                # meaning there is nothing found in labels, so make all nan's
                if mask.all():
                    values = np.empty(len(mask))
                    values.fill(np.nan)
                else:
                    values = values.take(labels)

                    # TODO(https://github.com/pandas-dev/pandas/issues/24206)
                    # Push this into maybe_upcast_putmask?
                    # We can't pass EAs there right now. Looks a bit
                    # complicated.
                    # So we unbox the ndarray_values, op, re-box.
                    values_type = type(values)
                    values_dtype = values.dtype

                    if issubclass(values_type, DatetimeLikeArray):
                        values = values._data

                    if mask.any():
                        values, changed = maybe_upcast_putmask(
                            values, mask, np.nan)

                    if issubclass(values_type, DatetimeLikeArray):
                        values = values_type(values, dtype=values_dtype)

            return values

        new_index = ibase.default_index(len(new_obj))
        if level is not None:
            if not isinstance(level, (tuple, list)):
                level = [level]
            level = [self.index._get_level_number(lev) for lev in level]
            if len(level) < self.index.nlevels:
                new_index = self.index.droplevel(level)

        if not drop:
            if isinstance(self.index, MultiIndex):
                names = [n if n is not None else ('level_%d' % i)
                         for (i, n) in enumerate(self.index.names)]
                to_insert = lzip(self.index.levels, self.index.codes)
            else:
                default = 'index' if 'index' not in self else 'level_0'
                names = ([default] if self.index.name is None
                         else [self.index.name])
                to_insert = ((self.index, None),)

            multi_col = isinstance(self.columns, MultiIndex)
            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
                if not (level is None or i in level):
                    continue
                name = names[i]
                if multi_col:
                    col_name = (list(name) if isinstance(name, tuple)
                                else [name])
                    if col_fill is None:
                        if len(col_name) not in (1, self.columns.nlevels):
                            raise ValueError("col_fill=None is incompatible "
                                             "with incomplete column name "
                                             "{}".format(name))
                        col_fill = col_name[0]

                    lev_num = self.columns._get_level_number(col_level)
                    name_lst = [col_fill] * lev_num + col_name
                    missing = self.columns.nlevels - len(name_lst)
                    name_lst += [col_fill] * missing
                    name = tuple(name_lst)
                # to ndarray and maybe infer different dtype
                level_values = _maybe_casted_values(lev, lab)
                new_obj.insert(0, name, level_values)

        new_obj.index = new_index
        if not inplace:
            return new_obj

    # ----------------------------------------------------------------------
    # Reindex-based selection methods

    @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
    def isna(self):
        return super(DataFrame, self).isna()

    @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
    def isnull(self):
        return super(DataFrame, self).isnull()

    @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
    def notna(self):
        return super(DataFrame, self).notna()

    @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
    def notnull(self):
        return super(DataFrame, self).notnull()
    def dropna(self, axis=0, how='any', thresh=None, subset=None,
               inplace=False):
        """
        Remove missing values.

        See the :ref:`User Guide <missing_data>` for more on which values are
        considered missing, and how to work with missing data.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine if rows or columns which contain missing values are
            removed.

            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.

            .. deprecated:: 0.23.0

               Pass tuple or list to drop on multiple axes.
               Only a single axis is allowed.

        how : {'any', 'all'}, default 'any'
            Determine if row or column is removed from DataFrame, when we have
            at least one NA or all NA.

            * 'any' : If any NA values are present, drop that row or column.
            * 'all' : If all values are NA, drop that row or column.

        thresh : int, optional
            Require that many non-NA values.
        subset : array-like, optional
            Labels along other axis to consider, e.g. if you are dropping rows
            these would be a list of columns to include.
        inplace : bool, default False
            If True, do operation inplace and return None.

        Returns
        -------
        DataFrame
            DataFrame with NA entries dropped from it.

        See Also
        --------
        DataFrame.isna : Indicate missing values.
        DataFrame.notna : Indicate existing (non-missing) values.
        DataFrame.fillna : Replace missing values.
        Series.dropna : Drop missing values.
        Index.dropna : Drop missing indices.

        Examples
        --------
        >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
        ...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
        ...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
        ...                             pd.NaT]})
        >>> df
               name        toy       born
        0    Alfred        NaN        NaT
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Drop the rows where at least one element is missing.

        >>> df.dropna()
             name        toy       born
        1  Batman  Batmobile 1940-04-25

        Drop the columns where at least one element is missing.

        >>> df.dropna(axis='columns')
               name
        0    Alfred
        1    Batman
        2  Catwoman

        Drop the rows where all elements are missing.

        >>> df.dropna(how='all')
               name        toy       born
        0    Alfred        NaN        NaT
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Keep only the rows with at least 2 non-NA values.

        >>> df.dropna(thresh=2)
               name        toy       born
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Define in which columns to look for missing values.

        >>> df.dropna(subset=['name', 'born'])
             name        toy       born
        1  Batman  Batmobile 1940-04-25

        Keep the DataFrame with valid entries in the same variable.

        >>> df.dropna(inplace=True)
        >>> df
             name        toy       born
        1  Batman  Batmobile 1940-04-25
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')
        if isinstance(axis, (tuple, list)):
            # GH20987
            msg = ("supplying multiple axes to axis is deprecated and "
                   "will be removed in a future version.")
            warnings.warn(msg, FutureWarning, stacklevel=2)

            result = self
            for ax in axis:
                result = result.dropna(how=how, thresh=thresh, subset=subset,
                                       axis=ax)
        else:
            axis = self._get_axis_number(axis)
            agg_axis = 1 - axis

            agg_obj = self
            if subset is not None:
                ax = self._get_axis(agg_axis)
                indices = ax.get_indexer_for(subset)
                check = indices == -1
                if check.any():
                    raise KeyError(list(np.compress(check, subset)))
                agg_obj = self.take(indices, axis=agg_axis)

            count = agg_obj.count(axis=agg_axis)

            if thresh is not None:
                mask = count >= thresh
            elif how == 'any':
                mask = count == len(agg_obj._get_axis(agg_axis))
            elif how == 'all':
                mask = count > 0
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            result = self.loc(axis=axis)[mask]

        if inplace:
            self._update_inplace(result)
        else:
            return result
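
    # A hedged sketch (not part of the source): the mask logic above reduces
    # every ``how``/``thresh`` choice to a comparison against per-row (or
    # per-column) non-NA counts. Illustrated on a hypothetical 3-column frame:
    #
    # >>> df = pd.DataFrame({'a': [1, np.nan], 'b': [1, np.nan], 'c': [1, 2]})
    # >>> df.count(axis=1)          # non-NA count per row
    # 0    3
    # 1    1
    # dtype: int64
    # >>> df.dropna(thresh=2)       # keep rows with count >= 2
    #      a    b  c
    # 0  1.0  1.0  1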
    def drop_duplicates(self, subset=None, keep='first', inplace=False):
        """
        Return DataFrame with duplicate rows removed, optionally only
        considering certain columns.

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns
        keep : {'first', 'last', False}, default 'first'
            - ``first`` : Drop duplicates except for the first occurrence.
            - ``last`` : Drop duplicates except for the last occurrence.
            - False : Drop all duplicates.
        inplace : boolean, default False
            Whether to drop duplicates in place or to return a copy

        Returns
        -------
        deduplicated : DataFrame
        """
        if self.empty:
            return self.copy()

        inplace = validate_bool_kwarg(inplace, 'inplace')
        duplicated = self.duplicated(subset, keep=keep)

        if inplace:
            inds, = (-duplicated)._ndarray_values.nonzero()
            new_data = self._data.take(inds)
            self._update_inplace(new_data)
        else:
            return self[-duplicated]
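
    # A hedged usage sketch (the docstring above has no Examples section; the
    # frame below is hypothetical): ``subset`` restricts which columns are
    # compared, and ``keep='last'`` retains the final occurrence instead.
    #
    # >>> df = pd.DataFrame({'k': ['a', 'a', 'b'], 'v': [1, 2, 2]})
    # >>> df.drop_duplicates(subset=['k'])
    #    k  v
    # 0  a  1
    # 2  b  2
    # >>> df.drop_duplicates(subset=['k'], keep='last')
    #    k  v
    # 1  a  2
    # 2  b  2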
    def duplicated(self, subset=None, keep='first'):
        """
        Return boolean Series denoting duplicate rows, optionally only
        considering certain columns.

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns
        keep : {'first', 'last', False}, default 'first'
            - ``first`` : Mark duplicates as ``True`` except for the
              first occurrence.
            - ``last`` : Mark duplicates as ``True`` except for the
              last occurrence.
            - False : Mark all duplicates as ``True``.

        Returns
        -------
        duplicated : Series
        """
        from pandas.core.sorting import get_group_index
        from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT

        if self.empty:
            return Series(dtype=bool)

        def f(vals):
            labels, shape = algorithms.factorize(
                vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
            return labels.astype('i8', copy=False), len(shape)

        if subset is None:
            subset = self.columns
        elif (not np.iterable(subset) or
              isinstance(subset, compat.string_types) or
              isinstance(subset, tuple) and subset in self.columns):
            subset = subset,

        # Verify all columns in subset exist in the queried dataframe
        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
        # key that doesn't exist.
        diff = Index(subset).difference(self.columns)
        if not diff.empty:
            raise KeyError(diff)

        vals = (col.values for name, col in self.iteritems()
                if name in subset)
        labels, shape = map(list, zip(*map(f, vals)))

        ids = get_group_index(labels, shape, sort=False, xnull=False)
        return Series(duplicated_int64(ids, keep), index=self.index)
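
    # A hedged sketch of the idea above (standalone analogy, not the
    # library's own code path): each column is factorized to integer codes,
    # and the per-row code tuples are collapsed into a single integer id, so
    # duplicate detection reduces to duplicates on a 1-D integer array.
    #
    # >>> import numpy as np
    # >>> import pandas as pd
    # >>> df = pd.DataFrame({'a': ['x', 'x', 'y'], 'b': [1, 1, 1]})
    # >>> codes = [pd.factorize(df[c])[0] for c in df.columns]
    # >>> row_ids = np.ravel_multi_index(codes, [c.max() + 1 for c in codes])
    # >>> pd.Series(row_ids).duplicated().values
    # array([False,  True, False])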
    # ----------------------------------------------------------------------
    # Sorting

    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.sort_values.__doc__)
    def sort_values(self, by, axis=0, ascending=True, inplace=False,
                    kind='quicksort', na_position='last'):
        inplace = validate_bool_kwarg(inplace, 'inplace')
        axis = self._get_axis_number(axis)

        if not isinstance(by, list):
            by = [by]
        if is_sequence(ascending) and len(by) != len(ascending):
            raise ValueError('Length of ascending (%d) != length of by (%d)' %
                             (len(ascending), len(by)))
        if len(by) > 1:
            from pandas.core.sorting import lexsort_indexer

            keys = [self._get_label_or_level_values(x, axis=axis)
                    for x in by]
            indexer = lexsort_indexer(keys, orders=ascending,
                                      na_position=na_position)
            indexer = ensure_platform_int(indexer)
        else:
            from pandas.core.sorting import nargsort

            by = by[0]
            k = self._get_label_or_level_values(by, axis=axis)

            if isinstance(ascending, (tuple, list)):
                ascending = ascending[0]

            indexer = nargsort(k, kind=kind, ascending=ascending,
                               na_position=na_position)

        new_data = self._data.take(indexer,
                                   axis=self._get_block_manager_axis(axis),
                                   verify=False)

        if inplace:
            return self._update_inplace(new_data)
        else:
            return self._constructor(new_data).__finalize__(self)
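
    # A hedged analogy (not the internal implementation): for multiple sort
    # keys, ``lexsort_indexer`` plays the same role as ``np.lexsort``, which
    # sorts by the *last* key first, so the primary key goes last.
    #
    # >>> import numpy as np
    # >>> primary = np.array([2, 1, 2])
    # >>> secondary = np.array([1, 9, 0])
    # >>> np.lexsort((secondary, primary))  # sort by primary, then secondary
    # array([1, 2, 0])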
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.sort_index.__doc__)
    def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                   kind='quicksort', na_position='last', sort_remaining=True,
                   by=None):

        # TODO: this can be combined with Series.sort_index impl as
        # almost identical

        inplace = validate_bool_kwarg(inplace, 'inplace')
        # 10726
        if by is not None:
            warnings.warn("by argument to sort_index is deprecated, "
                          "please use .sort_values(by=...)",
                          FutureWarning, stacklevel=2)
            if level is not None:
                raise ValueError("unable to simultaneously sort by and level")
            return self.sort_values(by, axis=axis, ascending=ascending,
                                    inplace=inplace)

        axis = self._get_axis_number(axis)
        labels = self._get_axis(axis)

        # make sure that the axis is lexsorted to start
        # if not we need to reconstruct to get the correct indexer
        labels = labels._sort_levels_monotonic()
        if level is not None:
            new_axis, indexer = labels.sortlevel(level, ascending=ascending,
                                                 sort_remaining=sort_remaining)

        elif isinstance(labels, MultiIndex):
            from pandas.core.sorting import lexsort_indexer

            indexer = lexsort_indexer(labels._get_codes_for_sorting(),
                                      orders=ascending,
                                      na_position=na_position)
        else:
            from pandas.core.sorting import nargsort

            # Check monotonicity before sorting an index
            # GH11080
            if ((ascending and labels.is_monotonic_increasing) or
                    (not ascending and labels.is_monotonic_decreasing)):
                if inplace:
                    return
                else:
                    return self.copy()

            indexer = nargsort(labels, kind=kind, ascending=ascending,
                               na_position=na_position)

        baxis = self._get_block_manager_axis(axis)
        new_data = self._data.take(indexer,
                                   axis=baxis,
                                   verify=False)

        # reconstruct axis if needed
        new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()

        if inplace:
            return self._update_inplace(new_data)
        else:
            return self._constructor(new_data).__finalize__(self)

    def nlargest(self, n, columns, keep='first'):
        """
        Return the first `n` rows ordered by `columns` in descending order.

        Return the first `n` rows with the largest values in `columns`, in
        descending order. The columns that are not specified are returned as
        well, but not used for ordering.

        This method is equivalent to
        ``df.sort_values(columns, ascending=False).head(n)``, but more
        performant.

        Parameters
        ----------
        n : int
            Number of rows to return.
        columns : label or list of labels
            Column label(s) to order by.
        keep : {'first', 'last', 'all'}, default 'first'
            Where there are duplicate values:

            - ``first`` : prioritize the first occurrence(s)
            - ``last`` : prioritize the last occurrence(s)
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame
            The first `n` rows ordered by the given columns in descending
            order.

        See Also
        --------
        DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
            ascending order.
        DataFrame.sort_values : Sort DataFrame by the values.
        DataFrame.head : Return the first `n` rows without re-ordering.

        Notes
        -----
        This function cannot be used with all column types. For example, when
        specifying columns with `object` or `category` dtypes, ``TypeError`` is
        raised.

        Examples
        --------
        >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
        ...                                   434000, 434000, 337000, 11300,
        ...                                   11300, 11300],
        ...                    'GDP': [1937894, 2583560, 12011, 4520, 12128,
        ...                            17036, 182, 38, 311],
        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
        ...                                "IS", "NR", "TV", "AI"]},
        ...                   index=["Italy", "France", "Malta",
        ...                          "Maldives", "Brunei", "Iceland",
        ...                          "Nauru", "Tuvalu", "Anguilla"])
        >>> df
                  population      GDP alpha-2
        Italy       59000000  1937894      IT
        France      65000000  2583560      FR
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN
        Iceland       337000    17036      IS
        Nauru          11300      182      NR
        Tuvalu         11300       38      TV
        Anguilla       11300      311      AI

        In the following example, we will use ``nlargest`` to select the three
        rows having the largest values in column "population".

        >>> df.nlargest(3, 'population')
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Malta       434000    12011      MT

        When using ``keep='last'``, ties are resolved in reverse order:

        >>> df.nlargest(3, 'population', keep='last')
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Brunei      434000    12128      BN

        When using ``keep='all'``, all duplicate items are maintained:

        >>> df.nlargest(3, 'population', keep='all')
                  population      GDP alpha-2
        France      65000000  2583560      FR
        Italy       59000000  1937894      IT
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN

        To order by the largest values in column "population" and then "GDP",
        we can specify multiple columns like in the next example.

        >>> df.nlargest(3, ['population', 'GDP'])
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Brunei      434000    12128      BN
        """
        return algorithms.SelectNFrame(self,
                                       n=n,
                                       keep=keep,
                                       columns=columns).nlargest()

    def nsmallest(self, n, columns, keep='first'):
        """
        Return the first `n` rows ordered by `columns` in ascending order.

        Return the first `n` rows with the smallest values in `columns`, in
        ascending order. The columns that are not specified are returned as
        well, but not used for ordering.

        This method is equivalent to
        ``df.sort_values(columns, ascending=True).head(n)``, but more
        performant.

        Parameters
        ----------
        n : int
            Number of items to retrieve.
        columns : list or str
            Column name or names to order by.
        keep : {'first', 'last', 'all'}, default 'first'
            Where there are duplicate values:

            - ``first`` : take the first occurrence.
            - ``last`` : take the last occurrence.
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
            descending order.
        DataFrame.sort_values : Sort DataFrame by the values.
        DataFrame.head : Return the first `n` rows without re-ordering.

        Examples
        --------
        >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
        ...                                   434000, 434000, 337000, 11300,
        ...                                   11300, 11300],
        ...                    'GDP': [1937894, 2583560, 12011, 4520, 12128,
        ...                            17036, 182, 38, 311],
        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
        ...                                "IS", "NR", "TV", "AI"]},
        ...                   index=["Italy", "France", "Malta",
        ...                          "Maldives", "Brunei", "Iceland",
        ...                          "Nauru", "Tuvalu", "Anguilla"])
        >>> df
                  population      GDP alpha-2
        Italy       59000000  1937894      IT
        France      65000000  2583560      FR
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN
        Iceland       337000    17036      IS
        Nauru          11300      182      NR
        Tuvalu         11300       38      TV
        Anguilla       11300      311      AI

        In the following example, we will use ``nsmallest`` to select the
        three rows having the smallest values in column "population".

        >>> df.nsmallest(3, 'population')
                  population  GDP alpha-2
        Nauru          11300  182      NR
        Tuvalu         11300   38      TV
        Anguilla       11300  311      AI

        When using ``keep='last'``, ties are resolved in reverse order:

        >>> df.nsmallest(3, 'population', keep='last')
                  population  GDP alpha-2
        Anguilla       11300  311      AI
        Tuvalu         11300   38      TV
        Nauru          11300  182      NR

        When using ``keep='all'``, all duplicate items are maintained:

        >>> df.nsmallest(3, 'population', keep='all')
                  population  GDP alpha-2
        Nauru          11300  182      NR
        Tuvalu         11300   38      TV
        Anguilla       11300  311      AI

        To order by the smallest values in column "population" and then "GDP",
        we can specify multiple columns like in the next example.

        >>> df.nsmallest(3, ['population', 'GDP'])
                  population  GDP alpha-2
        Tuvalu         11300   38      TV
        Nauru          11300  182      NR
        Anguilla       11300  311      AI
        """
        return algorithms.SelectNFrame(self,
                                       n=n,
                                       keep=keep,
                                       columns=columns).nsmallest()

    def swaplevel(self, i=-2, j=-1, axis=0):
        """
        Swap levels i and j in a MultiIndex on a particular axis.

        Parameters
        ----------
        i, j : int, string (can be mixed)
            Level of index to be swapped. Can pass level name as string.

        Returns
        -------
        swapped : same type as caller (new object)

        .. versionchanged:: 0.18.1

           The indexes ``i`` and ``j`` are now optional, and default to
           the two innermost levels of the index.
        """
        result = self.copy()

        axis = self._get_axis_number(axis)
        if axis == 0:
            result.index = result.index.swaplevel(i, j)
        else:
            result.columns = result.columns.swaplevel(i, j)
        return result
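
    # A hedged usage sketch (the docstring above has no Examples section;
    # the index below is hypothetical): with the default ``i``/``j``, the two
    # innermost levels are exchanged.
    #
    # >>> idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)],
    # ...                                 names=['outer', 'inner'])
    # >>> df = pd.DataFrame({'v': [10, 20]}, index=idx)
    # >>> df.swaplevel().index.names
    # FrozenList(['inner', 'outer'])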
    def reorder_levels(self, order, axis=0):
        """
        Rearrange index levels using input order. May not drop or
        duplicate levels.

        Parameters
        ----------
        order : list of int or list of str
            List representing new level order. Reference level by number
            (position) or by key (label).
        axis : int
            Where to reorder levels.

        Returns
        -------
        type of caller (new object)
        """
        axis = self._get_axis_number(axis)
        if not isinstance(self._get_axis(axis),
                          MultiIndex):  # pragma: no cover
            raise TypeError('Can only reorder levels on a hierarchical axis.')

        result = self.copy()

        if axis == 0:
            result.index = result.index.reorder_levels(order)
        else:
            result.columns = result.columns.reorder_levels(order)
        return result
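
    # A hedged usage sketch (no Examples section above; the names are
    # hypothetical): unlike ``swaplevel``, ``reorder_levels`` takes the full
    # target order, which also works for three or more levels.
    #
    # >>> idx = pd.MultiIndex.from_tuples([('a', 1, 'x')],
    # ...                                 names=['l0', 'l1', 'l2'])
    # >>> df = pd.DataFrame({'v': [10]}, index=idx)
    # >>> df.reorder_levels(['l2', 'l0', 'l1']).index.names
    # FrozenList(['l2', 'l0', 'l1'])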
  4265. # ----------------------------------------------------------------------
  4266. # Arithmetic / combination related
  4267. def _combine_frame(self, other, func, fill_value=None, level=None):
  4268. this, other = self.align(other, join='outer', level=level, copy=False)
  4269. new_index, new_columns = this.index, this.columns
  4270. def _arith_op(left, right):
  4271. # for the mixed_type case where we iterate over columns,
  4272. # _arith_op(left, right) is equivalent to
  4273. # left._binop(right, func, fill_value=fill_value)
  4274. left, right = ops.fill_binop(left, right, fill_value)
  4275. return func(left, right)
  4276. if ops.should_series_dispatch(this, other, func):
  4277. # iterate over columns
  4278. return ops.dispatch_to_series(this, other, _arith_op)
  4279. else:
  4280. result = _arith_op(this.values, other.values)
  4281. return self._constructor(result,
  4282. index=new_index, columns=new_columns,
  4283. copy=False)
  4284. def _combine_match_index(self, other, func, level=None):
  4285. left, right = self.align(other, join='outer', axis=0, level=level,
  4286. copy=False)
  4287. assert left.index.equals(right.index)
  4288. if left._is_mixed_type or right._is_mixed_type:
  4289. # operate column-wise; avoid costly object-casting in `.values`
  4290. return ops.dispatch_to_series(left, right, func)
  4291. else:
  4292. # fastpath --> operate directly on values
  4293. with np.errstate(all="ignore"):
  4294. new_data = func(left.values.T, right.values).T
  4295. return self._constructor(new_data,
  4296. index=left.index, columns=self.columns,
  4297. copy=False)
  4298. def _combine_match_columns(self, other, func, level=None):
  4299. assert isinstance(other, Series)
  4300. left, right = self.align(other, join='outer', axis=1, level=level,
  4301. copy=False)
  4302. assert left.columns.equals(right.index)
  4303. return ops.dispatch_to_series(left, right, func, axis="columns")
  4304. def _combine_const(self, other, func):
  4305. assert lib.is_scalar(other) or np.ndim(other) == 0
  4306. return ops.dispatch_to_series(self, other, func)
  4307. def combine(self, other, func, fill_value=None, overwrite=True):
  4308. """
  4309. Perform column-wise combine with another DataFrame based on a
  4310. passed function.
  4311. Combines a DataFrame with `other` DataFrame using `func`
  4312. to element-wise combine columns. The row and column indexes of the
  4313. resulting DataFrame will be the union of the two.
  4314. Parameters
  4315. ----------
  4316. other : DataFrame
  4317. The DataFrame to merge column-wise.
  4318. func : function
  4319. Function that takes two series as inputs and return a Series or a
  4320. scalar. Used to merge the two dataframes column by columns.
  4321. fill_value : scalar value, default None
  4322. The value to fill NaNs with prior to passing any column to the
  4323. merge func.
  4324. overwrite : boolean, default True
  4325. If True, columns in `self` that do not exist in `other` will be
  4326. overwritten with NaNs.
  4327. Returns
  4328. -------
  4329. result : DataFrame
  4330. See Also
  4331. --------
  4332. DataFrame.combine_first : Combine two DataFrame objects and default to
  4333. non-null values in frame calling the method.
  4334. Examples
  4335. --------
  4336. Combine using a simple function that chooses the smaller column.
  4337. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  4338. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4339. >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
  4340. >>> df1.combine(df2, take_smaller)
  4341. A B
  4342. 0 0 3
  4343. 1 0 3
  4344. Example using a true element-wise combine function.
  4345. >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
  4346. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4347. >>> df1.combine(df2, np.minimum)
  4348. A B
  4349. 0 1 2
  4350. 1 0 3
  4351. Using `fill_value` fills Nones prior to passing the column to the
  4352. merge function.
  4353. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  4354. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4355. >>> df1.combine(df2, take_smaller, fill_value=-5)
  4356. A B
  4357. 0 0 -5.0
  4358. 1 0 4.0
  4359. However, if the same element in both dataframes is None, that None
  4360. is preserved
  4361. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  4362. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
  4363. >>> df1.combine(df2, take_smaller, fill_value=-5)
  4364. A B
  4365. 0 0 NaN
  4366. 1 0 3.0
  4367. Example that demonstrates the use of `overwrite` and behavior when
  4368. the axis differ between the dataframes.
  4369. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  4370. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1],}, index=[1, 2])
  4371. >>> df1.combine(df2, take_smaller)
  4372. A B C
  4373. 0 NaN NaN NaN
  4374. 1 NaN 3.0 -10.0
  4375. 2 NaN 3.0 1.0
  4376. >>> df1.combine(df2, take_smaller, overwrite=False)
  4377. A B C
  4378. 0 0.0 NaN NaN
  4379. 1 0.0 3.0 -10.0
  4380. 2 NaN 3.0 1.0
  4381. Demonstrating the preference of the passed in dataframe.
  4382. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1],}, index=[1, 2])
  4383. >>> df2.combine(df1, take_smaller)
  4384. A B C
  4385. 0 0.0 NaN NaN
  4386. 1 0.0 3.0 NaN
  4387. 2 NaN 3.0 NaN
  4388. >>> df2.combine(df1, take_smaller, overwrite=False)
  4389. A B C
  4390. 0 0.0 NaN NaN
  4391. 1 0.0 3.0 1.0
  4392. 2 NaN 3.0 1.0
  4393. """
  4394. other_idxlen = len(other.index) # save for compare
  4395. this, other = self.align(other, copy=False)
  4396. new_index = this.index
  4397. if other.empty and len(new_index) == len(self.index):
  4398. return self.copy()
  4399. if self.empty and len(other) == other_idxlen:
  4400. return other.copy()
  4401. # sorts if possible
  4402. new_columns = this.columns.union(other.columns)
  4403. do_fill = fill_value is not None
  4404. result = {}
  4405. for col in new_columns:
  4406. series = this[col]
  4407. otherSeries = other[col]
  4408. this_dtype = series.dtype
  4409. other_dtype = otherSeries.dtype
  4410. this_mask = isna(series)
  4411. other_mask = isna(otherSeries)
  4412. # don't overwrite columns unecessarily
  4413. # DO propagate if this column is not in the intersection
  4414. if not overwrite and other_mask.all():
  4415. result[col] = this[col].copy()
  4416. continue
  4417. if do_fill:
  4418. series = series.copy()
  4419. otherSeries = otherSeries.copy()
  4420. series[this_mask] = fill_value
  4421. otherSeries[other_mask] = fill_value
  4422. if col not in self.columns:
  4423. # If self DataFrame does not have col in other DataFrame,
  4424. # try to promote series, which is all NaN, as other_dtype.
  4425. new_dtype = other_dtype
  4426. try:
  4427. series = series.astype(new_dtype, copy=False)
  4428. except ValueError:
  4429. # e.g. new_dtype is integer types
  4430. pass
  4431. else:
  4432. # if we have different dtypes, possibly promote
  4433. new_dtype = find_common_type([this_dtype, other_dtype])
  4434. if not is_dtype_equal(this_dtype, new_dtype):
  4435. series = series.astype(new_dtype)
  4436. if not is_dtype_equal(other_dtype, new_dtype):
  4437. otherSeries = otherSeries.astype(new_dtype)
  4438. arr = func(series, otherSeries)
  4439. arr = maybe_downcast_to_dtype(arr, this_dtype)
  4440. result[col] = arr
  4441. # convert_objects just in case
  4442. return self._constructor(result, index=new_index,
  4443. columns=new_columns)
  4444. def combine_first(self, other):
  4445. """
  4446. Update null elements with value in the same location in `other`.
  4447. Combine two DataFrame objects by filling null values in one DataFrame
  4448. with non-null values from other DataFrame. The row and column indexes
  4449. of the resulting DataFrame will be the union of the two.
  4450. Parameters
  4451. ----------
  4452. other : DataFrame
  4453. Provided DataFrame to use to fill null values.
  4454. Returns
  4455. -------
  4456. combined : DataFrame
  4457. See Also
  4458. --------
  4459. DataFrame.combine : Perform series-wise operation on two DataFrames
  4460. using a given function.
  4461. Examples
  4462. --------
  4463. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
  4464. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4465. >>> df1.combine_first(df2)
  4466. A B
  4467. 0 1.0 3.0
  4468. 1 0.0 4.0
  4469. Null values still persist if the location of that null value
  4470. does not exist in `other`
  4471. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
  4472. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
  4473. >>> df1.combine_first(df2)
  4474. A B C
  4475. 0 NaN 4.0 NaN
  4476. 1 0.0 3.0 1.0
  4477. 2 NaN 3.0 1.0
  4478. """
  4479. import pandas.core.computation.expressions as expressions
  4480. def extract_values(arr):
  4481. # Does two things:
  4482. # 1. maybe gets the values from the Series / Index
  4483. # 2. convert datelike to i8
  4484. if isinstance(arr, (ABCIndexClass, ABCSeries)):
  4485. arr = arr._values
  4486. if needs_i8_conversion(arr):
  4487. if is_extension_array_dtype(arr.dtype):
  4488. arr = arr.asi8
  4489. else:
  4490. arr = arr.view('i8')
  4491. return arr
  4492. def combiner(x, y):
  4493. mask = isna(x)
  4494. if isinstance(mask, (ABCIndexClass, ABCSeries)):
  4495. mask = mask._values
  4496. x_values = extract_values(x)
  4497. y_values = extract_values(y)
  4498. # If the column y in other DataFrame is not in first DataFrame,
  4499. # just return y_values.
  4500. if y.name not in self.columns:
  4501. return y_values
  4502. return expressions.where(mask, y_values, x_values)
  4503. return self.combine(other, combiner, overwrite=False)
  4504. @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
  4505. mapping={False: 'ignore', True: 'raise'})
  4506. def update(self, other, join='left', overwrite=True, filter_func=None,
  4507. errors='ignore'):
  4508. """
  4509. Modify in place using non-NA values from another DataFrame.
  4510. Aligns on indices. There is no return value.
  4511. Parameters
  4512. ----------
  4513. other : DataFrame, or object coercible into a DataFrame
  4514. Should have at least one matching index/column label
  4515. with the original DataFrame. If a Series is passed,
  4516. its name attribute must be set, and that will be
  4517. used as the column name to align with the original DataFrame.
  4518. join : {'left'}, default 'left'
  4519. Only left join is implemented, keeping the index and columns of the
  4520. original object.
  4521. overwrite : bool, default True
  4522. How to handle non-NA values for overlapping keys:
  4523. * True: overwrite original DataFrame's values
  4524. with values from `other`.
  4525. * False: only update values that are NA in
  4526. the original DataFrame.
  4527. filter_func : callable(1d-array) -> bool 1d-array, optional
  4528. Can choose to replace values other than NA. Return True for values
  4529. that should be updated.
  4530. errors : {'raise', 'ignore'}, default 'ignore'
  4531. If 'raise', will raise a ValueError if the DataFrame and `other`
  4532. both contain non-NA data in the same place.
  4533. .. versionchanged :: 0.24.0
  4534. Changed from `raise_conflict=False|True`
  4535. to `errors='ignore'|'raise'`.
  4536. Returns
  4537. -------
  4538. None : method directly changes calling object
  4539. Raises
  4540. ------
  4541. ValueError
  4542. * When `errors='raise'` and there's overlapping non-NA data.
  4543. * When `errors` is not either `'ignore'` or `'raise'`
  4544. NotImplementedError
  4545. * If `join != 'left'`
  4546. See Also
  4547. --------
  4548. dict.update : Similar method for dictionaries.
  4549. DataFrame.merge : For column(s)-on-columns(s) operations.
  4550. Examples
  4551. --------
  4552. >>> df = pd.DataFrame({'A': [1, 2, 3],
  4553. ... 'B': [400, 500, 600]})
  4554. >>> new_df = pd.DataFrame({'B': [4, 5, 6],
  4555. ... 'C': [7, 8, 9]})
  4556. >>> df.update(new_df)
  4557. >>> df
  4558. A B
  4559. 0 1 4
  4560. 1 2 5
  4561. 2 3 6
  4562. The DataFrame's length does not increase as a result of the update,
  4563. only values at matching index/column labels are updated.
  4564. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4565. ... 'B': ['x', 'y', 'z']})
  4566. >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
  4567. >>> df.update(new_df)
  4568. >>> df
  4569. A B
  4570. 0 a d
  4571. 1 b e
  4572. 2 c f
  4573. For Series, it's name attribute must be set.
  4574. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4575. ... 'B': ['x', 'y', 'z']})
  4576. >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
  4577. >>> df.update(new_column)
  4578. >>> df
  4579. A B
  4580. 0 a d
  4581. 1 b y
  4582. 2 c e
  4583. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4584. ... 'B': ['x', 'y', 'z']})
  4585. >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
  4586. >>> df.update(new_df)
  4587. >>> df
  4588. A B
  4589. 0 a x
  4590. 1 b d
  4591. 2 c e
  4592. If `other` contains NaNs the corresponding values are not updated
  4593. in the original dataframe.
  4594. >>> df = pd.DataFrame({'A': [1, 2, 3],
  4595. ... 'B': [400, 500, 600]})
  4596. >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
  4597. >>> df.update(new_df)
  4598. >>> df
  4599. A B
  4600. 0 1 4.0
  4601. 1 2 500.0
  4602. 2 3 6.0
  4603. """
  4604. import pandas.core.computation.expressions as expressions
  4605. # TODO: Support other joins
  4606. if join != 'left': # pragma: no cover
  4607. raise NotImplementedError("Only left join is supported")
  4608. if errors not in ['ignore', 'raise']:
  4609. raise ValueError("The parameter errors must be either "
  4610. "'ignore' or 'raise'")
  4611. if not isinstance(other, DataFrame):
  4612. other = DataFrame(other)
  4613. other = other.reindex_like(self)
  4614. for col in self.columns:
  4615. this = self[col].values
  4616. that = other[col].values
  4617. if filter_func is not None:
  4618. with np.errstate(all='ignore'):
  4619. mask = ~filter_func(this) | isna(that)
  4620. else:
  4621. if errors == 'raise':
  4622. mask_this = notna(that)
  4623. mask_that = notna(this)
  4624. if any(mask_this & mask_that):
  4625. raise ValueError("Data overlaps.")
  4626. if overwrite:
  4627. mask = isna(that)
  4628. else:
  4629. mask = notna(this)
  4630. # don't overwrite columns unecessarily
  4631. if mask.all():
  4632. continue
  4633. self[col] = expressions.where(mask, this, that)
  4634. # ----------------------------------------------------------------------
  4635. # Data reshaping
  4636. _shared_docs['pivot'] = """
  4637. Return reshaped DataFrame organized by given index / column values.
  4638. Reshape data (produce a "pivot" table) based on column values. Uses
  4639. unique values from specified `index` / `columns` to form axes of the
  4640. resulting DataFrame. This function does not support data
  4641. aggregation, multiple values will result in a MultiIndex in the
  4642. columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
  4643. Parameters
  4644. ----------%s
  4645. index : string or object, optional
  4646. Column to use to make new frame's index. If None, uses
  4647. existing index.
  4648. columns : string or object
  4649. Column to use to make new frame's columns.
  4650. values : string, object or a list of the previous, optional
  4651. Column(s) to use for populating new frame's values. If not
  4652. specified, all remaining columns will be used and the result will
  4653. have hierarchically indexed columns.
  4654. .. versionchanged :: 0.23.0
  4655. Also accept list of column names.
  4656. Returns
  4657. -------
  4658. DataFrame
  4659. Returns reshaped DataFrame.
  4660. Raises
  4661. ------
  4662. ValueError:
  4663. When there are any `index`, `columns` combinations with multiple
  4664. values. `DataFrame.pivot_table` when you need to aggregate.
  4665. See Also
  4666. --------
  4667. DataFrame.pivot_table : Generalization of pivot that can handle
  4668. duplicate values for one index/column pair.
  4669. DataFrame.unstack : Pivot based on the index values instead of a
  4670. column.
  4671. Notes
  4672. -----
  4673. For finer-tuned control, see hierarchical indexing documentation along
  4674. with the related stack/unstack methods.
  4675. Examples
  4676. --------
  4677. >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
  4678. ... 'two'],
  4679. ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  4680. ... 'baz': [1, 2, 3, 4, 5, 6],
  4681. ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
  4682. >>> df
  4683. foo bar baz zoo
  4684. 0 one A 1 x
  4685. 1 one B 2 y
  4686. 2 one C 3 z
  4687. 3 two A 4 q
  4688. 4 two B 5 w
  4689. 5 two C 6 t
  4690. >>> df.pivot(index='foo', columns='bar', values='baz')
  4691. bar A B C
  4692. foo
  4693. one 1 2 3
  4694. two 4 5 6
  4695. >>> df.pivot(index='foo', columns='bar')['baz']
  4696. bar A B C
  4697. foo
  4698. one 1 2 3
  4699. two 4 5 6
  4700. >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
  4701. baz zoo
  4702. bar A B C A B C
  4703. foo
  4704. one 1 2 3 x y z
  4705. two 4 5 6 q w t
  4706. A ValueError is raised if there are any duplicates.
  4707. >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
  4708. ... "bar": ['A', 'A', 'B', 'C'],
  4709. ... "baz": [1, 2, 3, 4]})
  4710. >>> df
  4711. foo bar baz
  4712. 0 one A 1
  4713. 1 one A 2
  4714. 2 two B 3
  4715. 3 two C 4
  4716. Notice that the first two rows are the same for our `index`
  4717. and `columns` arguments.
  4718. >>> df.pivot(index='foo', columns='bar', values='baz')
  4719. Traceback (most recent call last):
  4720. ...
  4721. ValueError: Index contains duplicate entries, cannot reshape
  4722. """
  4723. @Substitution('')
  4724. @Appender(_shared_docs['pivot'])
  4725. def pivot(self, index=None, columns=None, values=None):
  4726. from pandas.core.reshape.pivot import pivot
  4727. return pivot(self, index=index, columns=columns, values=values)
  4728. _shared_docs['pivot_table'] = """
  4729. Create a spreadsheet-style pivot table as a DataFrame. The levels in
  4730. the pivot table will be stored in MultiIndex objects (hierarchical
  4731. indexes) on the index and columns of the result DataFrame.
  4732. Parameters
  4733. ----------%s
  4734. values : column to aggregate, optional
  4735. index : column, Grouper, array, or list of the previous
  4736. If an array is passed, it must be the same length as the data. The
  4737. list can contain any of the other types (except list).
  4738. Keys to group by on the pivot table index. If an array is passed,
  4739. it is being used as the same manner as column values.
  4740. columns : column, Grouper, array, or list of the previous
  4741. If an array is passed, it must be the same length as the data. The
  4742. list can contain any of the other types (except list).
  4743. Keys to group by on the pivot table column. If an array is passed,
  4744. it is being used as the same manner as column values.
  4745. aggfunc : function, list of functions, dict, default numpy.mean
  4746. If list of functions passed, the resulting pivot table will have
  4747. hierarchical columns whose top level are the function names
  4748. (inferred from the function objects themselves)
  4749. If dict is passed, the key is column to aggregate and value
  4750. is function or list of functions
  4751. fill_value : scalar, default None
  4752. Value to replace missing values with
  4753. margins : boolean, default False
  4754. Add all row / columns (e.g. for subtotal / grand totals)
  4755. dropna : boolean, default True
  4756. Do not include columns whose entries are all NaN
  4757. margins_name : string, default 'All'
  4758. Name of the row / column that will contain the totals
  4759. when margins is True.
  4760. Returns
  4761. -------
  4762. table : DataFrame
  4763. See Also
  4764. --------
  4765. DataFrame.pivot : Pivot without aggregation that can handle
  4766. non-numeric data.
  4767. Examples
  4768. --------
  4769. >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
  4770. ... "bar", "bar", "bar", "bar"],
  4771. ... "B": ["one", "one", "one", "two", "two",
  4772. ... "one", "one", "two", "two"],
  4773. ... "C": ["small", "large", "large", "small",
  4774. ... "small", "large", "small", "small",
  4775. ... "large"],
  4776. ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  4777. ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
  4778. >>> df
  4779. A B C D E
  4780. 0 foo one small 1 2
  4781. 1 foo one large 2 4
  4782. 2 foo one large 2 5
  4783. 3 foo two small 3 5
  4784. 4 foo two small 3 6
  4785. 5 bar one large 4 6
  4786. 6 bar one small 5 8
  4787. 7 bar two small 6 9
  4788. 8 bar two large 7 9
  4789. This first example aggregates values by taking the sum.
  4790. >>> table = pivot_table(df, values='D', index=['A', 'B'],
  4791. ... columns=['C'], aggfunc=np.sum)
  4792. >>> table
  4793. C large small
  4794. A B
  4795. bar one 4 5
  4796. two 7 6
  4797. foo one 4 1
  4798. two NaN 6
  4799. We can also fill missing values using the `fill_value` parameter.
  4800. >>> table = pivot_table(df, values='D', index=['A', 'B'],
  4801. ... columns=['C'], aggfunc=np.sum, fill_value=0)
  4802. >>> table
  4803. C large small
  4804. A B
  4805. bar one 4 5
  4806. two 7 6
  4807. foo one 4 1
  4808. two 0 6
  4809. The next example aggregates by taking the mean across multiple columns.
  4810. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  4811. ... aggfunc={'D': np.mean,
  4812. ... 'E': np.mean})
  4813. >>> table
  4814. D E
  4815. mean mean
  4816. A C
  4817. bar large 5.500000 7.500000
  4818. small 5.500000 8.500000
  4819. foo large 2.000000 4.500000
  4820. small 2.333333 4.333333
  4821. We can also calculate multiple types of aggregations for any given
  4822. value column.
  4823. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  4824. ... aggfunc={'D': np.mean,
  4825. ... 'E': [min, max, np.mean]})
  4826. >>> table
  4827. D E
  4828. mean max mean min
  4829. A C
  4830. bar large 5.500000 9 7.500000 6
  4831. small 5.500000 9 8.500000 8
  4832. foo large 2.000000 5 4.500000 4
  4833. small 2.333333 6 4.333333 2
  4834. """
  4835. @Substitution('')
  4836. @Appender(_shared_docs['pivot_table'])
  4837. def pivot_table(self, values=None, index=None, columns=None,
  4838. aggfunc='mean', fill_value=None, margins=False,
  4839. dropna=True, margins_name='All'):
  4840. from pandas.core.reshape.pivot import pivot_table
  4841. return pivot_table(self, values=values, index=index, columns=columns,
  4842. aggfunc=aggfunc, fill_value=fill_value,
  4843. margins=margins, dropna=dropna,
  4844. margins_name=margins_name)

    def stack(self, level=-1, dropna=True):
        """
        Stack the prescribed level(s) from columns to index.

        Return a reshaped DataFrame or Series having a multi-level
        index with one or more new inner-most levels compared to the current
        DataFrame. The new inner-most levels are created by pivoting the
        columns of the current dataframe:

          - if the columns have a single level, the output is a Series;
          - if the columns have multiple levels, the new index
            level(s) is (are) taken from the prescribed level(s) and
            the output is a DataFrame.

        The new index levels are sorted.

        Parameters
        ----------
        level : int, str, list, default -1
            Level(s) to stack from the column axis onto the index
            axis, defined as one index or label, or a list of indices
            or labels.
        dropna : bool, default True
            Whether to drop rows in the resulting Frame/Series with
            missing values. Stacking a column level onto the index
            axis can create combinations of index and column values
            that are missing from the original dataframe. See Examples
            section.

        Returns
        -------
        DataFrame or Series
            Stacked dataframe or series.

        See Also
        --------
        DataFrame.unstack : Unstack prescribed level(s) from index axis
            onto column axis.
        DataFrame.pivot : Reshape dataframe from long format to wide
            format.
        DataFrame.pivot_table : Create a spreadsheet-style pivot table
            as a DataFrame.

        Notes
        -----
        The function is named by analogy with a collection of books
        being re-organised from being side by side on a horizontal
        position (the columns of the dataframe) to being stacked
        vertically on top of each other (in the index of the
        dataframe).

        Examples
        --------
        **Single level columns**

        >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=['weight', 'height'])

        Stacking a dataframe with a single level column axis returns a Series:

        >>> df_single_level_cols
             weight height
        cat       0      1
        dog       2      3
        >>> df_single_level_cols.stack()
        cat  weight    0
             height    1
        dog  weight    2
             height    3
        dtype: int64

        **Multi level columns: simple case**

        >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
        ...                                        ('weight', 'pounds')])
        >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol1)

        Stacking a dataframe with a multi-level column axis:

        >>> df_multi_level_cols1
            weight
                kg  pounds
        cat      1       2
        dog      2       4
        >>> df_multi_level_cols1.stack()
                    weight
        cat kg           1
            pounds       2
        dog kg           2
            pounds       4

        **Missing values**

        >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
        ...                                        ('height', 'm')])
        >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol2)

        It is common to have missing values when stacking a dataframe
        with multi-level columns, as the stacked dataframe typically
        has more values than the original dataframe. Missing values
        are filled with NaNs:

        >>> df_multi_level_cols2
            weight height
                kg      m
        cat    1.0    2.0
        dog    3.0    4.0
        >>> df_multi_level_cols2.stack()
                height  weight
        cat kg     NaN     1.0
            m      2.0     NaN
        dog kg     NaN     3.0
            m      4.0     NaN

        **Prescribing the level(s) to be stacked**

        The first parameter controls which level or levels are stacked:

        >>> df_multi_level_cols2.stack(0)
                     kg    m
        cat height  NaN  2.0
            weight  1.0  NaN
        dog height  NaN  4.0
            weight  3.0  NaN
        >>> df_multi_level_cols2.stack([0, 1])
        cat  height  m     2.0
             weight  kg    1.0
        dog  height  m     4.0
             weight  kg    3.0
        dtype: float64

        **Dropping missing values**

        >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol2)

        Note that rows where all values are missing are dropped by
        default but this behaviour can be controlled via the dropna
        keyword parameter:

        >>> df_multi_level_cols3
            weight height
                kg      m
        cat    NaN    1.0
        dog    2.0    3.0
        >>> df_multi_level_cols3.stack(dropna=False)
                height  weight
        cat kg     NaN     NaN
            m      1.0     NaN
        dog kg     NaN     2.0
            m      3.0     NaN
        >>> df_multi_level_cols3.stack(dropna=True)
                height  weight
        cat m      1.0     NaN
        dog kg     NaN     2.0
            m      3.0     NaN
        """
        from pandas.core.reshape.reshape import stack, stack_multiple

        if isinstance(level, (tuple, list)):
            return stack_multiple(self, level, dropna=dropna)
        else:
            return stack(self, level, dropna=dropna)

    def unstack(self, level=-1, fill_value=None):
        """
        Pivot a level of the (necessarily hierarchical) index labels, returning
        a DataFrame having a new level of column labels whose inner-most level
        consists of the pivoted index labels.

        If the index is not a MultiIndex, the output will be a Series
        (the analogue of stack when the columns are not a MultiIndex).

        The level involved will automatically get sorted.

        Parameters
        ----------
        level : int, string, or list of these, default -1 (last level)
            Level(s) of index to unstack, can pass level name
        fill_value : replace NaN with this value if the unstack produces
            missing values

            .. versionadded:: 0.18.0

        Returns
        -------
        unstacked : DataFrame or Series

        See Also
        --------
        DataFrame.pivot : Pivot a table based on column values.
        DataFrame.stack : Pivot a level of the column labels (inverse operation
            from `unstack`).

        Examples
        --------
        >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
        ...                                    ('two', 'a'), ('two', 'b')])
        >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
        >>> s
        one  a   1.0
             b   2.0
        two  a   3.0
             b   4.0
        dtype: float64

        >>> s.unstack(level=-1)
               a    b
        one  1.0  2.0
        two  3.0  4.0

        >>> s.unstack(level=0)
           one  two
        a  1.0  3.0
        b  2.0  4.0

        >>> df = s.unstack(level=0)
        >>> df.unstack()
        one  a  1.0
             b  2.0
        two  a  3.0
             b  4.0
        dtype: float64
        """
        from pandas.core.reshape.reshape import unstack
        return unstack(self, level, fill_value)

    _shared_docs['melt'] = ("""
    Unpivots a DataFrame from wide format to long format, optionally
    leaving identifier variables set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.
    %(versionadded)s
    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or string, optional
        If columns are a MultiIndex then use this level to melt.

    See Also
    --------
    %(other)s
    pivot_table
    DataFrame.pivot

    Examples
    --------
    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> %(caller)sid_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> %(caller)sid_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname')
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]
    >>> df
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5
    """)

    @Appender(_shared_docs['melt'] %
              dict(caller='df.melt(',
                   versionadded='.. versionadded:: 0.20.0\n',
                   other='melt'))
    def melt(self, id_vars=None, value_vars=None, var_name=None,
             value_name='value', col_level=None):
        from pandas.core.reshape.melt import melt
        return melt(self, id_vars=id_vars, value_vars=value_vars,
                    var_name=var_name, value_name=value_name,
                    col_level=col_level)

    # ----------------------------------------------------------------------
    # Time series-related

    def diff(self, periods=1, axis=0):
        """
        First discrete difference of element.

        Calculates the difference of a DataFrame element compared with another
        element in the DataFrame (default is the element in the same column
        of the previous row).

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for calculating difference, accepts negative
            values.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Take difference over rows (0) or columns (1).

            .. versionadded:: 0.16.1

        Returns
        -------
        diffed : DataFrame

        See Also
        --------
        Series.diff: First discrete difference for a Series.
        DataFrame.pct_change: Percent change over given number of periods.
        DataFrame.shift: Shift index by desired number of periods with an
            optional time freq.

        Examples
        --------
        Difference with previous row

        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
        ...                    'b': [1, 1, 2, 3, 5, 8],
        ...                    'c': [1, 4, 9, 16, 25, 36]})
        >>> df
           a  b   c
        0  1  1   1
        1  2  1   4
        2  3  2   9
        3  4  3  16
        4  5  5  25
        5  6  8  36

        >>> df.diff()
             a    b     c
        0  NaN  NaN   NaN
        1  1.0  0.0   3.0
        2  1.0  1.0   5.0
        3  1.0  1.0   7.0
        4  1.0  2.0   9.0
        5  1.0  3.0  11.0

        Difference with previous column

        >>> df.diff(axis=1)
            a    b     c
        0 NaN  0.0   0.0
        1 NaN -1.0   3.0
        2 NaN -1.0   7.0
        3 NaN -1.0  13.0
        4 NaN  0.0  20.0
        5 NaN  2.0  28.0

        Difference with 3rd previous row

        >>> df.diff(periods=3)
             a    b     c
        0  NaN  NaN   NaN
        1  NaN  NaN   NaN
        2  NaN  NaN   NaN
        3  3.0  2.0  15.0
        4  3.0  4.0  21.0
        5  3.0  6.0  27.0

        Difference with following row

        >>> df.diff(periods=-1)
             a    b     c
        0 -1.0  0.0  -3.0
        1 -1.0 -1.0  -5.0
        2 -1.0 -1.0  -7.0
        3 -1.0 -2.0  -9.0
        4 -1.0 -3.0 -11.0
        5  NaN  NaN   NaN
        """
        bm_axis = self._get_block_manager_axis(axis)
        new_data = self._data.diff(n=periods, axis=bm_axis)
        return self._constructor(new_data)

    # ----------------------------------------------------------------------
    # Function application

    def _gotitem(self,
                 key,         # type: Union[str, List[str]]
                 ndim,        # type: int
                 subset=None  # type: Union[Series, DataFrame, None]
                 ):
        # type: (...) -> Union[Series, DataFrame]
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : 1,2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        if subset is None:
            subset = self
        elif subset.ndim == 1:  # is Series
            return subset

        # TODO: _shallow_copy(subset)?
        return subset[key]

    _agg_summary_and_see_also_doc = dedent("""
    The aggregation operations are always performed over an axis, either the
    index (default) or the column axis. This behavior is different from
    `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
    `var`), where the default is to compute the aggregation of the flattened
    array, e.g., ``numpy.mean(arr_2d)`` as opposed to ``numpy.mean(arr_2d,
    axis=0)``.

    `agg` is an alias for `aggregate`. Use the alias.

    See Also
    --------
    DataFrame.apply : Perform any type of operations.
    DataFrame.transform : Perform transformation type operations.
    pandas.core.groupby.GroupBy : Perform operations over groups.
    pandas.core.resample.Resampler : Perform operations over resampled bins.
    pandas.core.window.Rolling : Perform operations over rolling window.
    pandas.core.window.Expanding : Perform operations over expanding window.
    pandas.core.window.EWM : Perform operation over exponential weighted
        window.
    """)

    _agg_examples_doc = dedent("""
    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9],
    ...                    [np.nan, np.nan, np.nan]],
    ...                   columns=['A', 'B', 'C'])

    Aggregate these functions over the rows.

    >>> df.agg(['sum', 'min'])
            A     B     C
    sum  12.0  15.0  18.0
    min   1.0   2.0   3.0

    Different aggregations per column.

    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
            A    B
    max   NaN  8.0
    min   1.0  2.0
    sum  12.0  NaN

    Aggregate over the columns.

    >>> df.agg("mean", axis="columns")
    0    2.0
    1    5.0
    2    8.0
    3    NaN
    dtype: float64
    """)

    @Substitution(see_also=_agg_summary_and_see_also_doc,
                  examples=_agg_examples_doc,
                  versionadded='.. versionadded:: 0.20.0',
                  **_shared_doc_kwargs)
    @Appender(_shared_docs['aggregate'])
    def aggregate(self, func, axis=0, *args, **kwargs):
        axis = self._get_axis_number(axis)

        result = None
        try:
            result, how = self._aggregate(func, axis=axis, *args, **kwargs)
        except TypeError:
            pass
        if result is None:
            return self.apply(func, axis=axis, args=args, **kwargs)
        return result

    def _aggregate(self, arg, axis=0, *args, **kwargs):
        if axis == 1:
            # NDFrame.aggregate returns a tuple, and we need to transpose
            # only result
            result, how = (super(DataFrame, self.T)
                           ._aggregate(arg, *args, **kwargs))
            result = result.T if result is not None else result
            return result, how
        return super(DataFrame, self)._aggregate(arg, *args, **kwargs)

    agg = aggregate

    @Appender(_shared_docs['transform'] % _shared_doc_kwargs)
    def transform(self, func, axis=0, *args, **kwargs):
        axis = self._get_axis_number(axis)
        if axis == 1:
            return super(DataFrame, self.T).transform(func, *args, **kwargs).T
        return super(DataFrame, self).transform(func, *args, **kwargs)
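
    # A hedged equivalence sketch (illustrative comment only, not executed):
    # for axis=1, both ``aggregate`` and ``transform`` above reduce to
    # transposing, applying along axis=0, and transposing back, i.e. roughly
    #
    #   df.agg('sum', axis=1)      ~  df.T.agg('sum')
    #   df.transform(abs, axis=1)  ~  df.T.transform(abs).T
    #
    # (ignoring the (result, how) tuple bookkeeping inside ``_aggregate``).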

    def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
              result_type=None, args=(), **kwds):
        """
        Apply a function along an axis of the DataFrame.

        Objects passed to the function are Series objects whose index is
        either the DataFrame's index (``axis=0``) or the DataFrame's columns
        (``axis=1``). By default (``result_type=None``), the final return type
        is inferred from the return type of the applied function. Otherwise,
        it depends on the `result_type` argument.

        Parameters
        ----------
        func : function
            Function to apply to each column or row.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis along which the function is applied:

            * 0 or 'index': apply function to each column.
            * 1 or 'columns': apply function to each row.
        broadcast : bool, optional
            Only relevant for aggregation functions:

            * ``False`` or ``None`` : returns a Series whose length is the
              length of the index or the number of columns (based on the
              `axis` parameter)
            * ``True`` : results will be broadcast to the original shape
              of the frame, the original index and columns will be retained.

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='broadcast'.

        raw : bool, default False
            * ``False`` : passes each row or column as a Series to the
              function.
            * ``True`` : the passed function will receive ndarray objects
              instead.
              If you are just applying a NumPy reduction function this will
              achieve much better performance.
        reduce : bool or None, default None
            Try to apply reduction procedures. If the DataFrame is empty,
            `apply` will use `reduce` to determine whether the result
            should be a Series or a DataFrame. If ``reduce=None`` (the
            default), `apply`'s return value will be guessed by calling
            `func` on an empty Series
            (note: while guessing, exceptions raised by `func` will be
            ignored).
            If ``reduce=True`` a Series will always be returned, and if
            ``reduce=False`` a DataFrame will always be returned.

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by ``result_type='reduce'``.

        result_type : {'expand', 'reduce', 'broadcast', None}, default None
            These only act when ``axis=1`` (columns):

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : returns a Series if possible rather than expanding
              list-like results. This is the opposite of 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the DataFrame, the original index and columns will be
              retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a Series
            of those. However if the apply function returns a Series these
            are expanded to columns.

            .. versionadded:: 0.23.0

        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        applied : Series or DataFrame

        See Also
        --------
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.

        Notes
        -----
        In the current implementation apply calls `func` twice on the
        first column/row to decide whether it can take a fast or slow
        code path. This can lead to unexpected behavior if `func` has
        side-effects, as they will take effect twice for the first
        column/row.

        Examples
        --------
        >>> df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
        >>> df
           A  B
        0  4  9
        1  4  9
        2  4  9

        Using a numpy universal function (in this case the same as
        ``np.sqrt(df)``):

        >>> df.apply(np.sqrt)
             A    B
        0  2.0  3.0
        1  2.0  3.0
        2  2.0  3.0

        Using a reducing function on either axis

        >>> df.apply(np.sum, axis=0)
        A    12
        B    27
        dtype: int64

        >>> df.apply(np.sum, axis=1)
        0    13
        1    13
        2    13
        dtype: int64

        Returning a list-like will result in a Series

        >>> df.apply(lambda x: [1, 2], axis=1)
        0    [1, 2]
        1    [1, 2]
        2    [1, 2]
        dtype: object

        Passing result_type='expand' will expand list-like results
        to columns of a Dataframe

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
           0  1
        0  1  2
        1  1  2
        2  1  2

        Returning a Series inside the function is similar to passing
        ``result_type='expand'``. The resulting column names
        will be the Series index.

        >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
           foo  bar
        0    1    2
        1    1    2
        2    1    2

        Passing ``result_type='broadcast'`` will ensure the same shape
        result, whether list-like or scalar is returned by the function,
        and broadcast it along the axis. The resulting column names will
        be the originals.

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
           A  B
        0  1  2
        1  1  2
        2  1  2
        """
        from pandas.core.apply import frame_apply
        op = frame_apply(self,
                         func=func,
                         axis=axis,
                         broadcast=broadcast,
                         raw=raw,
                         reduce=reduce,
                         result_type=result_type,
                         args=args,
                         kwds=kwds)
        return op.get_result()

    def applymap(self, func):
        """
        Apply a function to a Dataframe elementwise.

        This method applies a function that accepts and returns a scalar
        to every element of a DataFrame.

        Parameters
        ----------
        func : callable
            Python function, returns a single value from a single value.

        Returns
        -------
        DataFrame
            Transformed DataFrame.

        See Also
        --------
        DataFrame.apply : Apply a function along input axis of DataFrame.

        Notes
        -----
        In the current implementation applymap calls `func` twice on the
        first column/row to decide whether it can take a fast or slow
        code path. This can lead to unexpected behavior if `func` has
        side-effects, as they will take effect twice for the first
        column/row.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
        >>> df
               0      1
        0  1.000  2.120
        1  3.356  4.567

        >>> df.applymap(lambda x: len(str(x)))
           0  1
        0  3  4
        1  5  5

        Note that a vectorized version of `func` often exists, which will
        be much faster. You could square each number elementwise.

        >>> df.applymap(lambda x: x**2)
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489

        But it's better to avoid applymap in that case.

        >>> df ** 2
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489
        """
        # if we have a dtype == 'M8[ns]', provide boxed values
        def infer(x):
            if x.empty:
                return lib.map_infer(x, func)
            return lib.map_infer(x.astype(object).values, func)

        return self.apply(infer)

    # ----------------------------------------------------------------------
    # Merging / joining methods

    def append(self, other, ignore_index=False,
               verify_integrity=False, sort=None):
        """
        Append rows of `other` to the end of caller, returning a new object.

        Columns in `other` that are not in the caller are added as new columns.

        Parameters
        ----------
        other : DataFrame or Series/dict-like object, or list of these
            The data to append.
        ignore_index : boolean, default False
            If True, do not use the index labels.
        verify_integrity : boolean, default False
            If True, raise ValueError on creating index with duplicates.
        sort : boolean, default None
            Sort columns if the columns of `self` and `other` are not aligned.
            The default sorting is deprecated and will change to not-sorting
            in a future version of pandas. Explicitly pass ``sort=True`` to
            silence the warning and sort. Explicitly pass ``sort=False`` to
            silence the warning and not sort.

            .. versionadded:: 0.23.0

        Returns
        -------
        appended : DataFrame

        See Also
        --------
        pandas.concat : General function to concatenate DataFrame, Series
            or Panel objects.

        Notes
        -----
        If a list of dict/series is passed and the keys are all contained in
        the DataFrame's index, the order of the columns in the resulting
        DataFrame will be unchanged.

        Iteratively appending rows to a DataFrame can be more computationally
        intensive than a single concatenate. A better solution is to append
        those rows to a list and then concatenate the list with the original
        DataFrame all at once.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
        >>> df
           A  B
        0  1  2
        1  3  4
        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
        >>> df.append(df2)
           A  B
        0  1  2
        1  3  4
        0  5  6
        1  7  8

        With `ignore_index` set to True:

        >>> df.append(df2, ignore_index=True)
           A  B
        0  1  2
        1  3  4
        2  5  6
        3  7  8

        The following, while not recommended methods for generating DataFrames,
        show two ways to generate a DataFrame from multiple data sources.

        Less efficient:

        >>> df = pd.DataFrame(columns=['A'])
        >>> for i in range(5):
        ...     df = df.append({'A': i}, ignore_index=True)
        >>> df
           A
        0  0
        1  1
        2  2
        3  3
        4  4

        More efficient:

        >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
        ...           ignore_index=True)
           A
        0  0
        1  1
        2  2
        3  3
        4  4
        """
        if isinstance(other, (Series, dict)):
            if isinstance(other, dict):
                other = Series(other)
            if other.name is None and not ignore_index:
                raise TypeError('Can only append a Series if ignore_index=True'
                                ' or if the Series has a name')

            if other.name is None:
                index = None
            else:
                # other must have the same index name as self, otherwise
                # index name will be reset
                index = Index([other.name], name=self.index.name)

            idx_diff = other.index.difference(self.columns)
            try:
                combined_columns = self.columns.append(idx_diff)
            except TypeError:
                combined_columns = self.columns.astype(object).append(idx_diff)
            other = other.reindex(combined_columns, copy=False)
            other = DataFrame(other.values.reshape((1, len(other))),
                              index=index,
                              columns=combined_columns)
            other = other._convert(datetime=True, timedelta=True)
            if not self.columns.equals(combined_columns):
                self = self.reindex(columns=combined_columns)
        elif isinstance(other, list) and not isinstance(other[0], DataFrame):
            other = DataFrame(other)
            if (self.columns.get_indexer(other.columns) >= 0).all():
                other = other.loc[:, self.columns]

        from pandas.core.reshape.concat import concat
        if isinstance(other, (list, tuple)):
            to_concat = [self] + other
        else:
            to_concat = [self, other]
        return concat(to_concat, ignore_index=ignore_index,
                      verify_integrity=verify_integrity,
                      sort=sort)

    def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
             sort=False):
        """
        Join columns of another DataFrame.

        Join columns with `other` DataFrame either on index or on a key
        column. Efficiently join multiple DataFrame objects by index at once by
        passing a list.

        Parameters
        ----------
        other : DataFrame, Series, or list of DataFrame
            Index should be similar to one of the columns in this one. If a
            Series is passed, its name attribute must be set, and that will be
            used as the column name in the resulting joined DataFrame.
        on : str, list of str, or array-like, optional
            Column or index level name(s) in the caller to join on the index
            in `other`, otherwise joins index-on-index. If multiple
            values given, the `other` DataFrame must have a MultiIndex. Can
            pass an array as the join key if it is not already contained in
            the calling DataFrame. Like an Excel VLOOKUP operation.
        how : {'left', 'right', 'outer', 'inner'}, default 'left'
            How to handle the operation of the two objects.

            * left: use calling frame's index (or column if on is specified)
            * right: use `other`'s index.
            * outer: form union of calling frame's index (or column if on is
              specified) with `other`'s index, and sort it
              lexicographically.
            * inner: form intersection of calling frame's index (or column if
              on is specified) with `other`'s index, preserving the order
              of the calling's one.
        lsuffix : str, default ''
            Suffix to use from left frame's overlapping columns.
        rsuffix : str, default ''
            Suffix to use from right frame's overlapping columns.
        sort : bool, default False
            Order result DataFrame lexicographically by the join key. If False,
            the order of the join key depends on the join type (how keyword).

        Returns
        -------
        DataFrame
            A dataframe containing columns from both the caller and `other`.

        See Also
        --------
        DataFrame.merge : For column(s)-on-columns(s) operations.

        Notes
        -----
        Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
        passing a list of `DataFrame` objects.

        Support for specifying index levels as the `on` parameter was added
        in version 0.23.0.

        Examples
        --------
        >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
        ...                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
        >>> df
          key   A
        0  K0  A0
        1  K1  A1
        2  K2  A2
        3  K3  A3
        4  K4  A4
        5  K5  A5

        >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
        ...                       'B': ['B0', 'B1', 'B2']})
        >>> other
          key   B
        0  K0  B0
        1  K1  B1
        2  K2  B2

        Join DataFrames using their indexes.

        >>> df.join(other, lsuffix='_caller', rsuffix='_other')
          key_caller   A key_other    B
        0         K0  A0        K0   B0
        1         K1  A1        K1   B1
        2         K2  A2        K2   B2
        3         K3  A3       NaN  NaN
        4         K4  A4       NaN  NaN
        5         K5  A5       NaN  NaN

        If we want to join using the key columns, we need to set key to be
        the index in both `df` and `other`. The joined DataFrame will have
        key as its index.

        >>> df.set_index('key').join(other.set_index('key'))
              A    B
        key
        K0   A0   B0
        K1   A1   B1
        K2   A2   B2
        K3   A3  NaN
        K4   A4  NaN
        K5   A5  NaN

        Another option to join using the key columns is to use the `on`
        parameter. DataFrame.join always uses `other`'s index but we can use
        any column in `df`. This method preserves the original DataFrame's
        index in the result.

        >>> df.join(other.set_index('key'), on='key')
          key   A    B
        0  K0  A0   B0
        1  K1  A1   B1
        2  K2  A2   B2
        3  K3  A3  NaN
        4  K4  A4  NaN
        5  K5  A5  NaN
        """
        # For SparseDataFrame's benefit
        return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
                                 rsuffix=rsuffix, sort=sort)

    def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
                     sort=False):
        from pandas.core.reshape.merge import merge
        from pandas.core.reshape.concat import concat

        if isinstance(other, Series):
            if other.name is None:
                raise ValueError('Other Series must have a name')
            other = DataFrame({other.name: other})

        if isinstance(other, DataFrame):
            return merge(self, other, left_on=on, how=how,
                         left_index=on is None, right_index=True,
                         suffixes=(lsuffix, rsuffix), sort=sort)
        else:
            if on is not None:
                raise ValueError('Joining multiple DataFrames only supported'
                                 ' for joining on index')

            frames = [self] + list(other)

            can_concat = all(df.index.is_unique for df in frames)

            # join indexes only using concat
            if can_concat:
                if how == 'left':
                    how = 'outer'
                    join_axes = [self.index]
                else:
                    join_axes = None
                return concat(frames, axis=1, join=how, join_axes=join_axes,
                              verify_integrity=True)

            joined = frames[0]
            for frame in frames[1:]:
                joined = merge(joined, frame, how=how, left_index=True,
                               right_index=True)

            return joined
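
    # A hedged usage sketch (comment only): joining a list of frames whose
    # indexes are all unique takes the single-concat path above, e.g.
    #
    #   left = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
    #   right = pd.DataFrame({'b': [3, 4]}, index=['x', 'y'])
    #   left.join([right])   # one concat(axis=1) aligned on left's index
    #
    # while frames with duplicate index labels fall back to the pairwise
    # ``merge`` loop.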

    @Substitution('')
    @Appender(_merge_doc, indents=2)
    def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
              left_index=False, right_index=False, sort=False,
              suffixes=('_x', '_y'), copy=True, indicator=False,
              validate=None):
        from pandas.core.reshape.merge import merge
        return merge(self, right, how=how, on=on, left_on=left_on,
                     right_on=right_on, left_index=left_index,
                     right_index=right_index, sort=sort, suffixes=suffixes,
                     copy=copy, indicator=indicator, validate=validate)

    def round(self, decimals=0, *args, **kwargs):
        """
        Round a DataFrame to a variable number of decimal places.

        Parameters
        ----------
        decimals : int, dict, Series
            Number of decimal places to round each column to. If an int is
            given, round each column to the same number of places.
            Otherwise dict and Series round to variable numbers of places.
            Column names should be in the keys if `decimals` is a
            dict-like, or in the index if `decimals` is a Series. Any
            columns not included in `decimals` will be left as is. Elements
            of `decimals` which are not columns of the input will be
            ignored.

        Returns
        -------
        DataFrame

        See Also
        --------
        numpy.around
        Series.round

        Examples
        --------
        >>> df = pd.DataFrame(np.random.random([3, 3]),
        ...     columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
        >>> df
                       A         B         C
        first   0.028208  0.992815  0.173891
        second  0.038683  0.645646  0.577595
        third   0.877076  0.149370  0.491027
        >>> df.round(2)
                   A     B     C
        first   0.03  0.99  0.17
        second  0.04  0.65  0.58
        third   0.88  0.15  0.49
        >>> df.round({'A': 1, 'C': 2})
                  A         B     C
        first   0.0  0.992815  0.17
        second  0.0  0.645646  0.58
        third   0.9  0.149370  0.49
        >>> decimals = pd.Series([1, 0, 2], index=['A', 'B', 'C'])
        >>> df.round(decimals)
                  A  B     C
        first   0.0  1  0.17
        second  0.0  1  0.58
        third   0.9  0  0.49
        """
        from pandas.core.reshape.concat import concat

        def _dict_round(df, decimals):
            for col, vals in df.iteritems():
                try:
                    yield _series_round(vals, decimals[col])
                except KeyError:
                    yield vals

        def _series_round(s, decimals):
            if is_integer_dtype(s) or is_float_dtype(s):
                return s.round(decimals)
            return s

        nv.validate_round(args, kwargs)

        if isinstance(decimals, (dict, Series)):
            if isinstance(decimals, Series):
                if not decimals.index.is_unique:
                    raise ValueError("Index of decimals must be unique")
            new_cols = [col for col in _dict_round(self, decimals)]
        elif is_integer(decimals):
            # Dispatch to Series.round
            new_cols = [_series_round(v, decimals)
                        for _, v in self.iteritems()]
        else:
            raise TypeError("decimals must be an integer, a dict-like or a "
                            "Series")

        if len(new_cols) > 0:
            return self._constructor(concat(new_cols, axis=1),
                                     index=self.index,
                                     columns=self.columns)
        else:
            return self

    # ----------------------------------------------------------------------
    # Statistical methods, etc.

    def corr(self, method='pearson', min_periods=1):
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'kendall', 'spearman'} or callable
            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float

            .. versionadded:: 0.24.0

        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result. Currently only available for pearson
            and spearman correlation

        Returns
        -------
        y : DataFrame

        See Also
        --------
        DataFrame.corrwith
        Series.corr

        Examples
        --------
        >>> histogram_intersection = lambda a, b: np.minimum(a, b
        ... ).sum().round(decimals=1)
        >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr(method=histogram_intersection)
              dogs  cats
        dogs   1.0   0.3
        cats   0.3   1.0
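
        As a quick sanity check with made-up data (not an official example):
        perfectly linearly related columns correlate at exactly 1.0.

        >>> df = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> df.corr()
             x    y
        x  1.0  1.0
        y  1.0  1.0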
  5884. """
  5885. numeric_df = self._get_numeric_data()
  5886. cols = numeric_df.columns
  5887. idx = cols.copy()
  5888. mat = numeric_df.values
  5889. if method == 'pearson':
  5890. correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
  5891. elif method == 'spearman':
  5892. correl = libalgos.nancorr_spearman(ensure_float64(mat),
  5893. minp=min_periods)
  5894. elif method == 'kendall' or callable(method):
  5895. if min_periods is None:
  5896. min_periods = 1
  5897. mat = ensure_float64(mat).T
  5898. corrf = nanops.get_corr_func(method)
  5899. K = len(cols)
  5900. correl = np.empty((K, K), dtype=float)
  5901. mask = np.isfinite(mat)
  5902. for i, ac in enumerate(mat):
  5903. for j, bc in enumerate(mat):
  5904. if i > j:
  5905. continue
  5906. valid = mask[i] & mask[j]
  5907. if valid.sum() < min_periods:
  5908. c = np.nan
  5909. elif i == j:
  5910. c = 1.
  5911. elif not valid.all():
  5912. c = corrf(ac[valid], bc[valid])
  5913. else:
  5914. c = corrf(ac, bc)
  5915. correl[i, j] = c
  5916. correl[j, i] = c
  5917. else:
  5918. raise ValueError("method must be either 'pearson', "
  5919. "'spearman', or 'kendall', '{method}' "
  5920. "was supplied".format(method=method))
  5921. return self._constructor(correl, index=idx, columns=cols)

    def cov(self, min_periods=None):
        """
        Compute pairwise covariance of columns, excluding NA/null values.

        Compute the pairwise covariance among the series of a DataFrame.
        The returned data frame is the `covariance matrix
        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
        of the DataFrame.

        Both NA and null values are automatically excluded from the
        calculation. (See the note below about bias from missing values.)
        A threshold can be set for the minimum number of
        observations for each value created. Comparisons with observations
        below this threshold will be returned as ``NaN``.

        This method is generally used for the analysis of time series data to
        understand the relationship between different measures
        across time.

        Parameters
        ----------
        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result.

        Returns
        -------
        DataFrame
            The covariance matrix of the series of the DataFrame.

        See Also
        --------
        pandas.Series.cov : Compute covariance with another Series.
        pandas.core.window.EWM.cov: Exponential weighted sample covariance.
        pandas.core.window.Expanding.cov : Expanding sample covariance.
        pandas.core.window.Rolling.cov : Rolling sample covariance.

        Notes
        -----
        Returns the covariance matrix of the DataFrame's time series.
        The covariance is normalized by N-1.

        For DataFrames that have Series that are missing data (assuming that
        data is `missing at random
        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
        the returned covariance matrix will be an unbiased estimate
        of the variance and covariance between the member Series.

        However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be
        positive semi-definite. This could lead to estimated correlations
        having absolute values which are greater than one, and/or a
        non-invertible covariance matrix. See `Estimation of covariance
        matrices <http://en.wikipedia.org/w/index.php?title=Estimation_of_
        covariance_matrices>`__ for more details.

        Examples
        --------
        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.cov()
                  dogs      cats
        dogs  0.666667 -1.000000
        cats -1.000000  1.666667

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(1000, 5),
        ...                   columns=['a', 'b', 'c', 'd', 'e'])
        >>> df.cov()
                  a         b         c         d         e
        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
        e  0.014144  0.009826 -0.000271 -0.013692  0.977795

        **Minimum number of periods**

        This method also supports an optional ``min_periods`` keyword
        that specifies the required minimum number of non-NA observations for
        each column pair in order to have a valid result:

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(20, 3),
        ...                   columns=['a', 'b', 'c'])
        >>> df.loc[df.index[:5], 'a'] = np.nan
        >>> df.loc[df.index[5:10], 'b'] = np.nan
        >>> df.cov(min_periods=12)
                  a         b         c
        a  0.316741       NaN -0.150812
        b       NaN  1.248003  0.191417
        c -0.150812  0.191417  0.895202
        """
        numeric_df = self._get_numeric_data()
        cols = numeric_df.columns
        idx = cols.copy()
        mat = numeric_df.values

        if notna(mat).all():
            if min_periods is not None and min_periods > len(mat):
                baseCov = np.empty((mat.shape[1], mat.shape[1]))
                baseCov.fill(np.nan)
            else:
                baseCov = np.cov(mat.T)
            baseCov = baseCov.reshape((len(cols), len(cols)))
        else:
            baseCov = libalgos.nancorr(ensure_float64(mat), cov=True,
                                       minp=min_periods)

        return self._constructor(baseCov, index=idx, columns=cols)

    def corrwith(self, other, axis=0, drop=False, method='pearson'):
        """
        Compute pairwise correlation between rows or columns of DataFrame
        with rows or columns of Series or DataFrame. DataFrames are first
        aligned along both axes before computing the correlations.

        Parameters
        ----------
        other : DataFrame, Series
        axis : {0 or 'index', 1 or 'columns'}, default 0
            0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
        drop : boolean, default False
            Drop missing indices from result
        method : {'pearson', 'kendall', 'spearman'} or callable
            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float

            .. versionadded:: 0.24.0

        Returns
        -------
        correls : Series

        See Also
        --------
        DataFrame.corr
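
        Examples
        --------
        A minimal sketch with made-up data: identical columns correlate at
        1.0 and exactly reversed columns at -1.0.

        >>> df1 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [4, 3, 2, 1]})
        >>> df2 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1, 2, 3, 4]})
        >>> df1.corrwith(df2)
        a    1.0
        b   -1.0
        dtype: float64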
  6041. """
  6042. axis = self._get_axis_number(axis)
  6043. this = self._get_numeric_data()
  6044. if isinstance(other, Series):
  6045. return this.apply(lambda x: other.corr(x, method=method),
  6046. axis=axis)
  6047. other = other._get_numeric_data()
  6048. left, right = this.align(other, join='inner', copy=False)
  6049. if axis == 1:
  6050. left = left.T
  6051. right = right.T
  6052. if method == 'pearson':
  6053. # mask missing values
  6054. left = left + right * 0
  6055. right = right + left * 0
  6056. # demeaned data
  6057. ldem = left - left.mean()
  6058. rdem = right - right.mean()
  6059. num = (ldem * rdem).sum()
  6060. dom = (left.count() - 1) * left.std() * right.std()
  6061. correl = num / dom
  6062. elif method in ['kendall', 'spearman'] or callable(method):
  6063. def c(x):
  6064. return nanops.nancorr(x[0], x[1], method=method)
  6065. correl = Series(map(c,
  6066. zip(left.values.T, right.values.T)),
  6067. index=left.columns)
  6068. else:
  6069. raise ValueError("Invalid method {method} was passed, "
  6070. "valid methods are: 'pearson', 'kendall', "
  6071. "'spearman', or callable".
  6072. format(method=method))
  6073. if not drop:
  6074. # Find non-matching labels along the given axis
  6075. # and append missing correlations (GH 22375)
  6076. raxis = 1 if axis == 0 else 0
  6077. result_index = (this._get_axis(raxis).
  6078. union(other._get_axis(raxis)))
  6079. idx_diff = result_index.difference(correl.index)
  6080. if len(idx_diff) > 0:
  6081. correl = correl.append(Series([np.nan] * len(idx_diff),
  6082. index=idx_diff))
  6083. return correl

    # ----------------------------------------------------------------------
    # ndarray-like stats methods

    def count(self, axis=0, level=None, numeric_only=False):
        """
        Count non-NA cells for each column or row.

        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
        on `pandas.options.mode.use_inf_as_na`) are considered NA.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            If 0 or 'index' counts are generated for each column.
            If 1 or 'columns' counts are generated for each **row**.
        level : int or str, optional
            If the axis is a `MultiIndex` (hierarchical), count along a
            particular `level`, collapsing into a `DataFrame`.
            A `str` specifies the level name.
        numeric_only : boolean, default False
            Include only `float`, `int` or `boolean` data.

        Returns
        -------
        Series or DataFrame
            For each column/row the number of non-NA/null entries.
            If `level` is specified returns a `DataFrame`.

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.shape: Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = pd.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
           Person   Age  Single
        0    John  24.0   False
        1    Myla   NaN    True
        2   Lewis  21.0    True
        3    John  33.0    True
        4    Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64

        Counts for each **row**:

        >>> df.count(axis='columns')
        0    3
        1    2
        2    3
        3    3
        4    3
        dtype: int64

        Counts for one level of a `MultiIndex`:

        >>> df.set_index(["Person", "Single"]).count(level="Person")
                Age
        Person
        John      2
        Lewis     1
        Myla      1
        """
        axis = self._get_axis_number(axis)
        if level is not None:
            return self._count_level(level, axis=axis,
                                     numeric_only=numeric_only)

        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        # GH #423
        if len(frame._get_axis(axis)) == 0:
            result = Series(0, index=frame._get_agg_axis(axis))
        else:
            if frame._is_mixed_type or frame._data.any_extension_types:
                # the or any_extension_types is really only hit for single-
                # column frames with an extension array
                result = notna(frame).sum(axis=axis)
            else:
                # GH13407
                series_counts = notna(frame).sum(axis=axis)
                counts = series_counts.values
                result = Series(counts, index=frame._get_agg_axis(axis))

        return result.astype('int64')

    def _count_level(self, level, axis=0, numeric_only=False):
        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        count_axis = frame._get_axis(axis)
        agg_axis = frame._get_agg_axis(axis)

        if not isinstance(count_axis, MultiIndex):
            raise TypeError("Can only count levels on hierarchical "
                            "{ax}.".format(ax=self._get_axis_name(axis)))

        if frame._is_mixed_type:
            # Since we have mixed types, calling notna(frame.values) might
            # upcast everything to object
            mask = notna(frame).values
        else:
            # But use the speedup when we have homogeneous dtypes
            mask = notna(frame.values)

        if axis == 1:
            # We're transposing the mask rather than frame to avoid potential
            # upcasts to object, which induces a ~20x slowdown
            mask = mask.T

        if isinstance(level, compat.string_types):
            level = count_axis._get_level_number(level)

        level_index = count_axis.levels[level]
        level_codes = ensure_int64(count_axis.codes[level])
        counts = lib.count_level_2d(mask, level_codes, len(level_index),
                                    axis=0)

        result = DataFrame(counts, index=level_index, columns=agg_axis)

        if axis == 1:
            # Undo our earlier transpose
            return result.T
        else:
            return result
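
    # Hedged equivalence note (comment only): for a MultiIndex-ed frame,
    # counting on one level as computed above behaves like grouping on that
    # level, e.g. roughly
    #
    #   df.count(level='Person')  ~  df.groupby(level='Person').count()
    #
    # with the mask/transpose handling existing purely to avoid object
    # upcasts on mixed-type frames.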

    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                filter_type=None, **kwds):
        if axis is None and filter_type == 'bool':
            labels = None
            constructor = None
        else:
            # TODO: Make other agg func handle axis=None properly
            axis = self._get_axis_number(axis)
            labels = self._get_agg_axis(axis)
            constructor = self._constructor

        def f(x):
            return op(x, axis=axis, skipna=skipna, **kwds)

        # exclude timedelta/datetime unless we are uniform types
        if (axis == 1 and self._is_datelike_mixed_type
                and (not self._is_homogeneous_type
                     and not is_datetime64tz_dtype(self.dtypes[0]))):
            numeric_only = True

        if numeric_only is None:
            try:
                values = self.values
                result = f(values)

                if (filter_type == 'bool' and is_object_dtype(values) and
                        axis is None):
                    # work around https://github.com/numpy/numpy/issues/10489
                    # TODO: combine with hasattr(result, 'dtype') further down
                    # hard since we don't have `values` down there.
                    result = np.bool_(result)
            except Exception as e:

                # try by-column first
                if filter_type is None and axis == 0:
                    try:
                        # this can end up with a non-reduction
                        # but not always. if the types are mixed
                        # with datelike then need to make sure a series

                        # we only end up here if we have not specified
                        # numeric_only and yet we have tried a
                        # column-by-column reduction, where we have mixed type.
                        # So let's just do what we can
                        from pandas.core.apply import frame_apply
                        opa = frame_apply(self,
                                          func=f,
                                          result_type='expand',
                                          ignore_failures=True)
                        result = opa.get_result()
                        if result.ndim == self.ndim:
                            result = result.iloc[0]
                        return result
                    except Exception:
                        pass

                if filter_type is None or filter_type == 'numeric':
                    data = self._get_numeric_data()
                elif filter_type == 'bool':
                    data = self._get_bool_data()
                else:  # pragma: no cover
                    e = NotImplementedError(
                        "Handling exception with filter_type {f} not "
                        "implemented.".format(f=filter_type))
                    raise_with_traceback(e)
                with np.errstate(all='ignore'):
                    result = f(data.values)
                labels = data._get_agg_axis(axis)
        else:
            if numeric_only:
                if filter_type is None or filter_type == 'numeric':
                    data = self._get_numeric_data()
                elif filter_type == 'bool':
                    # GH 25101, # GH 24434
                    data = self._get_bool_data() if axis == 0 else self
                else:  # pragma: no cover
                    msg = ("Generating numeric_only data with filter_type {f} "
                           "not supported.".format(f=filter_type))
                    raise NotImplementedError(msg)
                values = data.values
                labels = data._get_agg_axis(axis)
            else:
                values = self.values
            result = f(values)

        if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
            try:
                if filter_type is None or filter_type == 'numeric':
                    result = result.astype(np.float64)
                elif filter_type == 'bool' and notna(result).all():
                    result = result.astype(np.bool_)
            except (ValueError, TypeError):
                # try to coerce to the original dtypes item by item if we can
                if axis == 0:
                    result = coerce_to_dtypes(result, self.dtypes)

        if constructor is not None:
            result = Series(result, index=labels)
        return result

    def nunique(self, axis=0, dropna=True):
        """
        Count distinct observations over requested axis.

        Return Series with number of distinct observations. Can ignore NaN
        values.

        .. versionadded:: 0.20.0

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise.
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique : Series

        See Also
        --------
        Series.nunique: Method nunique for Series.
        DataFrame.count: Count non-NA cells for each column or row.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
        >>> df.nunique()
        A    3
        B    1
        dtype: int64

        >>> df.nunique(axis=1)
        0    1
        1    2
        2    2
        dtype: int64
        """
        return self.apply(Series.nunique, axis=axis, dropna=dropna)

    def idxmin(self, axis=0, skipna=True):
        """
        Return index of first occurrence of minimum over requested axis.
        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            0 or 'index' for row-wise, 1 or 'columns' for column-wise
        skipna : boolean, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        idxmin : Series

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.
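
        Examples
        --------
        A minimal sketch with made-up data:

        >>> df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 0.5]},
        ...                   index=['x', 'y'])
        >>> df.idxmin()
        a    x
        b    y
        dtype: object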
  6353. """
  6354. axis = self._get_axis_number(axis)
  6355. indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
  6356. index = self._get_axis(axis)
  6357. result = [index[i] if i >= 0 else np.nan for i in indices]
  6358. return Series(result, index=self._get_agg_axis(axis))

    def idxmax(self, axis=0, skipna=True):
        """
        Return index of first occurrence of maximum over requested axis.
        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            0 or 'index' for row-wise, 1 or 'columns' for column-wise
        skipna : boolean, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        idxmax : Series

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.
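
        Examples
        --------
        A minimal sketch with the same made-up frame as in ``idxmin``:

        >>> df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 0.5]},
        ...                   index=['x', 'y'])
        >>> df.idxmax()
        a    y
        b    x
        dtype: object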
  6383. """
  6384. axis = self._get_axis_number(axis)
  6385. indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
  6386. index = self._get_axis(axis)
  6387. result = [index[i] if i >= 0 else np.nan for i in indices]
  6388. return Series(result, index=self._get_agg_axis(axis))

    def _get_agg_axis(self, axis_num):
        """
        Let's be explicit about this.
        """
        if axis_num == 0:
            return self.columns
        elif axis_num == 1:
            return self.index
        else:
            raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)

    def mode(self, axis=0, numeric_only=False, dropna=True):
        """
        Get the mode(s) of each element along the selected axis.

        The mode of a set of values is the value that appears most often.
        It can be multiple values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to iterate over while searching for the mode:

            * 0 or 'index' : get mode of each column
            * 1 or 'columns' : get mode of each row
        numeric_only : bool, default False
            If True, only apply to numeric columns.
        dropna : bool, default True
            Don't consider counts of NaN/NaT.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame
            The modes of each column or row.

        See Also
        --------
        Series.mode : Return the highest frequency value in a Series.
        Series.value_counts : Return the counts of values in a Series.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 2, 2),
        ...                    ('mammal', 4, np.nan),
        ...                    ('arthropod', 8, 0),
        ...                    ('bird', 2, np.nan)],
        ...                   index=('falcon', 'horse', 'spider', 'ostrich'),
        ...                   columns=('species', 'legs', 'wings'))
        >>> df
                   species  legs  wings
        falcon        bird     2    2.0
        horse       mammal     4    NaN
        spider   arthropod     8    0.0
        ostrich       bird     2    NaN

        By default, missing values are not considered, and the modes of
        wings are both 0 and 2. The second row of species and legs
        contains ``NaN``, because those columns have only one mode, but
        the resulting DataFrame has two rows.

        >>> df.mode()
          species  legs  wings
        0    bird   2.0    0.0
        1     NaN   NaN    2.0

        With ``dropna=False``, ``NaN`` values are considered, and they can
        be the mode (as for wings).

        >>> df.mode(dropna=False)
          species  legs  wings
        0    bird     2    NaN

        With ``numeric_only=True``, only the mode of numeric columns is
        computed, and columns of other types are ignored.

        >>> df.mode(numeric_only=True)
           legs  wings
        0   2.0    0.0
        1   NaN    2.0

        To compute the mode over columns and not rows, use the axis
        parameter:

        >>> df.mode(axis='columns', numeric_only=True)
                   0    1
        falcon   2.0  NaN
        horse    4.0  NaN
        spider   0.0  8.0
        ostrich  2.0  NaN
        """
        data = self if not numeric_only else self._get_numeric_data()

        def f(s):
            return s.mode(dropna=dropna)

        return data.apply(f, axis=axis)

    def quantile(self, q=0.5, axis=0, numeric_only=True,
                 interpolation='linear'):
        """
        Return values at the given quantile over requested axis.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value between 0 <= q <= 1, the quantile(s) to compute.
        axis : {0, 1, 'index', 'columns'} (default 0)
            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        numeric_only : bool, default True
            If False, the quantile of datetime and timedelta data will be
            computed as well.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

            * linear: `i + (j - i) * fraction`, where `fraction` is the
              fractional part of the index surrounded by `i` and `j`.
            * lower: `i`.
            * higher: `j`.
            * nearest: `i` or `j` whichever is nearest.
            * midpoint: (`i` + `j`) / 2.

            .. versionadded:: 0.18.0
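
            For instance, column ``a`` in the first example below holds the
            sorted values ``[1, 2, 3, 4]``; for ``q=0.1`` the target lies at
            position ``0.1 * (4 - 1) = 0.3``, so ``linear`` returns
            ``1 + (2 - 1) * 0.3 = 1.3``.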

        Returns
        -------
        quantiles : Series or DataFrame

            - If ``q`` is an array, a DataFrame will be returned where the
              index is ``q``, the columns are the columns of self, and the
              values are the quantiles.
            - If ``q`` is a float, a Series will be returned where the
              index is the columns of self and the values are the quantiles.

        See Also
        --------
        core.window.Rolling.quantile: Rolling quantile.
        numpy.percentile: Numpy function to compute the percentile.

        Examples
        --------
        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
        ...                   columns=['a', 'b'])
        >>> df.quantile(.1)
        a    1.3
        b    3.7
        Name: 0.1, dtype: float64
        >>> df.quantile([.1, .5])
               a     b
        0.1  1.3   3.7
        0.5  2.5  55.0

        Specifying `numeric_only=False` will also compute the quantile of
        datetime and timedelta data.

        >>> df = pd.DataFrame({'A': [1, 2],
        ...                    'B': [pd.Timestamp('2010'),
        ...                          pd.Timestamp('2011')],
        ...                    'C': [pd.Timedelta('1 days'),
        ...                          pd.Timedelta('2 days')]})
        >>> df.quantile(0.5, numeric_only=False)
        A                    1.5
        B    2010-07-02 12:00:00
        C        1 days 12:00:00
        Name: 0.5, dtype: object
        """
        self._check_percentile(q)

        data = self._get_numeric_data() if numeric_only else self
        axis = self._get_axis_number(axis)
        is_transposed = axis == 1

        if is_transposed:
            data = data.T

        result = data._data.quantile(qs=q,
                                     axis=1,
                                     interpolation=interpolation,
                                     transposed=is_transposed)

        if result.ndim == 2:
            result = self._constructor(result)
        else:
            result = self._constructor_sliced(result, name=q)

        if is_transposed:
            result = result.T

        return result

    def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
        """
        Cast to DatetimeIndex of timestamps, at *beginning* of period.

        Parameters
        ----------
        freq : string, default frequency of PeriodIndex
            Desired frequency
        how : {'s', 'e', 'start', 'end'}
            Convention for converting period to timestamp; start of period
            vs. end
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default)
        copy : boolean, default True
            If False then underlying input data is not copied

        Returns
        -------
        df : DataFrame with DatetimeIndex
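
        Examples
        --------
        A small sketch with made-up data; the index and the ``sales``
        column are illustrative only:

        >>> idx = pd.period_range('2017', periods=2, freq='A')
        >>> df = pd.DataFrame({'sales': [10, 20]}, index=idx)  # hypothetical
        >>> df.to_timestamp()
                    sales
        2017-01-01     10
        2018-01-01     20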
  6561. """
  6562. new_data = self._data
  6563. if copy:
  6564. new_data = new_data.copy()
  6565. axis = self._get_axis_number(axis)
  6566. if axis == 0:
  6567. new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
  6568. elif axis == 1:
  6569. new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
  6570. else: # pragma: no cover
  6571. raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
  6572. ax=axis))
  6573. return self._constructor(new_data)

    def to_period(self, freq=None, axis=0, copy=True):
        """
        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
        frequency (inferred from index if not passed).

        Parameters
        ----------
        freq : string, default None
            Desired frequency; if not passed, it is inferred from the index.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default)
        copy : boolean, default True
            If False then underlying input data is not copied

        Returns
        -------
        df : DataFrame with PeriodIndex
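
        Examples
        --------
        A small sketch with made-up data; the index and the ``sales``
        column are illustrative only:

        >>> idx = pd.date_range('2017-01-01', periods=2, freq='A')
        >>> df = pd.DataFrame({'sales': [10, 20]}, index=idx)  # hypothetical
        >>> df.to_period()
              sales
        2017     10
        2018     20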
  6588. """
  6589. new_data = self._data
  6590. if copy:
  6591. new_data = new_data.copy()
  6592. axis = self._get_axis_number(axis)
  6593. if axis == 0:
  6594. new_data.set_axis(1, self.index.to_period(freq=freq))
  6595. elif axis == 1:
  6596. new_data.set_axis(0, self.columns.to_period(freq=freq))
  6597. else: # pragma: no cover
  6598. raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
  6599. ax=axis))
  6600. return self._constructor(new_data)

    def isin(self, values):
        """
        Whether each element in the DataFrame is contained in values.

        Parameters
        ----------
        values : iterable, Series, DataFrame or dict
            The result will only be true at a location if all the
            labels match. If `values` is a Series, that's the index. If
            `values` is a dict, the keys must be the column names,
            which must match. If `values` is a DataFrame,
            then both the index and column labels must match.

        Returns
        -------
        DataFrame
            DataFrame of booleans showing whether each element in the
            DataFrame is contained in values.

        See Also
        --------
        DataFrame.eq: Equality test for DataFrame.
        Series.isin: Equivalent method on Series.
        Series.str.contains: Test if pattern or regex is contained within a
            string of a Series or Index.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
        ...                   index=['falcon', 'dog'])
        >>> df
                num_legs  num_wings
        falcon         2          2
        dog            4          0

        When ``values`` is a list, check whether every value in the DataFrame
        is present in the list (which animals have 0 or 2 legs or wings):

        >>> df.isin([0, 2])
                num_legs  num_wings
        falcon      True       True
        dog        False       True

        When ``values`` is a dict, we can pass values to check for each
        column separately:

        >>> df.isin({'num_wings': [0, 3]})
                num_legs  num_wings
        falcon     False      False
        dog        False       True

        When ``values`` is a Series or DataFrame, the index and column
        labels must match. Here 'falcon' matches in both columns, but 'dog'
        does not: ``other`` has no 'dog' row, so after alignment there is
        nothing to compare and the result is ``False``.

        >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
        ...                      index=['spider', 'falcon'])
        >>> df.isin(other)
                num_legs  num_wings
        falcon      True       True
        dog        False      False
  6652. """
  6653. if isinstance(values, dict):
  6654. from pandas.core.reshape.concat import concat
  6655. values = collections.defaultdict(list, values)
  6656. return concat((self.iloc[:, [i]].isin(values[col])
  6657. for i, col in enumerate(self.columns)), axis=1)
  6658. elif isinstance(values, Series):
  6659. if not values.index.is_unique:
  6660. raise ValueError("cannot compute isin with "
  6661. "a duplicate axis.")
  6662. return self.eq(values.reindex_like(self), axis='index')
  6663. elif isinstance(values, DataFrame):
  6664. if not (values.columns.is_unique and values.index.is_unique):
  6665. raise ValueError("cannot compute isin with "
  6666. "a duplicate axis.")
  6667. return self.eq(values.reindex_like(self))
  6668. else:
  6669. if not is_list_like(values):
  6670. raise TypeError("only list-like or dict-like objects are "
  6671. "allowed to be passed to DataFrame.isin(), "
  6672. "you passed a "
  6673. "{0!r}".format(type(values).__name__))
  6674. return DataFrame(
  6675. algorithms.isin(self.values.ravel(),
  6676. values).reshape(self.shape), self.index,
  6677. self.columns)

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", gfx.FramePlotMethods)
    hist = gfx.hist_frame
    boxplot = gfx.boxplot_frame


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
                      axes_are_reversed=True, aliases={'rows': 0},
                      docs={
                          'index': 'The index (row labels) of the DataFrame.',
                          'columns': 'The column labels of the DataFrame.'})

DataFrame._add_numeric_operations()
DataFrame._add_series_or_dataframe_operations()

ops.add_flex_arithmetic_methods(DataFrame)
ops.add_special_arithmetic_methods(DataFrame)


def _from_nested_dict(data):
    # TODO: this should be seriously cythonized
    # Transpose a dict-of-dicts from {index: {column: value}} into
    # {column: {index: value}}, preserving insertion order.
    new_data = OrderedDict()
    for index, s in compat.iteritems(data):
        for col, v in compat.iteritems(s):
            new_data[col] = new_data.get(col, OrderedDict())
            new_data[col][index] = v
    return new_data


def _put_str(s, space):
    # Render `s` as text, truncated or padded to exactly `space` characters
    # (e.g. _put_str('species', 5) -> 'speci', _put_str('a', 3) -> 'a  ').
    return u'{s}'.format(s=s)[:space].ljust(space)