hierarchy.py 145 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186
"""
========================================================
Hierarchical clustering (:mod:`scipy.cluster.hierarchy`)
========================================================

.. currentmodule:: scipy.cluster.hierarchy

These functions cut hierarchical clusterings into flat clusterings
or find the roots of the forest formed by a cut by providing the flat
cluster ids of each observation.

.. autosummary::
   :toctree: generated/

   fcluster
   fclusterdata
   leaders

These are routines for agglomerative clustering.

.. autosummary::
   :toctree: generated/

   linkage
   single
   complete
   average
   weighted
   centroid
   median
   ward

These routines compute statistics on hierarchies.

.. autosummary::
   :toctree: generated/

   cophenet
   from_mlab_linkage
   inconsistent
   maxinconsts
   maxdists
   maxRstat
   to_mlab_linkage

Routines for visualizing flat clusters.

.. autosummary::
   :toctree: generated/

   dendrogram

These are data structures and routines for representing hierarchies as
tree objects.

.. autosummary::
   :toctree: generated/

   ClusterNode
   leaves_list
   to_tree
   cut_tree
   optimal_leaf_ordering

These are predicates for checking the validity of linkage and
inconsistency matrices as well as for checking isomorphism of two
flat cluster assignments.

.. autosummary::
   :toctree: generated/

   is_valid_im
   is_valid_linkage
   is_isomorphic
   is_monotonic
   correspond
   num_obs_linkage

Utility routines for plotting:

.. autosummary::
   :toctree: generated/

   set_link_color_palette

References
----------
.. [1] "Statistics toolbox." API Reference Documentation. The MathWorks.
   https://www.mathworks.com/access/helpdesk/help/toolbox/stats/.
   Accessed October 1, 2007.
.. [2] "Hierarchical clustering." API Reference Documentation.
   The Wolfram Research, Inc.
   https://reference.wolfram.com/language/HierarchicalClustering/tutorial/HierarchicalClustering.html.
   Accessed October 1, 2007.
.. [3] Gower, JC and Ross, GJS. "Minimum Spanning Trees and Single Linkage
   Cluster Analysis." Applied Statistics. 18(1): pp. 54--64. 1969.
.. [4] Ward Jr, JH. "Hierarchical grouping to optimize an objective
   function." Journal of the American Statistical Association. 58(301):
   pp. 236--44. 1963.
.. [5] Johnson, SC. "Hierarchical clustering schemes." Psychometrika.
   32(2): pp. 241--54. 1966.
.. [6] Sneath, PH and Sokal, RR. "Numerical taxonomy." Nature. 193: pp.
   855--60. 1962.
.. [7] Batagelj, V. "Comparing resemblance measures." Journal of
   Classification. 12: pp. 73--90. 1995.
.. [8] Sokal, RR and Michener, CD. "A statistical method for evaluating
   systematic relationships." Scientific Bulletins. 38(22):
   pp. 1409--38. 1958.
.. [9] Edelbrock, C. "Mixture model tests of hierarchical clustering
   algorithms: the problem of classifying everybody." Multivariate
   Behavioral Research. 14: pp. 367--84. 1979.
.. [10] Jain, A., and Dubes, R., "Algorithms for Clustering Data."
   Prentice-Hall. Englewood Cliffs, NJ. 1988.
.. [11] Fisher, RA "The use of multiple measurements in taxonomic
   problems." Annals of Eugenics, 7(2): 179-188. 1936

* MATLAB and MathWorks are registered trademarks of The MathWorks, Inc.
* Mathematica is a registered trademark of The Wolfram Research, Inc.
"""
  96. from __future__ import division, print_function, absolute_import
# Copyright (C) Damian Eads, 2007-2008. New BSD License.
# hierarchy.py (derived from cluster.py, http://scipy-cluster.googlecode.com)
#
# Author: Damian Eads
# Date: September 22, 2007
#
# Copyright (c) 2007, 2008, Damian Eads
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#   - Redistributions of source code must retain the above
#     copyright notice, this list of conditions and the
#     following disclaimer.
#   - Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer
#     in the documentation and/or other materials provided with the
#     distribution.
#   - Neither the name of the author nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  132. import warnings
  133. import bisect
  134. from collections import deque
  135. import numpy as np
  136. from . import _hierarchy, _optimal_leaf_ordering
  137. import scipy.spatial.distance as distance
  138. from scipy._lib.six import string_types
  139. from scipy._lib.six import xrange
  140. _LINKAGE_METHODS = {'single': 0, 'complete': 1, 'average': 2, 'centroid': 3,
  141. 'median': 4, 'ward': 5, 'weighted': 6}
  142. _EUCLIDEAN_METHODS = ('centroid', 'median', 'ward')
  143. __all__ = ['ClusterNode', 'average', 'centroid', 'complete', 'cophenet',
  144. 'correspond', 'cut_tree', 'dendrogram', 'fcluster', 'fclusterdata',
  145. 'from_mlab_linkage', 'inconsistent', 'is_isomorphic',
  146. 'is_monotonic', 'is_valid_im', 'is_valid_linkage', 'leaders',
  147. 'leaves_list', 'linkage', 'maxRstat', 'maxdists', 'maxinconsts',
  148. 'median', 'num_obs_linkage', 'optimal_leaf_ordering',
  149. 'set_link_color_palette', 'single', 'to_mlab_linkage', 'to_tree',
  150. 'ward', 'weighted', 'distance']
  151. class ClusterWarning(UserWarning):
  152. pass
  153. def _warning(s):
  154. warnings.warn('scipy.cluster: %s' % s, ClusterWarning, stacklevel=3)
  155. def _copy_array_if_base_present(a):
  156. """
  157. Copy the array if its base points to a parent array.
  158. """
  159. if a.base is not None:
  160. return a.copy()
  161. elif np.issubsctype(a, np.float32):
  162. return np.array(a, dtype=np.double)
  163. else:
  164. return a
  165. def _copy_arrays_if_base_present(T):
  166. """
  167. Accept a tuple of arrays T. Copies the array T[i] if its base array
  168. points to an actual array. Otherwise, the reference is just copied.
  169. This is useful if the arrays are being passed to a C function that
  170. does not do proper striding.
  171. """
  172. l = [_copy_array_if_base_present(a) for a in T]
  173. return l
  174. def _randdm(pnts):
  175. """
  176. Generate a random distance matrix stored in condensed form.
  177. Parameters
  178. ----------
  179. pnts : int
  180. The number of points in the distance matrix. Has to be at least 2.
  181. Returns
  182. -------
  183. D : ndarray
  184. A ``pnts * (pnts - 1) / 2`` sized vector is returned.
  185. """
  186. if pnts >= 2:
  187. D = np.random.rand(pnts * (pnts - 1) / 2)
  188. else:
  189. raise ValueError("The number of points in the distance matrix "
  190. "must be at least 2.")
  191. return D
  192. def single(y):
  193. """
  194. Perform single/min/nearest linkage on the condensed distance matrix ``y``.
  195. Parameters
  196. ----------
  197. y : ndarray
  198. The upper triangular of the distance matrix. The result of
  199. ``pdist`` is returned in this form.
  200. Returns
  201. -------
  202. Z : ndarray
  203. The linkage matrix.
  204. See Also
  205. --------
  206. linkage: for advanced creation of hierarchical clusterings.
  207. scipy.spatial.distance.pdist : pairwise distance metrics
  208. Examples
  209. --------
  210. >>> from scipy.cluster.hierarchy import single, fcluster
  211. >>> from scipy.spatial.distance import pdist
  212. First we need a toy dataset to play with::
  213. x x x x
  214. x x
  215. x x
  216. x x x x
  217. >>> X = [[0, 0], [0, 1], [1, 0],
  218. ... [0, 4], [0, 3], [1, 4],
  219. ... [4, 0], [3, 0], [4, 1],
  220. ... [4, 4], [3, 4], [4, 3]]
  221. Then we get a condensed distance matrix from this dataset:
  222. >>> y = pdist(X)
  223. Finally, we can perform the clustering:
  224. >>> Z = single(y)
  225. >>> Z
  226. array([[ 0., 1., 1., 2.],
  227. [ 2., 12., 1., 3.],
  228. [ 3., 4., 1., 2.],
  229. [ 5., 14., 1., 3.],
  230. [ 6., 7., 1., 2.],
  231. [ 8., 16., 1., 3.],
  232. [ 9., 10., 1., 2.],
  233. [11., 18., 1., 3.],
  234. [13., 15., 2., 6.],
  235. [17., 20., 2., 9.],
  236. [19., 21., 2., 12.]])
  237. The linkage matrix ``Z`` represents a dendrogram - see
  238. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  239. contents.
  240. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  241. each initial point would belong given a distance threshold:
  242. >>> fcluster(Z, 0.9, criterion='distance')
  243. array([ 7, 8, 9, 10, 11, 12, 4, 5, 6, 1, 2, 3], dtype=int32)
  244. >>> fcluster(Z, 1, criterion='distance')
  245. array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
  246. >>> fcluster(Z, 2, criterion='distance')
  247. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  248. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  249. plot of the dendrogram.
  250. """
  251. return linkage(y, method='single', metric='euclidean')
  252. def complete(y):
  253. """
  254. Perform complete/max/farthest point linkage on a condensed distance matrix.
  255. Parameters
  256. ----------
  257. y : ndarray
  258. The upper triangular of the distance matrix. The result of
  259. ``pdist`` is returned in this form.
  260. Returns
  261. -------
  262. Z : ndarray
  263. A linkage matrix containing the hierarchical clustering. See
  264. the `linkage` function documentation for more information
  265. on its structure.
  266. See Also
  267. --------
  268. linkage: for advanced creation of hierarchical clusterings.
  269. scipy.spatial.distance.pdist : pairwise distance metrics
  270. Examples
  271. --------
  272. >>> from scipy.cluster.hierarchy import complete, fcluster
  273. >>> from scipy.spatial.distance import pdist
  274. First we need a toy dataset to play with::
  275. x x x x
  276. x x
  277. x x
  278. x x x x
  279. >>> X = [[0, 0], [0, 1], [1, 0],
  280. ... [0, 4], [0, 3], [1, 4],
  281. ... [4, 0], [3, 0], [4, 1],
  282. ... [4, 4], [3, 4], [4, 3]]
  283. Then we get a condensed distance matrix from this dataset:
  284. >>> y = pdist(X)
  285. Finally, we can perform the clustering:
  286. >>> Z = complete(y)
  287. >>> Z
  288. array([[ 0. , 1. , 1. , 2. ],
  289. [ 3. , 4. , 1. , 2. ],
  290. [ 6. , 7. , 1. , 2. ],
  291. [ 9. , 10. , 1. , 2. ],
  292. [ 2. , 12. , 1.41421356, 3. ],
  293. [ 5. , 13. , 1.41421356, 3. ],
  294. [ 8. , 14. , 1.41421356, 3. ],
  295. [11. , 15. , 1.41421356, 3. ],
  296. [16. , 17. , 4.12310563, 6. ],
  297. [18. , 19. , 4.12310563, 6. ],
  298. [20. , 21. , 5.65685425, 12. ]])
  299. The linkage matrix ``Z`` represents a dendrogram - see
  300. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  301. contents.
  302. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  303. each initial point would belong given a distance threshold:
  304. >>> fcluster(Z, 0.9, criterion='distance')
  305. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  306. >>> fcluster(Z, 1.5, criterion='distance')
  307. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  308. >>> fcluster(Z, 4.5, criterion='distance')
  309. array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)
  310. >>> fcluster(Z, 6, criterion='distance')
  311. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  312. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  313. plot of the dendrogram.
  314. """
  315. return linkage(y, method='complete', metric='euclidean')
  316. def average(y):
  317. """
  318. Perform average/UPGMA linkage on a condensed distance matrix.
  319. Parameters
  320. ----------
  321. y : ndarray
  322. The upper triangular of the distance matrix. The result of
  323. ``pdist`` is returned in this form.
  324. Returns
  325. -------
  326. Z : ndarray
  327. A linkage matrix containing the hierarchical clustering. See
  328. `linkage` for more information on its structure.
  329. See Also
  330. --------
  331. linkage: for advanced creation of hierarchical clusterings.
  332. scipy.spatial.distance.pdist : pairwise distance metrics
  333. Examples
  334. --------
  335. >>> from scipy.cluster.hierarchy import average, fcluster
  336. >>> from scipy.spatial.distance import pdist
  337. First we need a toy dataset to play with::
  338. x x x x
  339. x x
  340. x x
  341. x x x x
  342. >>> X = [[0, 0], [0, 1], [1, 0],
  343. ... [0, 4], [0, 3], [1, 4],
  344. ... [4, 0], [3, 0], [4, 1],
  345. ... [4, 4], [3, 4], [4, 3]]
  346. Then we get a condensed distance matrix from this dataset:
  347. >>> y = pdist(X)
  348. Finally, we can perform the clustering:
  349. >>> Z = average(y)
  350. >>> Z
  351. array([[ 0. , 1. , 1. , 2. ],
  352. [ 3. , 4. , 1. , 2. ],
  353. [ 6. , 7. , 1. , 2. ],
  354. [ 9. , 10. , 1. , 2. ],
  355. [ 2. , 12. , 1.20710678, 3. ],
  356. [ 5. , 13. , 1.20710678, 3. ],
  357. [ 8. , 14. , 1.20710678, 3. ],
  358. [11. , 15. , 1.20710678, 3. ],
  359. [16. , 17. , 3.39675184, 6. ],
  360. [18. , 19. , 3.39675184, 6. ],
  361. [20. , 21. , 4.09206523, 12. ]])
  362. The linkage matrix ``Z`` represents a dendrogram - see
  363. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  364. contents.
  365. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  366. each initial point would belong given a distance threshold:
  367. >>> fcluster(Z, 0.9, criterion='distance')
  368. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  369. >>> fcluster(Z, 1.5, criterion='distance')
  370. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  371. >>> fcluster(Z, 4, criterion='distance')
  372. array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)
  373. >>> fcluster(Z, 6, criterion='distance')
  374. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  375. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  376. plot of the dendrogram.
  377. """
  378. return linkage(y, method='average', metric='euclidean')
  379. def weighted(y):
  380. """
  381. Perform weighted/WPGMA linkage on the condensed distance matrix.
  382. See `linkage` for more information on the return
  383. structure and algorithm.
  384. Parameters
  385. ----------
  386. y : ndarray
  387. The upper triangular of the distance matrix. The result of
  388. ``pdist`` is returned in this form.
  389. Returns
  390. -------
  391. Z : ndarray
  392. A linkage matrix containing the hierarchical clustering. See
  393. `linkage` for more information on its structure.
  394. See Also
  395. --------
  396. linkage : for advanced creation of hierarchical clusterings.
  397. scipy.spatial.distance.pdist : pairwise distance metrics
  398. Examples
  399. --------
  400. >>> from scipy.cluster.hierarchy import weighted, fcluster
  401. >>> from scipy.spatial.distance import pdist
  402. First we need a toy dataset to play with::
  403. x x x x
  404. x x
  405. x x
  406. x x x x
  407. >>> X = [[0, 0], [0, 1], [1, 0],
  408. ... [0, 4], [0, 3], [1, 4],
  409. ... [4, 0], [3, 0], [4, 1],
  410. ... [4, 4], [3, 4], [4, 3]]
  411. Then we get a condensed distance matrix from this dataset:
  412. >>> y = pdist(X)
  413. Finally, we can perform the clustering:
  414. >>> Z = weighted(y)
  415. >>> Z
  416. array([[ 0. , 1. , 1. , 2. ],
  417. [ 6. , 7. , 1. , 2. ],
  418. [ 3. , 4. , 1. , 2. ],
  419. [ 9. , 11. , 1. , 2. ],
  420. [ 2. , 12. , 1.20710678, 3. ],
  421. [ 8. , 13. , 1.20710678, 3. ],
  422. [ 5. , 14. , 1.20710678, 3. ],
  423. [10. , 15. , 1.20710678, 3. ],
  424. [18. , 19. , 3.05595762, 6. ],
  425. [16. , 17. , 3.32379407, 6. ],
  426. [20. , 21. , 4.06357713, 12. ]])
  427. The linkage matrix ``Z`` represents a dendrogram - see
  428. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  429. contents.
  430. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  431. each initial point would belong given a distance threshold:
  432. >>> fcluster(Z, 0.9, criterion='distance')
  433. array([ 7, 8, 9, 1, 2, 3, 10, 11, 12, 4, 6, 5], dtype=int32)
  434. >>> fcluster(Z, 1.5, criterion='distance')
  435. array([3, 3, 3, 1, 1, 1, 4, 4, 4, 2, 2, 2], dtype=int32)
  436. >>> fcluster(Z, 4, criterion='distance')
  437. array([2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1], dtype=int32)
  438. >>> fcluster(Z, 6, criterion='distance')
  439. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  440. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  441. plot of the dendrogram.
  442. """
  443. return linkage(y, method='weighted', metric='euclidean')
  444. def centroid(y):
  445. """
  446. Perform centroid/UPGMC linkage.
  447. See `linkage` for more information on the input matrix,
  448. return structure, and algorithm.
  449. The following are common calling conventions:
  450. 1. ``Z = centroid(y)``
  451. Performs centroid/UPGMC linkage on the condensed distance
  452. matrix ``y``.
  453. 2. ``Z = centroid(X)``
  454. Performs centroid/UPGMC linkage on the observation matrix ``X``
  455. using Euclidean distance as the distance metric.
  456. Parameters
  457. ----------
  458. y : ndarray
  459. A condensed distance matrix. A condensed
  460. distance matrix is a flat array containing the upper
  461. triangular of the distance matrix. This is the form that
  462. ``pdist`` returns. Alternatively, a collection of
  463. m observation vectors in n dimensions may be passed as
  464. a m by n array.
  465. Returns
  466. -------
  467. Z : ndarray
  468. A linkage matrix containing the hierarchical clustering. See
  469. the `linkage` function documentation for more information
  470. on its structure.
  471. See Also
  472. --------
  473. linkage: for advanced creation of hierarchical clusterings.
  474. scipy.spatial.distance.pdist : pairwise distance metrics
  475. Examples
  476. --------
  477. >>> from scipy.cluster.hierarchy import centroid, fcluster
  478. >>> from scipy.spatial.distance import pdist
  479. First we need a toy dataset to play with::
  480. x x x x
  481. x x
  482. x x
  483. x x x x
  484. >>> X = [[0, 0], [0, 1], [1, 0],
  485. ... [0, 4], [0, 3], [1, 4],
  486. ... [4, 0], [3, 0], [4, 1],
  487. ... [4, 4], [3, 4], [4, 3]]
  488. Then we get a condensed distance matrix from this dataset:
  489. >>> y = pdist(X)
  490. Finally, we can perform the clustering:
  491. >>> Z = centroid(y)
  492. >>> Z
  493. array([[ 0. , 1. , 1. , 2. ],
  494. [ 3. , 4. , 1. , 2. ],
  495. [ 9. , 10. , 1. , 2. ],
  496. [ 6. , 7. , 1. , 2. ],
  497. [ 2. , 12. , 1.11803399, 3. ],
  498. [ 5. , 13. , 1.11803399, 3. ],
  499. [ 8. , 15. , 1.11803399, 3. ],
  500. [11. , 14. , 1.11803399, 3. ],
  501. [18. , 19. , 3.33333333, 6. ],
  502. [16. , 17. , 3.33333333, 6. ],
  503. [20. , 21. , 3.33333333, 12. ]])
  504. The linkage matrix ``Z`` represents a dendrogram - see
  505. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  506. contents.
  507. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  508. each initial point would belong given a distance threshold:
  509. >>> fcluster(Z, 0.9, criterion='distance')
  510. array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32)
  511. >>> fcluster(Z, 1.1, criterion='distance')
  512. array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32)
  513. >>> fcluster(Z, 2, criterion='distance')
  514. array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32)
  515. >>> fcluster(Z, 4, criterion='distance')
  516. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  517. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  518. plot of the dendrogram.
  519. """
  520. return linkage(y, method='centroid', metric='euclidean')
  521. def median(y):
  522. """
  523. Perform median/WPGMC linkage.
  524. See `linkage` for more information on the return structure
  525. and algorithm.
  526. The following are common calling conventions:
  527. 1. ``Z = median(y)``
  528. Performs median/WPGMC linkage on the condensed distance matrix
  529. ``y``. See ``linkage`` for more information on the return
  530. structure and algorithm.
  531. 2. ``Z = median(X)``
  532. Performs median/WPGMC linkage on the observation matrix ``X``
  533. using Euclidean distance as the distance metric. See `linkage`
  534. for more information on the return structure and algorithm.
  535. Parameters
  536. ----------
  537. y : ndarray
  538. A condensed distance matrix. A condensed
  539. distance matrix is a flat array containing the upper
  540. triangular of the distance matrix. This is the form that
  541. ``pdist`` returns. Alternatively, a collection of
  542. m observation vectors in n dimensions may be passed as
  543. a m by n array.
  544. Returns
  545. -------
  546. Z : ndarray
  547. The hierarchical clustering encoded as a linkage matrix.
  548. See Also
  549. --------
  550. linkage: for advanced creation of hierarchical clusterings.
  551. scipy.spatial.distance.pdist : pairwise distance metrics
  552. Examples
  553. --------
  554. >>> from scipy.cluster.hierarchy import median, fcluster
  555. >>> from scipy.spatial.distance import pdist
  556. First we need a toy dataset to play with::
  557. x x x x
  558. x x
  559. x x
  560. x x x x
  561. >>> X = [[0, 0], [0, 1], [1, 0],
  562. ... [0, 4], [0, 3], [1, 4],
  563. ... [4, 0], [3, 0], [4, 1],
  564. ... [4, 4], [3, 4], [4, 3]]
  565. Then we get a condensed distance matrix from this dataset:
  566. >>> y = pdist(X)
  567. Finally, we can perform the clustering:
  568. >>> Z = median(y)
  569. >>> Z
  570. array([[ 0. , 1. , 1. , 2. ],
  571. [ 3. , 4. , 1. , 2. ],
  572. [ 9. , 10. , 1. , 2. ],
  573. [ 6. , 7. , 1. , 2. ],
  574. [ 2. , 12. , 1.11803399, 3. ],
  575. [ 5. , 13. , 1.11803399, 3. ],
  576. [ 8. , 15. , 1.11803399, 3. ],
  577. [11. , 14. , 1.11803399, 3. ],
  578. [18. , 19. , 3. , 6. ],
  579. [16. , 17. , 3.5 , 6. ],
  580. [20. , 21. , 3.25 , 12. ]])
  581. The linkage matrix ``Z`` represents a dendrogram - see
  582. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  583. contents.
  584. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  585. each initial point would belong given a distance threshold:
  586. >>> fcluster(Z, 0.9, criterion='distance')
  587. array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32)
  588. >>> fcluster(Z, 1.1, criterion='distance')
  589. array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32)
  590. >>> fcluster(Z, 2, criterion='distance')
  591. array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32)
  592. >>> fcluster(Z, 4, criterion='distance')
  593. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  594. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  595. plot of the dendrogram.
  596. """
  597. return linkage(y, method='median', metric='euclidean')
  598. def ward(y):
  599. """
  600. Perform Ward's linkage on a condensed distance matrix.
  601. See `linkage` for more information on the return structure
  602. and algorithm.
  603. The following are common calling conventions:
  604. 1. ``Z = ward(y)``
  605. Performs Ward's linkage on the condensed distance matrix ``y``.
  606. 2. ``Z = ward(X)``
  607. Performs Ward's linkage on the observation matrix ``X`` using
  608. Euclidean distance as the distance metric.
  609. Parameters
  610. ----------
  611. y : ndarray
  612. A condensed distance matrix. A condensed
  613. distance matrix is a flat array containing the upper
  614. triangular of the distance matrix. This is the form that
  615. ``pdist`` returns. Alternatively, a collection of
  616. m observation vectors in n dimensions may be passed as
  617. a m by n array.
  618. Returns
  619. -------
  620. Z : ndarray
  621. The hierarchical clustering encoded as a linkage matrix. See
  622. `linkage` for more information on the return structure and
  623. algorithm.
  624. See Also
  625. --------
  626. linkage: for advanced creation of hierarchical clusterings.
  627. scipy.spatial.distance.pdist : pairwise distance metrics
  628. Examples
  629. --------
  630. >>> from scipy.cluster.hierarchy import ward, fcluster
  631. >>> from scipy.spatial.distance import pdist
  632. First we need a toy dataset to play with::
  633. x x x x
  634. x x
  635. x x
  636. x x x x
  637. >>> X = [[0, 0], [0, 1], [1, 0],
  638. ... [0, 4], [0, 3], [1, 4],
  639. ... [4, 0], [3, 0], [4, 1],
  640. ... [4, 4], [3, 4], [4, 3]]
  641. Then we get a condensed distance matrix from this dataset:
  642. >>> y = pdist(X)
  643. Finally, we can perform the clustering:
  644. >>> Z = ward(y)
  645. >>> Z
  646. array([[ 0. , 1. , 1. , 2. ],
  647. [ 3. , 4. , 1. , 2. ],
  648. [ 6. , 7. , 1. , 2. ],
  649. [ 9. , 10. , 1. , 2. ],
  650. [ 2. , 12. , 1.29099445, 3. ],
  651. [ 5. , 13. , 1.29099445, 3. ],
  652. [ 8. , 14. , 1.29099445, 3. ],
  653. [11. , 15. , 1.29099445, 3. ],
  654. [16. , 17. , 5.77350269, 6. ],
  655. [18. , 19. , 5.77350269, 6. ],
  656. [20. , 21. , 8.16496581, 12. ]])
  657. The linkage matrix ``Z`` represents a dendrogram - see
  658. `scipy.cluster.hierarchy.linkage` for a detailed explanation of its
  659. contents.
  660. We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster
  661. each initial point would belong given a distance threshold:
  662. >>> fcluster(Z, 0.9, criterion='distance')
  663. array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32)
  664. >>> fcluster(Z, 1.1, criterion='distance')
  665. array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)
  666. >>> fcluster(Z, 3, criterion='distance')
  667. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  668. >>> fcluster(Z, 9, criterion='distance')
  669. array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
  670. Also `scipy.cluster.hierarchy.dendrogram` can be used to generate a
  671. plot of the dendrogram.
  672. """
  673. return linkage(y, method='ward', metric='euclidean')
def linkage(y, method='single', metric='euclidean', optimal_ordering=False):
    """
    Perform hierarchical/agglomerative clustering.

    The input y may be either a 1d condensed distance matrix
    or a 2d array of observation vectors.

    If y is a 1d condensed distance matrix,
    then y must be a :math:`\\binom{n}{2}` sized
    vector where n is the number of original observations paired
    in the distance matrix. The behavior of this function is very
    similar to the MATLAB linkage function.

    A :math:`(n-1)` by 4 matrix ``Z`` is returned. At the
    :math:`i`-th iteration, clusters with indices ``Z[i, 0]`` and
    ``Z[i, 1]`` are combined to form cluster :math:`n + i`. A
    cluster with an index less than :math:`n` corresponds to one of
    the :math:`n` original observations. The distance between
    clusters ``Z[i, 0]`` and ``Z[i, 1]`` is given by ``Z[i, 2]``. The
    fourth value ``Z[i, 3]`` represents the number of original
    observations in the newly formed cluster.

    The following linkage methods are used to compute the distance
    :math:`d(s, t)` between two clusters :math:`s` and
    :math:`t`. The algorithm begins with a forest of clusters that
    have yet to be used in the hierarchy being formed. When two
    clusters :math:`s` and :math:`t` from this forest are combined
    into a single cluster :math:`u`, :math:`s` and :math:`t` are
    removed from the forest, and :math:`u` is added to the
    forest. When only one cluster remains in the forest, the algorithm
    stops, and this cluster becomes the root.

    A distance matrix is maintained at each iteration. The ``d[i,j]``
    entry corresponds to the distance between cluster :math:`i` and
    :math:`j` in the original forest.

    At each iteration, the algorithm must update the distance matrix
    to reflect the distance of the newly formed cluster u with the
    remaining clusters in the forest.

    Suppose there are :math:`|u|` original observations
    :math:`u[0], \\ldots, u[|u|-1]` in cluster :math:`u` and
    :math:`|v|` original objects :math:`v[0], \\ldots, v[|v|-1]` in
    cluster :math:`v`. Recall :math:`s` and :math:`t` are
    combined to form cluster :math:`u`. Let :math:`v` be any
    remaining cluster in the forest that is not :math:`u`.

    The following are methods for calculating the distance between the
    newly formed cluster :math:`u` and each :math:`v`.

      * method='single' assigns

        .. math::
           d(u,v) = \\min(dist(u[i],v[j]))

        for all points :math:`i` in cluster :math:`u` and
        :math:`j` in cluster :math:`v`. This is also known as the
        Nearest Point Algorithm.

      * method='complete' assigns

        .. math::
           d(u, v) = \\max(dist(u[i],v[j]))

        for all points :math:`i` in cluster u and :math:`j` in
        cluster :math:`v`. This is also known by the Farthest Point
        Algorithm or Voor Hees Algorithm.

      * method='average' assigns

        .. math::
           d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])}{(|u|*|v|)}

        for all points :math:`i` and :math:`j` where :math:`|u|`
        and :math:`|v|` are the cardinalities of clusters :math:`u`
        and :math:`v`, respectively. This is also called the UPGMA
        algorithm.

      * method='weighted' assigns

        .. math::
           d(u,v) = (dist(s,v) + dist(t,v))/2

        where cluster u was formed with cluster s and t and v
        is a remaining cluster in the forest. (also called WPGMA)

      * method='centroid' assigns

        .. math::
           dist(s,t) = ||c_s-c_t||_2

        where :math:`c_s` and :math:`c_t` are the centroids of
        clusters :math:`s` and :math:`t`, respectively. When two
        clusters :math:`s` and :math:`t` are combined into a new
        cluster :math:`u`, the new centroid is computed over all the
        original objects in clusters :math:`s` and :math:`t`. The
        distance then becomes the Euclidean distance between the
        centroid of :math:`u` and the centroid of a remaining cluster
        :math:`v` in the forest. This is also known as the UPGMC
        algorithm.

      * method='median' assigns :math:`d(s,t)` like the ``centroid``
        method. When two clusters :math:`s` and :math:`t` are combined
        into a new cluster :math:`u`, the average of centroids s and t
        give the new centroid :math:`u`. This is also known as the
        WPGMC algorithm.

      * method='ward' uses the Ward variance minimization algorithm.
        The new entry :math:`d(u,v)` is computed as follows,

        .. math::
           d(u,v) = \\sqrt{\\frac{|v|+|s|}{T}d(v,s)^2
                         + \\frac{|v|+|t|}{T}d(v,t)^2
                         - \\frac{|v|}{T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.

    Warning: When the minimum distance pair in the forest is chosen, there
    may be two or more pairs with the same minimum distance. This
    implementation may choose a different minimum than the MATLAB
    version.

    Parameters
    ----------
    y : ndarray
        A condensed distance matrix. A condensed distance matrix
        is a flat array containing the upper triangular of the distance matrix.
        This is the form that ``pdist`` returns. Alternatively, a collection of
        :math:`m` observation vectors in :math:`n` dimensions may be passed as
        an :math:`m` by :math:`n` array. All elements of the condensed distance
        matrix must be finite, i.e. no NaNs or infs.
    method : str, optional
        The linkage algorithm to use. See the ``Linkage Methods`` section below
        for full descriptions.
    metric : str or function, optional
        The distance metric to use in the case that y is a collection of
        observation vectors; ignored otherwise. See the ``pdist``
        function for a list of valid distance metrics. A custom distance
        function can also be used.
    optimal_ordering : bool, optional
        If True, the linkage matrix will be reordered so that the distance
        between successive leaves is minimal. This results in a more intuitive
        tree structure when the data are visualized. defaults to False, because
        this algorithm can be slow, particularly on large datasets [2]_. See
        also the `optimal_leaf_ordering` function.

        .. versionadded:: 1.0.0

    Returns
    -------
    Z : ndarray
        The hierarchical clustering encoded as a linkage matrix.

    Notes
    -----
    1. For method 'single' an optimized algorithm based on minimum spanning
       tree is implemented. It has time complexity :math:`O(n^2)`.
       For methods 'complete', 'average', 'weighted' and 'ward' an algorithm
       called nearest-neighbors chain is implemented. It also has time
       complexity :math:`O(n^2)`.
       For other methods a naive algorithm is implemented with :math:`O(n^3)`
       time complexity.
       All algorithms use :math:`O(n^2)` memory.
       Refer to [1]_ for details about the algorithms.
    2. Methods 'centroid', 'median' and 'ward' are correctly defined only if
       Euclidean pairwise metric is used. If `y` is passed as precomputed
       pairwise distances, then it is a user responsibility to assure that
       these distances are in fact Euclidean, otherwise the produced result
       will be incorrect.

    See Also
    --------
    scipy.spatial.distance.pdist : pairwise distance metrics

    References
    ----------
    .. [1] Daniel Mullner, "Modern hierarchical, agglomerative clustering
           algorithms", :arXiv:`1109.2378v1`.
    .. [2] Ziv Bar-Joseph, David K. Gifford, Tommi S. Jaakkola, "Fast optimal
           leaf ordering for hierarchical clustering", 2001. Bioinformatics
           :doi:`10.1093/bioinformatics/17.suppl_1.S22`

    Examples
    --------
    >>> from scipy.cluster.hierarchy import dendrogram, linkage
    >>> from matplotlib import pyplot as plt
    >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]

    >>> Z = linkage(X, 'ward')
    >>> fig = plt.figure(figsize=(25, 10))
    >>> dn = dendrogram(Z)

    >>> Z = linkage(X, 'single')
    >>> fig = plt.figure(figsize=(25, 10))
    >>> dn = dendrogram(Z)
    >>> plt.show()
    """
    # Reject unknown methods before touching the data.
    if method not in _LINKAGE_METHODS:
        raise ValueError("Invalid method: {0}".format(method))

    y = _convert_to_double(np.asarray(y, order='c'))

    if y.ndim == 1:
        # 1-D input: a condensed distance matrix. Validate its size and
        # make sure we own the buffer before handing it to the C layer.
        distance.is_valid_y(y, throw=True, name='y')
        [y] = _copy_arrays_if_base_present([y])
    elif y.ndim == 2:
        # 2-D input: an observation matrix. Geometric methods are only
        # well-defined for Euclidean distances (see Notes, item 2).
        if method in _EUCLIDEAN_METHODS and metric != 'euclidean':
            raise ValueError("Method '{0}' requires the distance metric "
                             "to be Euclidean".format(method))
        # A square, hollow, symmetric, non-negative matrix is probably an
        # uncondensed distance matrix passed by mistake; warn but proceed,
        # treating the rows as observation vectors.
        if y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0):
            if np.all(y >= 0) and np.allclose(y, y.T):
                _warning('The symmetric non-negative hollow observation '
                         'matrix looks suspiciously like an uncondensed '
                         'distance matrix')
        y = distance.pdist(y, metric)
    else:
        raise ValueError("`y` must be 1 or 2 dimensional.")

    # The C routines assume finite distances; fail fast otherwise.
    if not np.all(np.isfinite(y)):
        raise ValueError("The condensed distance matrix must contain only "
                         "finite values.")

    n = int(distance.num_obs_y(y))
    method_code = _LINKAGE_METHODS[method]

    # Dispatch to the specialized C implementation (see Notes, item 1):
    # MST-based single linkage, nearest-neighbor chain for the reducible
    # methods, and the generic algorithm for everything else.
    if method == 'single':
        result = _hierarchy.mst_single_linkage(y, n)
    elif method in ['complete', 'average', 'weighted', 'ward']:
        result = _hierarchy.nn_chain(y, n, method_code)
    else:
        result = _hierarchy.fast_linkage(y, n, method_code)

    if optimal_ordering:
        return optimal_leaf_ordering(result, y)
    else:
        return result
class ClusterNode:
    """
    A tree node class for representing a cluster.

    Leaf nodes correspond to original observations, while non-leaf nodes
    correspond to non-singleton clusters.

    The `to_tree` function converts a matrix returned by the linkage
    function into an easy-to-use tree representation.

    All parameter names are also attributes.

    Parameters
    ----------
    id : int
        The node id.
    left : ClusterNode instance, optional
        The left child tree node.
    right : ClusterNode instance, optional
        The right child tree node.
    dist : float, optional
        Distance for this cluster in the linkage matrix.
    count : int, optional
        The number of samples in this cluster.

    See Also
    --------
    to_tree : for converting a linkage matrix ``Z`` into a tree object.
    """

    def __init__(self, id, left=None, right=None, dist=0, count=1):
        if id < 0:
            raise ValueError('The id must be non-negative.')
        if dist < 0:
            raise ValueError('The distance must be non-negative.')
        # A node must have either no children (a leaf) or both children;
        # a single child would not form a proper binary tree.
        if (left is None and right is not None) or \
           (left is not None and right is None):
            raise ValueError('Only full or proper binary trees are permitted.'
                             ' This node has one child.')
        if count < 1:
            raise ValueError('A cluster must contain at least one original '
                             'observation.')
        self.id = id
        self.left = left
        self.right = right
        self.dist = dist
        if self.left is None:
            # Leaf node: use the caller-supplied count.
            self.count = count
        else:
            # Internal node: the count is derived from the children; the
            # ``count`` argument is ignored in this case.
            self.count = left.count + right.count

    def __lt__(self, node):
        # Nodes compare by merge distance; `_order_cluster_tree` relies on
        # this via ``bisect.insort_left`` to keep nodes sorted bottom-up.
        if not isinstance(node, ClusterNode):
            raise ValueError("Can't compare ClusterNode "
                             "to type {}".format(type(node)))
        return self.dist < node.dist

    def __gt__(self, node):
        # Comparison by distance, mirroring ``__lt__``.
        if not isinstance(node, ClusterNode):
            raise ValueError("Can't compare ClusterNode "
                             "to type {}".format(type(node)))
        return self.dist > node.dist

    def __eq__(self, node):
        # Equality also compares distances only, not identity or structure.
        # NOTE(review): defining __eq__ without __hash__ makes instances
        # unhashable in Python 3 — confirm no caller stores nodes in
        # sets or uses them as dict keys.
        if not isinstance(node, ClusterNode):
            raise ValueError("Can't compare ClusterNode "
                             "to type {}".format(type(node)))
        return self.dist == node.dist

    def get_id(self):
        """
        The identifier of the target node.

        For ``0 <= i < n``, `i` corresponds to original observation i.
        For ``n <= i < 2n-1``, `i` corresponds to non-singleton cluster formed
        at iteration ``i-n``.

        Returns
        -------
        id : int
            The identifier of the target node.
        """
        return self.id

    def get_count(self):
        """
        The number of leaf nodes (original observations) belonging to
        the cluster node nd. If the target node is a leaf, 1 is
        returned.

        Returns
        -------
        get_count : int
            The number of leaf nodes below the target node.
        """
        return self.count

    def get_left(self):
        """
        Return a reference to the left child tree object.

        Returns
        -------
        left : ClusterNode
            The left child of the target node. If the node is a leaf,
            None is returned.
        """
        return self.left

    def get_right(self):
        """
        Return a reference to the right child tree object.

        Returns
        -------
        right : ClusterNode
            The right child of the target node. If the node is a leaf,
            None is returned.
        """
        return self.right

    def is_leaf(self):
        """
        Return True if the target node is a leaf.

        Returns
        -------
        leafness : bool
            True if the target node is a leaf node.
        """
        # Leaves have no children; __init__ guarantees left/right are
        # either both None or both set, so checking one side suffices.
        return self.left is None

    def pre_order(self, func=(lambda x: x.id)):
        """
        Perform pre-order traversal without recursive function calls.

        When a leaf node is first encountered, ``func`` is called with
        the leaf node as its argument, and its result is appended to
        the list.

        For example, the statement::

            ids = root.pre_order(lambda x: x.id)

        returns a list of the node ids corresponding to the leaf nodes
        of the tree as they appear from left to right.

        Parameters
        ----------
        func : function
            Applied to each leaf ClusterNode object in the pre-order traversal.
            Given the ``i``-th leaf node in the pre-order traversal ``n[i]``,
            the result of ``func(n[i])`` is stored in ``L[i]``. If not
            provided, the index of the original observation to which the node
            corresponds is used.

        Returns
        -------
        L : list
            The pre-order traversal.
        """
        # Do a preorder traversal, caching the result. To avoid having to do
        # recursion, we'll store the previous index we've visited in a vector.
        n = self.count

        # Explicit stack of nodes; 2n slots are always enough for a binary
        # tree with n leaves.
        curNode = [None] * (2 * n)
        # Track, per node id, whether the left/right subtree was entered.
        lvisited = set()
        rvisited = set()
        curNode[0] = self
        k = 0
        preorder = []
        while k >= 0:
            nd = curNode[k]
            ndid = nd.id
            if nd.is_leaf():
                # Leaves are emitted and popped immediately.
                preorder.append(func(nd))
                k = k - 1
            else:
                if ndid not in lvisited:
                    # First time at this node: descend into the left child.
                    curNode[k + 1] = nd.left
                    lvisited.add(ndid)
                    k = k + 1
                elif ndid not in rvisited:
                    # Second time: descend into the right child.
                    curNode[k + 1] = nd.right
                    rvisited.add(ndid)
                    k = k + 1
                # If we've visited the left and right of this non-leaf
                # node already, go up in the tree.
                else:
                    k = k - 1

        return preorder
# Module-level sentinels built once at import time: a minimal ClusterNode
# instance and the type object of ClusterNode.
# NOTE(review): their consumers are not visible in this chunk — presumably
# used for type/feature checks elsewhere in the module; confirm before
# changing or removing.
_cnode_bare = ClusterNode(0)
_cnode_type = type(ClusterNode)
  1040. def _order_cluster_tree(Z):
  1041. """
  1042. Return clustering nodes in bottom-up order by distance.
  1043. Parameters
  1044. ----------
  1045. Z : scipy.cluster.linkage array
  1046. The linkage matrix.
  1047. Returns
  1048. -------
  1049. nodes : list
  1050. A list of ClusterNode objects.
  1051. """
  1052. q = deque()
  1053. tree = to_tree(Z)
  1054. q.append(tree)
  1055. nodes = []
  1056. while q:
  1057. node = q.popleft()
  1058. if not node.is_leaf():
  1059. bisect.insort_left(nodes, node)
  1060. q.append(node.get_right())
  1061. q.append(node.get_left())
  1062. return nodes
def cut_tree(Z, n_clusters=None, height=None):
    """
    Given a linkage matrix Z, return the cut tree.

    Parameters
    ----------
    Z : scipy.cluster.linkage array
        The linkage matrix.
    n_clusters : array_like, optional
        Number of clusters in the tree at the cut point.
    height : array_like, optional
        The height at which to cut the tree. Only possible for ultrametric
        trees.

    Returns
    -------
    cutree : array
        An array indicating group membership at each agglomeration step. I.e.,
        for a full cut tree, in the first column each data point is in its own
        cluster. At the next step, two nodes are merged. Finally all
        singleton and non-singleton clusters are in one group. If `n_clusters`
        or `height` is given, the columns correspond to the columns of
        `n_clusters` or `height`.

    Examples
    --------
    >>> from scipy import cluster
    >>> np.random.seed(23)
    >>> X = np.random.randn(50, 4)
    >>> Z = cluster.hierarchy.ward(X)
    >>> cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])
    >>> cutree[:10]
    array([[0, 0],
           [1, 1],
           [2, 2],
           [3, 3],
           [3, 4],
           [2, 2],
           [0, 0],
           [1, 5],
           [3, 6],
           [4, 7]])
    """
    nobs = num_obs_linkage(Z)
    # Internal merge nodes, sorted bottom-up by merge distance.
    nodes = _order_cluster_tree(Z)

    if height is not None and n_clusters is not None:
        raise ValueError("At least one of either height or n_clusters "
                         "must be None")
    elif height is None and n_clusters is None:  # return the full cut tree
        cols_idx = np.arange(nobs)
    elif height is not None:
        # Map each requested height to the index of the first merge whose
        # distance reaches it. Assumes the heights are monotonic
        # (ultrametric tree), as stated in the docstring.
        heights = np.array([x.dist for x in nodes])
        cols_idx = np.searchsorted(heights, height)
    else:
        # Convert "number of clusters" to "number of merges performed":
        # after k merges there are nobs - k clusters.
        cols_idx = nobs - np.searchsorted(np.arange(nobs), n_clusters)

    try:
        n_cols = len(cols_idx)
    except TypeError:  # scalar
        n_cols = 1
        cols_idx = np.array([cols_idx])

    groups = np.zeros((n_cols, nobs), dtype=int)
    # Before any merge, every observation is its own group (labels 0..nobs-1).
    last_group = np.arange(nobs)
    if 0 in cols_idx:
        groups[0] = last_group

    # Replay the merges bottom-up, relabeling after each one so that group
    # labels stay contiguous in [0, number_of_groups).
    for i, node in enumerate(nodes):
        idx = node.pre_order()
        this_group = last_group.copy()
        # All leaves under this merge take the smallest label among them...
        this_group[idx] = last_group[idx].min()
        # ...and every label above the absorbed (largest) one shifts down by
        # one to close the gap, keeping the label set dense.
        this_group[this_group > last_group[idx].max()] -= 1
        if i + 1 in cols_idx:
            groups[np.nonzero(i + 1 == cols_idx)[0]] = this_group
        last_group = this_group

    return groups.T
  1133. def to_tree(Z, rd=False):
  1134. """
  1135. Convert a linkage matrix into an easy-to-use tree object.
  1136. The reference to the root `ClusterNode` object is returned (by default).
  1137. Each `ClusterNode` object has a ``left``, ``right``, ``dist``, ``id``,
  1138. and ``count`` attribute. The left and right attributes point to
  1139. ClusterNode objects that were combined to generate the cluster.
  1140. If both are None then the `ClusterNode` object is a leaf node, its count
  1141. must be 1, and its distance is meaningless but set to 0.
  1142. *Note: This function is provided for the convenience of the library
  1143. user. ClusterNodes are not used as input to any of the functions in this
  1144. library.*
  1145. Parameters
  1146. ----------
  1147. Z : ndarray
  1148. The linkage matrix in proper form (see the `linkage`
  1149. function documentation).
  1150. rd : bool, optional
  1151. When False (default), a reference to the root `ClusterNode` object is
  1152. returned. Otherwise, a tuple ``(r, d)`` is returned. ``r`` is a
  1153. reference to the root node while ``d`` is a list of `ClusterNode`
  1154. objects - one per original entry in the linkage matrix plus entries
  1155. for all clustering steps. If a cluster id is
  1156. less than the number of samples ``n`` in the data that the linkage
  1157. matrix describes, then it corresponds to a singleton cluster (leaf
  1158. node).
  1159. See `linkage` for more information on the assignment of cluster ids
  1160. to clusters.
  1161. Returns
  1162. -------
  1163. tree : ClusterNode or tuple (ClusterNode, list of ClusterNode)
  1164. If ``rd`` is False, a `ClusterNode`.
  1165. If ``rd`` is True, a list of length ``2*n - 1``, with ``n`` the number
  1166. of samples. See the description of `rd` above for more details.
  1167. See Also
  1168. --------
  1169. linkage, is_valid_linkage, ClusterNode
  1170. Examples
  1171. --------
  1172. >>> from scipy.cluster import hierarchy
  1173. >>> x = np.random.rand(10).reshape(5, 2)
  1174. >>> Z = hierarchy.linkage(x)
  1175. >>> hierarchy.to_tree(Z)
  1176. <scipy.cluster.hierarchy.ClusterNode object at ...
  1177. >>> rootnode, nodelist = hierarchy.to_tree(Z, rd=True)
  1178. >>> rootnode
  1179. <scipy.cluster.hierarchy.ClusterNode object at ...
  1180. >>> len(nodelist)
  1181. 9
  1182. """
  1183. Z = np.asarray(Z, order='c')
  1184. is_valid_linkage(Z, throw=True, name='Z')
  1185. # Number of original objects is equal to the number of rows minus 1.
  1186. n = Z.shape[0] + 1
  1187. # Create a list full of None's to store the node objects
  1188. d = [None] * (n * 2 - 1)
  1189. # Create the nodes corresponding to the n original objects.
  1190. for i in xrange(0, n):
  1191. d[i] = ClusterNode(i)
  1192. nd = None
  1193. for i in xrange(0, n - 1):
  1194. fi = int(Z[i, 0])
  1195. fj = int(Z[i, 1])
  1196. if fi > i + n:
  1197. raise ValueError(('Corrupt matrix Z. Index to derivative cluster '
  1198. 'is used before it is formed. See row %d, '
  1199. 'column 0') % fi)
  1200. if fj > i + n:
  1201. raise ValueError(('Corrupt matrix Z. Index to derivative cluster '
  1202. 'is used before it is formed. See row %d, '
  1203. 'column 1') % fj)
  1204. nd = ClusterNode(i + n, d[fi], d[fj], Z[i, 2])
  1205. # ^ id ^ left ^ right ^ dist
  1206. if Z[i, 3] != nd.count:
  1207. raise ValueError(('Corrupt matrix Z. The count Z[%d,3] is '
  1208. 'incorrect.') % i)
  1209. d[n + i] = nd
  1210. if rd:
  1211. return (nd, d)
  1212. else:
  1213. return nd
def optimal_leaf_ordering(Z, y, metric='euclidean'):
    """
    Given a linkage matrix Z and distance, reorder the cut tree.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as a linkage matrix. See
        `linkage` for more information on the return structure and
        algorithm.
    y : ndarray
        The condensed distance matrix from which Z was generated.
        Alternatively, a collection of m observation vectors in n
        dimensions may be passed as a m by n array.
    metric : str or function, optional
        The distance metric to use in the case that y is a collection of
        observation vectors; ignored otherwise. See the ``pdist``
        function for a list of valid distance metrics. A custom distance
        function can also be used.

    Returns
    -------
    Z_ordered : ndarray
        A copy of the linkage matrix Z, reordered to minimize the distance
        between adjacent leaves.

    Examples
    --------
    >>> from scipy.cluster import hierarchy
    >>> np.random.seed(23)
    >>> X = np.random.randn(10,10)
    >>> Z = hierarchy.ward(X)
    >>> hierarchy.leaves_list(Z)
    array([0, 5, 3, 9, 6, 8, 1, 4, 2, 7], dtype=int32)
    >>> hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X))
    array([3, 9, 0, 5, 8, 2, 7, 4, 1, 6], dtype=int32)
    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')

    # The C implementation requires double-precision, contiguous input.
    y = _convert_to_double(np.asarray(y, order='c'))

    if y.ndim == 1:
        # `y` is already a condensed distance matrix: validate it and make
        # sure we own the buffer before handing it to the C code.
        distance.is_valid_y(y, throw=True, name='y')
        [y] = _copy_arrays_if_base_present([y])
    elif y.ndim == 2:
        # 2-D input is interpreted as raw observation vectors and condensed
        # with pdist.  Warn first when the input looks suspiciously like a
        # square (uncondensed) distance matrix instead of observations.
        if y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0):
            if np.all(y >= 0) and np.allclose(y, y.T):
                _warning('The symmetric non-negative hollow observation '
                         'matrix looks suspiciously like an uncondensed '
                         'distance matrix')
        y = distance.pdist(y, metric)
    else:
        raise ValueError("`y` must be 1 or 2 dimensional.")

    # The optimal-ordering algorithm cannot cope with NaN/inf distances.
    if not np.all(np.isfinite(y)):
        raise ValueError("The condensed distance matrix must contain only "
                         "finite values.")

    return _optimal_leaf_ordering.optimal_leaf_ordering(Z, y)
  1267. def _convert_to_bool(X):
  1268. if X.dtype != bool:
  1269. X = X.astype(bool)
  1270. if not X.flags.contiguous:
  1271. X = X.copy()
  1272. return X
  1273. def _convert_to_double(X):
  1274. if X.dtype != np.double:
  1275. X = X.astype(np.double)
  1276. if not X.flags.contiguous:
  1277. X = X.copy()
  1278. return X
  1279. def cophenet(Z, Y=None):
  1280. """
  1281. Calculate the cophenetic distances between each observation in
  1282. the hierarchical clustering defined by the linkage ``Z``.
  1283. Suppose ``p`` and ``q`` are original observations in
  1284. disjoint clusters ``s`` and ``t``, respectively and
  1285. ``s`` and ``t`` are joined by a direct parent cluster
  1286. ``u``. The cophenetic distance between observations
  1287. ``i`` and ``j`` is simply the distance between
  1288. clusters ``s`` and ``t``.
  1289. Parameters
  1290. ----------
  1291. Z : ndarray
  1292. The hierarchical clustering encoded as an array
  1293. (see `linkage` function).
  1294. Y : ndarray (optional)
  1295. Calculates the cophenetic correlation coefficient ``c`` of a
  1296. hierarchical clustering defined by the linkage matrix `Z`
  1297. of a set of :math:`n` observations in :math:`m`
  1298. dimensions. `Y` is the condensed distance matrix from which
  1299. `Z` was generated.
  1300. Returns
  1301. -------
  1302. c : ndarray
  1303. The cophentic correlation distance (if ``Y`` is passed).
  1304. d : ndarray
  1305. The cophenetic distance matrix in condensed form. The
  1306. :math:`ij` th entry is the cophenetic distance between
  1307. original observations :math:`i` and :math:`j`.
  1308. See Also
  1309. --------
  1310. linkage: for a description of what a linkage matrix is.
  1311. scipy.spatial.distance.squareform: transforming condensed matrices into square ones.
  1312. Examples
  1313. --------
  1314. >>> from scipy.cluster.hierarchy import single, cophenet
  1315. >>> from scipy.spatial.distance import pdist, squareform
  1316. Given a dataset ``X`` and a linkage matrix ``Z``, the cophenetic distance
  1317. between two points of ``X`` is the distance between the largest two
  1318. distinct clusters that each of the points:
  1319. >>> X = [[0, 0], [0, 1], [1, 0],
  1320. ... [0, 4], [0, 3], [1, 4],
  1321. ... [4, 0], [3, 0], [4, 1],
  1322. ... [4, 4], [3, 4], [4, 3]]
  1323. ``X`` corresponds to this dataset ::
  1324. x x x x
  1325. x x
  1326. x x
  1327. x x x x
  1328. >>> Z = single(pdist(X))
  1329. >>> Z
  1330. array([[ 0., 1., 1., 2.],
  1331. [ 2., 12., 1., 3.],
  1332. [ 3., 4., 1., 2.],
  1333. [ 5., 14., 1., 3.],
  1334. [ 6., 7., 1., 2.],
  1335. [ 8., 16., 1., 3.],
  1336. [ 9., 10., 1., 2.],
  1337. [11., 18., 1., 3.],
  1338. [13., 15., 2., 6.],
  1339. [17., 20., 2., 9.],
  1340. [19., 21., 2., 12.]])
  1341. >>> cophenet(Z)
  1342. array([1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2.,
  1343. 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2.,
  1344. 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
  1345. 1., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.])
  1346. The output of the `scipy.cluster.hierarchy.cophenet` method is
  1347. represented in condensed form. We can use
  1348. `scipy.spatial.distance.squareform` to see the output as a
  1349. regular matrix (where each element ``ij`` denotes the cophenetic distance
  1350. between each ``i``, ``j`` pair of points in ``X``):
  1351. >>> squareform(cophenet(Z))
  1352. array([[0., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
  1353. [1., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
  1354. [1., 1., 0., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
  1355. [2., 2., 2., 0., 1., 1., 2., 2., 2., 2., 2., 2.],
  1356. [2., 2., 2., 1., 0., 1., 2., 2., 2., 2., 2., 2.],
  1357. [2., 2., 2., 1., 1., 0., 2., 2., 2., 2., 2., 2.],
  1358. [2., 2., 2., 2., 2., 2., 0., 1., 1., 2., 2., 2.],
  1359. [2., 2., 2., 2., 2., 2., 1., 0., 1., 2., 2., 2.],
  1360. [2., 2., 2., 2., 2., 2., 1., 1., 0., 2., 2., 2.],
  1361. [2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 1., 1.],
  1362. [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 0., 1.],
  1363. [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 0.]])
  1364. In this example, the cophenetic distance between points on ``X`` that are
  1365. very close (i.e. in the same corner) is 1. For other pairs of points is 2,
  1366. because the points will be located in clusters at different
  1367. corners - thus the distance between these clusters will be larger.
  1368. """
  1369. Z = np.asarray(Z, order='c')
  1370. is_valid_linkage(Z, throw=True, name='Z')
  1371. Zs = Z.shape
  1372. n = Zs[0] + 1
  1373. zz = np.zeros((n * (n-1)) // 2, dtype=np.double)
  1374. # Since the C code does not support striding using strides.
  1375. # The dimensions are used instead.
  1376. Z = _convert_to_double(Z)
  1377. _hierarchy.cophenetic_distances(Z, zz, int(n))
  1378. if Y is None:
  1379. return zz
  1380. Y = np.asarray(Y, order='c')
  1381. distance.is_valid_y(Y, throw=True, name='Y')
  1382. z = zz.mean()
  1383. y = Y.mean()
  1384. Yy = Y - y
  1385. Zz = zz - z
  1386. numerator = (Yy * Zz)
  1387. denomA = Yy**2
  1388. denomB = Zz**2
  1389. c = numerator.sum() / np.sqrt((denomA.sum() * denomB.sum()))
  1390. return (c, zz)
  1391. def inconsistent(Z, d=2):
  1392. r"""
  1393. Calculate inconsistency statistics on a linkage matrix.
  1394. Parameters
  1395. ----------
  1396. Z : ndarray
  1397. The :math:`(n-1)` by 4 matrix encoding the linkage (hierarchical
  1398. clustering). See `linkage` documentation for more information on its
  1399. form.
  1400. d : int, optional
  1401. The number of links up to `d` levels below each non-singleton cluster.
  1402. Returns
  1403. -------
  1404. R : ndarray
  1405. A :math:`(n-1)` by 4 matrix where the ``i``'th row contains the link
  1406. statistics for the non-singleton cluster ``i``. The link statistics are
  1407. computed over the link heights for links :math:`d` levels below the
  1408. cluster ``i``. ``R[i,0]`` and ``R[i,1]`` are the mean and standard
  1409. deviation of the link heights, respectively; ``R[i,2]`` is the number
  1410. of links included in the calculation; and ``R[i,3]`` is the
  1411. inconsistency coefficient,
  1412. .. math:: \frac{\mathtt{Z[i,2]} - \mathtt{R[i,0]}} {R[i,1]}
  1413. Notes
  1414. -----
  1415. This function behaves similarly to the MATLAB(TM) ``inconsistent``
  1416. function.
  1417. Examples
  1418. --------
  1419. >>> from scipy.cluster.hierarchy import inconsistent, linkage
  1420. >>> from matplotlib import pyplot as plt
  1421. >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
  1422. >>> Z = linkage(X, 'ward')
  1423. >>> print(Z)
  1424. [[ 5. 6. 0. 2. ]
  1425. [ 2. 7. 0. 2. ]
  1426. [ 0. 4. 1. 2. ]
  1427. [ 1. 8. 1.15470054 3. ]
  1428. [ 9. 10. 2.12132034 4. ]
  1429. [ 3. 12. 4.11096096 5. ]
  1430. [11. 13. 14.07183949 8. ]]
  1431. >>> inconsistent(Z)
  1432. array([[ 0. , 0. , 1. , 0. ],
  1433. [ 0. , 0. , 1. , 0. ],
  1434. [ 1. , 0. , 1. , 0. ],
  1435. [ 0.57735027, 0.81649658, 2. , 0.70710678],
  1436. [ 1.04044011, 1.06123822, 3. , 1.01850858],
  1437. [ 3.11614065, 1.40688837, 2. , 0.70710678],
  1438. [ 6.44583366, 6.76770586, 3. , 1.12682288]])
  1439. """
  1440. Z = np.asarray(Z, order='c')
  1441. Zs = Z.shape
  1442. is_valid_linkage(Z, throw=True, name='Z')
  1443. if (not d == np.floor(d)) or d < 0:
  1444. raise ValueError('The second argument d must be a nonnegative '
  1445. 'integer value.')
  1446. # Since the C code does not support striding using strides.
  1447. # The dimensions are used instead.
  1448. [Z] = _copy_arrays_if_base_present([Z])
  1449. n = Zs[0] + 1
  1450. R = np.zeros((n - 1, 4), dtype=np.double)
  1451. _hierarchy.inconsistent(Z, R, int(n), int(d))
  1452. return R
def from_mlab_linkage(Z):
    """
    Convert a linkage matrix generated by MATLAB(TM) to a new
    linkage matrix compatible with this module.

    The conversion does two things:

     * the indices are converted from ``1..N`` to ``0..(N-1)`` form,
       and

     * a fourth column ``Z[:,3]`` is added where ``Z[i,3]`` represents the
       number of original observations (leaves) in the non-singleton
       cluster ``i``.

    This function is useful when loading in linkages from legacy data
    files generated by MATLAB.

    Parameters
    ----------
    Z : ndarray
        A linkage matrix generated by MATLAB(TM).

    Returns
    -------
    ZS : ndarray
        A linkage matrix compatible with ``scipy.cluster.hierarchy``.

    See Also
    --------
    linkage: for a description of what a linkage matrix is.
    to_mlab_linkage: transform from Scipy to MATLAB format.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.hierarchy import ward, from_mlab_linkage
    >>> mZ = np.array([[1, 2, 1], [4, 5, 1], [7, 8, 1],
    ...                [10, 11, 1], [3, 13, 1.29099445],
    ...                [6, 14, 1.29099445],
    ...                [9, 15, 1.29099445],
    ...                [12, 16, 1.29099445],
    ...                [17, 18, 5.77350269],
    ...                [19, 20, 5.77350269],
    ...                [21, 22, 8.16496581]])
    >>> Z = from_mlab_linkage(mZ)

    The linkage matrix ``Z`` returned includes an additional column
    counting the number of original samples in each cluster, and all
    cluster indices are reduced by 1 (MATLAB format uses 1-indexing,
    whereas Scipy uses 0-indexing).
    """
    Z = np.asarray(Z, dtype=np.double, order='c')
    Zs = Z.shape

    # If it's empty (0-d, or 1-d with no entries), return a copy unchanged.
    if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0):
        return Z.copy()

    if len(Zs) != 2:
        raise ValueError("The linkage array must be rectangular.")

    # If it contains no rows, return it.
    if Zs[0] == 0:
        return Z.copy()

    Zpart = Z.copy()
    # NOTE(review): with `and`, this only rejects input when BOTH the min
    # and the max are out of the 1..N range; `or` may have been intended —
    # confirm against upstream behavior before changing.
    if Zpart[:, 0:2].min() != 1.0 and Zpart[:, 0:2].max() != 2 * Zs[0]:
        raise ValueError('The format of the indices is not 1..N')

    # Shift the 1-based MATLAB cluster indices to 0-based.
    Zpart[:, 0:2] -= 1.0

    # Compute the per-cluster observation counts (the fourth column expected
    # by this module's linkage format) via the C helper, then append them.
    CS = np.zeros((Zs[0],), dtype=np.double)
    _hierarchy.calculate_cluster_sizes(Zpart, CS, int(Zs[0]) + 1)
    return np.hstack([Zpart, CS.reshape(Zs[0], 1)])
  1527. def to_mlab_linkage(Z):
  1528. """
  1529. Convert a linkage matrix to a MATLAB(TM) compatible one.
  1530. Converts a linkage matrix ``Z`` generated by the linkage function
  1531. of this module to a MATLAB(TM) compatible one. The return linkage
  1532. matrix has the last column removed and the cluster indices are
  1533. converted to ``1..N`` indexing.
  1534. Parameters
  1535. ----------
  1536. Z : ndarray
  1537. A linkage matrix generated by ``scipy.cluster.hierarchy``.
  1538. Returns
  1539. -------
  1540. to_mlab_linkage : ndarray
  1541. A linkage matrix compatible with MATLAB(TM)'s hierarchical
  1542. clustering functions.
  1543. The return linkage matrix has the last column removed
  1544. and the cluster indices are converted to ``1..N`` indexing.
  1545. See Also
  1546. --------
  1547. linkage: for a description of what a linkage matrix is.
  1548. from_mlab_linkage: transform from Matlab to Scipy format.
  1549. Examples
  1550. --------
  1551. >>> from scipy.cluster.hierarchy import ward, to_mlab_linkage
  1552. >>> from scipy.spatial.distance import pdist
  1553. >>> X = [[0, 0], [0, 1], [1, 0],
  1554. ... [0, 4], [0, 3], [1, 4],
  1555. ... [4, 0], [3, 0], [4, 1],
  1556. ... [4, 4], [3, 4], [4, 3]]
  1557. >>> Z = ward(pdist(X))
  1558. >>> Z
  1559. array([[ 0. , 1. , 1. , 2. ],
  1560. [ 3. , 4. , 1. , 2. ],
  1561. [ 6. , 7. , 1. , 2. ],
  1562. [ 9. , 10. , 1. , 2. ],
  1563. [ 2. , 12. , 1.29099445, 3. ],
  1564. [ 5. , 13. , 1.29099445, 3. ],
  1565. [ 8. , 14. , 1.29099445, 3. ],
  1566. [11. , 15. , 1.29099445, 3. ],
  1567. [16. , 17. , 5.77350269, 6. ],
  1568. [18. , 19. , 5.77350269, 6. ],
  1569. [20. , 21. , 8.16496581, 12. ]])
  1570. After a linkage matrix ``Z`` has been created, we can use
  1571. `scipy.cluster.hierarchy.to_mlab_linkage` to convert it
  1572. into MATLAB format:
  1573. >>> mZ = to_mlab_linkage(Z)
  1574. >>> mZ
  1575. array([[ 1. , 2. , 1. ],
  1576. [ 4. , 5. , 1. ],
  1577. [ 7. , 8. , 1. ],
  1578. [ 10. , 11. , 1. ],
  1579. [ 3. , 13. , 1.29099445],
  1580. [ 6. , 14. , 1.29099445],
  1581. [ 9. , 15. , 1.29099445],
  1582. [ 12. , 16. , 1.29099445],
  1583. [ 17. , 18. , 5.77350269],
  1584. [ 19. , 20. , 5.77350269],
  1585. [ 21. , 22. , 8.16496581]])
  1586. The new linkage matrix ``mZ`` uses 1-indexing for all the
  1587. clusters (instead of 0-indexing). Also, the last column of
  1588. the original linkage matrix has been dropped.
  1589. """
  1590. Z = np.asarray(Z, order='c', dtype=np.double)
  1591. Zs = Z.shape
  1592. if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0):
  1593. return Z.copy()
  1594. is_valid_linkage(Z, throw=True, name='Z')
  1595. ZP = Z[:, 0:3].copy()
  1596. ZP[:, 0:2] += 1.0
  1597. return ZP
  1598. def is_monotonic(Z):
  1599. """
  1600. Return True if the linkage passed is monotonic.
  1601. The linkage is monotonic if for every cluster :math:`s` and :math:`t`
  1602. joined, the distance between them is no less than the distance
  1603. between any previously joined clusters.
  1604. Parameters
  1605. ----------
  1606. Z : ndarray
  1607. The linkage matrix to check for monotonicity.
  1608. Returns
  1609. -------
  1610. b : bool
  1611. A boolean indicating whether the linkage is monotonic.
  1612. See Also
  1613. --------
  1614. linkage: for a description of what a linkage matrix is.
  1615. Examples
  1616. --------
  1617. >>> from scipy.cluster.hierarchy import median, ward, is_monotonic
  1618. >>> from scipy.spatial.distance import pdist
  1619. By definition, some hierarchical clustering algorithms - such as
  1620. `scipy.cluster.hierarchy.ward` - produce monotonic assignments of
  1621. samples to clusters; however, this is not always true for other
  1622. hierarchical methods - e.g. `scipy.cluster.hierarchy.median`.
  1623. Given a linkage matrix ``Z`` (as the result of a hierarchical clustering
  1624. method) we can test programmatically whether if is has the monotonicity
  1625. property or not, using `scipy.cluster.hierarchy.is_monotonic`:
  1626. >>> X = [[0, 0], [0, 1], [1, 0],
  1627. ... [0, 4], [0, 3], [1, 4],
  1628. ... [4, 0], [3, 0], [4, 1],
  1629. ... [4, 4], [3, 4], [4, 3]]
  1630. >>> Z = ward(pdist(X))
  1631. >>> Z
  1632. array([[ 0. , 1. , 1. , 2. ],
  1633. [ 3. , 4. , 1. , 2. ],
  1634. [ 6. , 7. , 1. , 2. ],
  1635. [ 9. , 10. , 1. , 2. ],
  1636. [ 2. , 12. , 1.29099445, 3. ],
  1637. [ 5. , 13. , 1.29099445, 3. ],
  1638. [ 8. , 14. , 1.29099445, 3. ],
  1639. [11. , 15. , 1.29099445, 3. ],
  1640. [16. , 17. , 5.77350269, 6. ],
  1641. [18. , 19. , 5.77350269, 6. ],
  1642. [20. , 21. , 8.16496581, 12. ]])
  1643. >>> is_monotonic(Z)
  1644. True
  1645. >>> Z = median(pdist(X))
  1646. >>> Z
  1647. array([[ 0. , 1. , 1. , 2. ],
  1648. [ 3. , 4. , 1. , 2. ],
  1649. [ 9. , 10. , 1. , 2. ],
  1650. [ 6. , 7. , 1. , 2. ],
  1651. [ 2. , 12. , 1.11803399, 3. ],
  1652. [ 5. , 13. , 1.11803399, 3. ],
  1653. [ 8. , 15. , 1.11803399, 3. ],
  1654. [11. , 14. , 1.11803399, 3. ],
  1655. [18. , 19. , 3. , 6. ],
  1656. [16. , 17. , 3.5 , 6. ],
  1657. [20. , 21. , 3.25 , 12. ]])
  1658. >>> is_monotonic(Z)
  1659. False
  1660. Note that this method is equivalent to just verifying that the distances
  1661. in the third column of the linkage matrix appear in a monotonically
  1662. increasing order.
  1663. """
  1664. Z = np.asarray(Z, order='c')
  1665. is_valid_linkage(Z, throw=True, name='Z')
  1666. # We expect the i'th value to be greater than its successor.
  1667. return (Z[1:, 2] >= Z[:-1, 2]).all()
  1668. def is_valid_im(R, warning=False, throw=False, name=None):
  1669. """Return True if the inconsistency matrix passed is valid.
  1670. It must be a :math:`n` by 4 array of doubles. The standard
  1671. deviations ``R[:,1]`` must be nonnegative. The link counts
  1672. ``R[:,2]`` must be positive and no greater than :math:`n-1`.
  1673. Parameters
  1674. ----------
  1675. R : ndarray
  1676. The inconsistency matrix to check for validity.
  1677. warning : bool, optional
  1678. When True, issues a Python warning if the linkage
  1679. matrix passed is invalid.
  1680. throw : bool, optional
  1681. When True, throws a Python exception if the linkage
  1682. matrix passed is invalid.
  1683. name : str, optional
  1684. This string refers to the variable name of the invalid
  1685. linkage matrix.
  1686. Returns
  1687. -------
  1688. b : bool
  1689. True if the inconsistency matrix is valid.
  1690. See Also
  1691. --------
  1692. linkage: for a description of what a linkage matrix is.
  1693. inconsistent: for the creation of a inconsistency matrix.
  1694. Examples
  1695. --------
  1696. >>> from scipy.cluster.hierarchy import ward, inconsistent, is_valid_im
  1697. >>> from scipy.spatial.distance import pdist
  1698. Given a data set ``X``, we can apply a clustering method to obtain a
  1699. linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
  1700. be also used to obtain the inconsistency matrix ``R`` associated to
  1701. this clustering process:
  1702. >>> X = [[0, 0], [0, 1], [1, 0],
  1703. ... [0, 4], [0, 3], [1, 4],
  1704. ... [4, 0], [3, 0], [4, 1],
  1705. ... [4, 4], [3, 4], [4, 3]]
  1706. >>> Z = ward(pdist(X))
  1707. >>> R = inconsistent(Z)
  1708. >>> Z
  1709. array([[ 0. , 1. , 1. , 2. ],
  1710. [ 3. , 4. , 1. , 2. ],
  1711. [ 6. , 7. , 1. , 2. ],
  1712. [ 9. , 10. , 1. , 2. ],
  1713. [ 2. , 12. , 1.29099445, 3. ],
  1714. [ 5. , 13. , 1.29099445, 3. ],
  1715. [ 8. , 14. , 1.29099445, 3. ],
  1716. [11. , 15. , 1.29099445, 3. ],
  1717. [16. , 17. , 5.77350269, 6. ],
  1718. [18. , 19. , 5.77350269, 6. ],
  1719. [20. , 21. , 8.16496581, 12. ]])
  1720. >>> R
  1721. array([[1. , 0. , 1. , 0. ],
  1722. [1. , 0. , 1. , 0. ],
  1723. [1. , 0. , 1. , 0. ],
  1724. [1. , 0. , 1. , 0. ],
  1725. [1.14549722, 0.20576415, 2. , 0.70710678],
  1726. [1.14549722, 0.20576415, 2. , 0.70710678],
  1727. [1.14549722, 0.20576415, 2. , 0.70710678],
  1728. [1.14549722, 0.20576415, 2. , 0.70710678],
  1729. [2.78516386, 2.58797734, 3. , 1.15470054],
  1730. [2.78516386, 2.58797734, 3. , 1.15470054],
  1731. [6.57065706, 1.38071187, 3. , 1.15470054]])
  1732. Now we can use `scipy.cluster.hierarchy.is_valid_im` to verify that
  1733. ``R`` is correct:
  1734. >>> is_valid_im(R)
  1735. True
  1736. However, if ``R`` is wrongly constructed (e.g one of the standard
  1737. deviations is set to a negative value) then the check will fail:
  1738. >>> R[-1,1] = R[-1,1] * -1
  1739. >>> is_valid_im(R)
  1740. False
  1741. """
  1742. R = np.asarray(R, order='c')
  1743. valid = True
  1744. name_str = "%r " % name if name else ''
  1745. try:
  1746. if type(R) != np.ndarray:
  1747. raise TypeError('Variable %spassed as inconsistency matrix is not '
  1748. 'a numpy array.' % name_str)
  1749. if R.dtype != np.double:
  1750. raise TypeError('Inconsistency matrix %smust contain doubles '
  1751. '(double).' % name_str)
  1752. if len(R.shape) != 2:
  1753. raise ValueError('Inconsistency matrix %smust have shape=2 (i.e. '
  1754. 'be two-dimensional).' % name_str)
  1755. if R.shape[1] != 4:
  1756. raise ValueError('Inconsistency matrix %smust have 4 columns.' %
  1757. name_str)
  1758. if R.shape[0] < 1:
  1759. raise ValueError('Inconsistency matrix %smust have at least one '
  1760. 'row.' % name_str)
  1761. if (R[:, 0] < 0).any():
  1762. raise ValueError('Inconsistency matrix %scontains negative link '
  1763. 'height means.' % name_str)
  1764. if (R[:, 1] < 0).any():
  1765. raise ValueError('Inconsistency matrix %scontains negative link '
  1766. 'height standard deviations.' % name_str)
  1767. if (R[:, 2] < 0).any():
  1768. raise ValueError('Inconsistency matrix %scontains negative link '
  1769. 'counts.' % name_str)
  1770. except Exception as e:
  1771. if throw:
  1772. raise
  1773. if warning:
  1774. _warning(str(e))
  1775. valid = False
  1776. return valid
  1777. def is_valid_linkage(Z, warning=False, throw=False, name=None):
  1778. """
  1779. Check the validity of a linkage matrix.
  1780. A linkage matrix is valid if it is a two dimensional array (type double)
  1781. with :math:`n` rows and 4 columns. The first two columns must contain
  1782. indices between 0 and :math:`2n-1`. For a given row ``i``, the following
  1783. two expressions have to hold:
  1784. .. math::
  1785. 0 \\leq \\mathtt{Z[i,0]} \\leq i+n-1
  1786. 0 \\leq Z[i,1] \\leq i+n-1
  1787. I.e. a cluster cannot join another cluster unless the cluster being joined
  1788. has been generated.
  1789. Parameters
  1790. ----------
  1791. Z : array_like
  1792. Linkage matrix.
  1793. warning : bool, optional
  1794. When True, issues a Python warning if the linkage
  1795. matrix passed is invalid.
  1796. throw : bool, optional
  1797. When True, throws a Python exception if the linkage
  1798. matrix passed is invalid.
  1799. name : str, optional
  1800. This string refers to the variable name of the invalid
  1801. linkage matrix.
  1802. Returns
  1803. -------
  1804. b : bool
  1805. True if the inconsistency matrix is valid.
  1806. See Also
  1807. --------
  1808. linkage: for a description of what a linkage matrix is.
  1809. Examples
  1810. --------
  1811. >>> from scipy.cluster.hierarchy import ward, is_valid_linkage
  1812. >>> from scipy.spatial.distance import pdist
  1813. All linkage matrices generated by the clustering methods in this module
  1814. will be valid (i.e. they will have the appropriate dimensions and the two
  1815. required expressions will hold for all the rows).
  1816. We can check this using `scipy.cluster.hierarchy.is_valid_linkage`:
  1817. >>> X = [[0, 0], [0, 1], [1, 0],
  1818. ... [0, 4], [0, 3], [1, 4],
  1819. ... [4, 0], [3, 0], [4, 1],
  1820. ... [4, 4], [3, 4], [4, 3]]
  1821. >>> Z = ward(pdist(X))
  1822. >>> Z
  1823. array([[ 0. , 1. , 1. , 2. ],
  1824. [ 3. , 4. , 1. , 2. ],
  1825. [ 6. , 7. , 1. , 2. ],
  1826. [ 9. , 10. , 1. , 2. ],
  1827. [ 2. , 12. , 1.29099445, 3. ],
  1828. [ 5. , 13. , 1.29099445, 3. ],
  1829. [ 8. , 14. , 1.29099445, 3. ],
  1830. [11. , 15. , 1.29099445, 3. ],
  1831. [16. , 17. , 5.77350269, 6. ],
  1832. [18. , 19. , 5.77350269, 6. ],
  1833. [20. , 21. , 8.16496581, 12. ]])
  1834. >>> is_valid_linkage(Z)
  1835. True
  1836. However, is we create a linkage matrix in a wrong way - or if we modify
  1837. a valid one in a way that any of the required expressions don't hold
  1838. anymore, then the check will fail:
  1839. >>> Z[3][1] = 20 # the cluster number 20 is not defined at this point
  1840. >>> is_valid_linkage(Z)
  1841. False
  1842. """
  1843. Z = np.asarray(Z, order='c')
  1844. valid = True
  1845. name_str = "%r " % name if name else ''
  1846. try:
  1847. if type(Z) != np.ndarray:
  1848. raise TypeError('Passed linkage argument %sis not a valid array.' %
  1849. name_str)
  1850. if Z.dtype != np.double:
  1851. raise TypeError('Linkage matrix %smust contain doubles.' % name_str)
  1852. if len(Z.shape) != 2:
  1853. raise ValueError('Linkage matrix %smust have shape=2 (i.e. be '
  1854. 'two-dimensional).' % name_str)
  1855. if Z.shape[1] != 4:
  1856. raise ValueError('Linkage matrix %smust have 4 columns.' % name_str)
  1857. if Z.shape[0] == 0:
  1858. raise ValueError('Linkage must be computed on at least two '
  1859. 'observations.')
  1860. n = Z.shape[0]
  1861. if n > 1:
  1862. if ((Z[:, 0] < 0).any() or (Z[:, 1] < 0).any()):
  1863. raise ValueError('Linkage %scontains negative indices.' %
  1864. name_str)
  1865. if (Z[:, 2] < 0).any():
  1866. raise ValueError('Linkage %scontains negative distances.' %
  1867. name_str)
  1868. if (Z[:, 3] < 0).any():
  1869. raise ValueError('Linkage %scontains negative counts.' %
  1870. name_str)
  1871. if _check_hierarchy_uses_cluster_before_formed(Z):
  1872. raise ValueError('Linkage %suses non-singleton cluster before '
  1873. 'it is formed.' % name_str)
  1874. if _check_hierarchy_uses_cluster_more_than_once(Z):
  1875. raise ValueError('Linkage %suses the same cluster more than once.'
  1876. % name_str)
  1877. except Exception as e:
  1878. if throw:
  1879. raise
  1880. if warning:
  1881. _warning(str(e))
  1882. valid = False
  1883. return valid
  1884. def _check_hierarchy_uses_cluster_before_formed(Z):
  1885. n = Z.shape[0] + 1
  1886. for i in xrange(0, n - 1):
  1887. if Z[i, 0] >= n + i or Z[i, 1] >= n + i:
  1888. return True
  1889. return False
  1890. def _check_hierarchy_uses_cluster_more_than_once(Z):
  1891. n = Z.shape[0] + 1
  1892. chosen = set([])
  1893. for i in xrange(0, n - 1):
  1894. if (Z[i, 0] in chosen) or (Z[i, 1] in chosen) or Z[i, 0] == Z[i, 1]:
  1895. return True
  1896. chosen.add(Z[i, 0])
  1897. chosen.add(Z[i, 1])
  1898. return False
  1899. def _check_hierarchy_not_all_clusters_used(Z):
  1900. n = Z.shape[0] + 1
  1901. chosen = set([])
  1902. for i in xrange(0, n - 1):
  1903. chosen.add(int(Z[i, 0]))
  1904. chosen.add(int(Z[i, 1]))
  1905. must_chosen = set(range(0, 2 * n - 2))
  1906. return len(must_chosen.difference(chosen)) > 0
  1907. def num_obs_linkage(Z):
  1908. """
  1909. Return the number of original observations of the linkage matrix passed.
  1910. Parameters
  1911. ----------
  1912. Z : ndarray
  1913. The linkage matrix on which to perform the operation.
  1914. Returns
  1915. -------
  1916. n : int
  1917. The number of original observations in the linkage.
  1918. Examples
  1919. --------
  1920. >>> from scipy.cluster.hierarchy import ward, num_obs_linkage
  1921. >>> from scipy.spatial.distance import pdist
  1922. >>> X = [[0, 0], [0, 1], [1, 0],
  1923. ... [0, 4], [0, 3], [1, 4],
  1924. ... [4, 0], [3, 0], [4, 1],
  1925. ... [4, 4], [3, 4], [4, 3]]
  1926. >>> Z = ward(pdist(X))
  1927. ``Z`` is a linkage matrix obtained after using the Ward clustering method
  1928. with ``X``, a dataset with 12 data points.
  1929. >>> num_obs_linkage(Z)
  1930. 12
  1931. """
  1932. Z = np.asarray(Z, order='c')
  1933. is_valid_linkage(Z, throw=True, name='Z')
  1934. return (Z.shape[0] + 1)
  1935. def correspond(Z, Y):
  1936. """
  1937. Check for correspondence between linkage and condensed distance matrices.
  1938. They must have the same number of original observations for
  1939. the check to succeed.
  1940. This function is useful as a sanity check in algorithms that make
  1941. extensive use of linkage and distance matrices that must
  1942. correspond to the same set of original observations.
  1943. Parameters
  1944. ----------
  1945. Z : array_like
  1946. The linkage matrix to check for correspondence.
  1947. Y : array_like
  1948. The condensed distance matrix to check for correspondence.
  1949. Returns
  1950. -------
  1951. b : bool
  1952. A boolean indicating whether the linkage matrix and distance
  1953. matrix could possibly correspond to one another.
  1954. See Also
  1955. --------
  1956. linkage: for a description of what a linkage matrix is.
  1957. Examples
  1958. --------
  1959. >>> from scipy.cluster.hierarchy import ward, correspond
  1960. >>> from scipy.spatial.distance import pdist
  1961. This method can be used to check if a given linkage matrix ``Z`` has been
  1962. obtained from the application of a cluster method over a dataset ``X``:
  1963. >>> X = [[0, 0], [0, 1], [1, 0],
  1964. ... [0, 4], [0, 3], [1, 4],
  1965. ... [4, 0], [3, 0], [4, 1],
  1966. ... [4, 4], [3, 4], [4, 3]]
  1967. >>> X_condensed = pdist(X)
  1968. >>> Z = ward(X_condensed)
  1969. Here we can compare ``Z`` and ``X`` (in condensed form):
  1970. >>> correspond(Z, X_condensed)
  1971. True
  1972. """
  1973. is_valid_linkage(Z, throw=True)
  1974. distance.is_valid_y(Y, throw=True)
  1975. Z = np.asarray(Z, order='c')
  1976. Y = np.asarray(Y, order='c')
  1977. return distance.num_obs_y(Y) == num_obs_linkage(Z)
def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None):
    """
    Form flat clusters from the hierarchical clustering defined by
    the given linkage matrix.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded with the matrix returned
        by the `linkage` function.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit',
        this is the threshold to apply when forming flat clusters.
        For 'maxclust' or 'maxclust_monocrit' criteria,
        this would be max number of clusters requested.
    criterion : str, optional
        The criterion to use in forming flat clusters. This can
        be any of the following values:

          ``inconsistent`` :
              If a cluster node and all its
              descendants have an inconsistent value less than or equal
              to `t`, then all its leaf descendants belong to the
              same flat cluster. When no non-singleton cluster meets
              this criterion, every node is assigned to its own
              cluster. (Default)

          ``distance`` :
              Forms flat clusters so that the original
              observations in each flat cluster have no greater a
              cophenetic distance than `t`.

          ``maxclust`` :
              Finds a minimum threshold ``r`` so that
              the cophenetic distance between any two original
              observations in the same flat cluster is no more than
              ``r`` and no more than `t` flat clusters are formed.

          ``monocrit`` :
              Forms a flat cluster from a cluster node c
              with index i when ``monocrit[j] <= t``.

              For example, to threshold on the maximum mean distance
              as computed in the inconsistency matrix R with a
              threshold of 0.8 do::

                  MR = maxRstat(Z, R, 3)
                  cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)

          ``maxclust_monocrit`` :
              Forms a flat cluster from a
              non-singleton cluster node ``c`` when ``monocrit[i] <=
              r`` for all cluster indices ``i`` below and including
              ``c``. ``r`` is minimized such that no more than ``t``
              flat clusters are formed. monocrit must be
              monotonic. For example, to minimize the threshold t on
              maximum inconsistency values so that no more than 3 flat
              clusters are formed, do::

                  MI = maxinconsts(Z, R)
                  cluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)

    depth : int, optional
        The maximum depth to perform the inconsistency calculation.
        It has no meaning for the other criteria. Default is 2.
    R : ndarray, optional
        The inconsistency matrix to use for the 'inconsistent'
        criterion. This matrix is computed if not provided.
    monocrit : ndarray, optional
        An array of length n-1. `monocrit[i]` is the
        statistics upon which non-singleton i is thresholded. The
        monocrit vector must be monotonic, i.e. given a node c with
        index i, for all node indices j corresponding to nodes
        below c, ``monocrit[i] >= monocrit[j]``.

    Returns
    -------
    fcluster : ndarray
        An array of length ``n``. ``T[i]`` is the flat cluster number to
        which original observation ``i`` belongs.

    See Also
    --------
    linkage : for information about hierarchical clustering methods work.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, fcluster
    >>> from scipy.spatial.distance import pdist

    All cluster linkage methods - e.g. `scipy.cluster.hierarchy.ward`
    generate a linkage matrix ``Z`` as their output:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]
    >>> Z = ward(pdist(X))
    >>> Z
    array([[ 0.        ,  1.        ,  1.        ,  2.        ],
           [ 3.        ,  4.        ,  1.        ,  2.        ],
           [ 6.        ,  7.        ,  1.        ,  2.        ],
           [ 9.        , 10.        ,  1.        ,  2.        ],
           [ 2.        , 12.        ,  1.29099445,  3.        ],
           [ 5.        , 13.        ,  1.29099445,  3.        ],
           [ 8.        , 14.        ,  1.29099445,  3.        ],
           [11.        , 15.        ,  1.29099445,  3.        ],
           [16.        , 17.        ,  5.77350269,  6.        ],
           [18.        , 19.        ,  5.77350269,  6.        ],
           [20.        , 21.        ,  8.16496581, 12.        ]])

    This matrix represents a dendrogram, where the first and second elements
    are the two clusters merged at each step, the third element is the
    distance between these clusters, and the fourth element is the size of
    the new cluster - the number of original data points included.

    `scipy.cluster.hierarchy.fcluster` can be used to flatten the
    dendrogram, obtaining as a result an assignation of the original data
    points to single clusters.

    This assignation mostly depends on a distance threshold ``t`` - the maximum
    inter-cluster distance allowed:

    >>> fcluster(Z, t=0.9, criterion='distance')
    array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int32)

    >>> fcluster(Z, t=1.1, criterion='distance')
    array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)

    >>> fcluster(Z, t=3, criterion='distance')
    array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)

    >>> fcluster(Z, t=9, criterion='distance')
    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

    In the first case, the threshold ``t`` is too small to allow any two
    samples in the data to form a cluster, so 12 different clusters are
    returned.

    In the second case, the threshold is large enough to allow the first
    4 points to be merged with their nearest neighbors. So here only 8
    clusters are returned.

    The third case, with a much higher threshold, allows for up to 8 data
    points to be connected - so 4 clusters are returned here.

    Lastly, the threshold of the fourth case is large enough to allow for
    all data points to be merged together - so a single cluster is returned.

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')

    # n original observations imply n-1 merge rows in the linkage matrix.
    n = Z.shape[0] + 1
    # Output buffer filled in-place by the C routines below.
    T = np.zeros((n,), dtype='i')

    # Since the C code does not support striding using strides.
    # The dimensions are used instead.
    [Z] = _copy_arrays_if_base_present([Z])

    # Dispatch to the C implementation matching the requested criterion.
    if criterion == 'inconsistent':
        if R is None:
            # Compute the inconsistency matrix on demand.
            R = inconsistent(Z, depth)
        else:
            R = np.asarray(R, order='c')
            is_valid_im(R, throw=True, name='R')
            # Since the C code does not support striding using strides.
            # The dimensions are used instead.
            [R] = _copy_arrays_if_base_present([R])
        _hierarchy.cluster_in(Z, R, T, float(t), int(n))
    elif criterion == 'distance':
        _hierarchy.cluster_dist(Z, T, float(t), int(n))
    elif criterion == 'maxclust':
        # Here t is a cluster count, not a distance threshold.
        _hierarchy.cluster_maxclust_dist(Z, T, int(n), int(t))
    elif criterion == 'monocrit':
        [monocrit] = _copy_arrays_if_base_present([monocrit])
        _hierarchy.cluster_monocrit(Z, monocrit, T, float(t), int(n))
    elif criterion == 'maxclust_monocrit':
        [monocrit] = _copy_arrays_if_base_present([monocrit])
        _hierarchy.cluster_maxclust_monocrit(Z, monocrit, T, int(n), int(t))
    else:
        raise ValueError('Invalid cluster formation criterion: %s'
                         % str(criterion))
    return T
  2132. def fclusterdata(X, t, criterion='inconsistent',
  2133. metric='euclidean', depth=2, method='single', R=None):
  2134. """
  2135. Cluster observation data using a given metric.
  2136. Clusters the original observations in the n-by-m data
  2137. matrix X (n observations in m dimensions), using the euclidean
  2138. distance metric to calculate distances between original observations,
  2139. performs hierarchical clustering using the single linkage algorithm,
  2140. and forms flat clusters using the inconsistency method with `t` as the
  2141. cut-off threshold.
  2142. A one-dimensional array ``T`` of length ``n`` is returned. ``T[i]`` is
  2143. the index of the flat cluster to which the original observation ``i``
  2144. belongs.
  2145. Parameters
  2146. ----------
  2147. X : (N, M) ndarray
  2148. N by M data matrix with N observations in M dimensions.
  2149. t : scalar
  2150. For criteria 'inconsistent', 'distance' or 'monocrit',
  2151. this is the threshold to apply when forming flat clusters.
  2152. For 'maxclust' or 'maxclust_monocrit' criteria,
  2153. this would be max number of clusters requested.
  2154. criterion : str, optional
  2155. Specifies the criterion for forming flat clusters. Valid
  2156. values are 'inconsistent' (default), 'distance', or 'maxclust'
  2157. cluster formation algorithms. See `fcluster` for descriptions.
  2158. metric : str, optional
  2159. The distance metric for calculating pairwise distances. See
  2160. ``distance.pdist`` for descriptions and linkage to verify
  2161. compatibility with the linkage method.
  2162. depth : int, optional
  2163. The maximum depth for the inconsistency calculation. See
  2164. `inconsistent` for more information.
  2165. method : str, optional
  2166. The linkage method to use (single, complete, average,
  2167. weighted, median centroid, ward). See `linkage` for more
  2168. information. Default is "single".
  2169. R : ndarray, optional
  2170. The inconsistency matrix. It will be computed if necessary
  2171. if it is not passed.
  2172. Returns
  2173. -------
  2174. fclusterdata : ndarray
  2175. A vector of length n. T[i] is the flat cluster number to
  2176. which original observation i belongs.
  2177. See Also
  2178. --------
  2179. scipy.spatial.distance.pdist : pairwise distance metrics
  2180. Notes
  2181. -----
  2182. This function is similar to the MATLAB function ``clusterdata``.
  2183. Examples
  2184. --------
  2185. >>> from scipy.cluster.hierarchy import fclusterdata
  2186. This is a convenience method that abstracts all the steps to perform in a
  2187. typical Scipy's hierarchical clustering workflow.
  2188. * Transform the input data into a condensed matrix with `scipy.spatial.distance.pdist`.
  2189. * Apply a clustering method.
  2190. * Obtain flat clusters at a user defined distance threshold ``t`` using `scipy.cluster.hierarchy.fcluster`.
  2191. >>> X = [[0, 0], [0, 1], [1, 0],
  2192. ... [0, 4], [0, 3], [1, 4],
  2193. ... [4, 0], [3, 0], [4, 1],
  2194. ... [4, 4], [3, 4], [4, 3]]
  2195. >>> fclusterdata(X, t=1)
  2196. array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
  2197. The output here (for the dataset ``X``, distance threshold ``t``, and the
  2198. default settings) is four clusters with three data points each.
  2199. """
  2200. X = np.asarray(X, order='c', dtype=np.double)
  2201. if type(X) != np.ndarray or len(X.shape) != 2:
  2202. raise TypeError('The observation matrix X must be an n by m numpy '
  2203. 'array.')
  2204. Y = distance.pdist(X, metric=metric)
  2205. Z = linkage(Y, method=method)
  2206. if R is None:
  2207. R = inconsistent(Z, d=depth)
  2208. else:
  2209. R = np.asarray(R, order='c')
  2210. T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)
  2211. return T
  2212. def leaves_list(Z):
  2213. """
  2214. Return a list of leaf node ids.
  2215. The return corresponds to the observation vector index as it appears
  2216. in the tree from left to right. Z is a linkage matrix.
  2217. Parameters
  2218. ----------
  2219. Z : ndarray
  2220. The hierarchical clustering encoded as a matrix. `Z` is
  2221. a linkage matrix. See `linkage` for more information.
  2222. Returns
  2223. -------
  2224. leaves_list : ndarray
  2225. The list of leaf node ids.
  2226. See Also
  2227. --------
  2228. dendrogram: for information about dendrogram structure.
  2229. Examples
  2230. --------
  2231. >>> from scipy.cluster.hierarchy import ward, dendrogram, leaves_list
  2232. >>> from scipy.spatial.distance import pdist
  2233. >>> from matplotlib import pyplot as plt
  2234. >>> X = [[0, 0], [0, 1], [1, 0],
  2235. ... [0, 4], [0, 3], [1, 4],
  2236. ... [4, 0], [3, 0], [4, 1],
  2237. ... [4, 4], [3, 4], [4, 3]]
  2238. >>> Z = ward(pdist(X))
  2239. The linkage matrix ``Z`` represents a dendrogram, that is, a tree that
  2240. encodes the structure of the clustering performed.
  2241. `scipy.cluster.hierarchy.leaves_list` shows the mapping between
  2242. indexes in the ``X`` dataset and leaves in the dendrogram:
  2243. >>> leaves_list(Z)
  2244. array([ 2, 0, 1, 5, 3, 4, 8, 6, 7, 11, 9, 10], dtype=int32)
  2245. >>> fig = plt.figure(figsize=(25, 10))
  2246. >>> dn = dendrogram(Z)
  2247. >>> plt.show()
  2248. """
  2249. Z = np.asarray(Z, order='c')
  2250. is_valid_linkage(Z, throw=True, name='Z')
  2251. n = Z.shape[0] + 1
  2252. ML = np.zeros((n,), dtype='i')
  2253. [Z] = _copy_arrays_if_base_present([Z])
  2254. _hierarchy.prelist(Z, ML, int(n))
  2255. return ML
  2256. # Maps number of leaves to text size.
  2257. #
  2258. # p <= 20, size="12"
  2259. # 20 < p <= 30, size="10"
  2260. # 30 < p <= 50, size="8"
  2261. # 50 < p <= np.inf, size="6"
  2262. _dtextsizes = {20: 12, 30: 10, 50: 8, 85: 6, np.inf: 5}
  2263. _drotation = {20: 0, 40: 45, np.inf: 90}
  2264. _dtextsortedkeys = list(_dtextsizes.keys())
  2265. _dtextsortedkeys.sort()
  2266. _drotationsortedkeys = list(_drotation.keys())
  2267. _drotationsortedkeys.sort()
  2268. def _remove_dups(L):
  2269. """
  2270. Remove duplicates AND preserve the original order of the elements.
  2271. The set class is not guaranteed to do this.
  2272. """
  2273. seen_before = set([])
  2274. L2 = []
  2275. for i in L:
  2276. if i not in seen_before:
  2277. seen_before.add(i)
  2278. L2.append(i)
  2279. return L2
  2280. def _get_tick_text_size(p):
  2281. for k in _dtextsortedkeys:
  2282. if p <= k:
  2283. return _dtextsizes[k]
  2284. def _get_tick_rotation(p):
  2285. for k in _drotationsortedkeys:
  2286. if p <= k:
  2287. return _drotation[k]
def _plot_dendrogram(icoords, dcoords, ivl, p, n, mh, orientation,
                     no_labels, color_list, leaf_font_size=None,
                     leaf_rotation=None, contraction_marks=None,
                     ax=None, above_threshold_color='b'):
    """
    Render a dendrogram computed by `dendrogram` onto a matplotlib axes.

    ``icoords``/``dcoords`` hold the x/y coordinate quadruples of each
    U-shaped link, ``ivl`` the leaf labels, ``mh`` the maximum link height,
    and ``color_list`` one color per link.  When ``ax`` is None the current
    pylab axes is used and an interactive redraw is triggered at the end.
    """
    # Import matplotlib here so that it's not imported unless dendrograms
    # are plotted. Raise an informative error if importing fails.
    try:
        # if an axis is provided, don't use pylab at all
        if ax is None:
            import matplotlib.pylab
        import matplotlib.patches
        import matplotlib.collections
    except ImportError:
        raise ImportError("You must install the matplotlib library to plot "
                          "the dendrogram. Use no_plot=True to calculate the "
                          "dendrogram without plotting.")

    if ax is None:
        ax = matplotlib.pylab.gca()
        # if we're using pylab, we want to trigger a draw at the end
        trigger_redraw = True
    else:
        trigger_redraw = False

    # Independent variable plot width: each leaf occupies 10 units.
    ivw = len(ivl) * 10
    # Dependent variable plot height: max link height plus 5% headroom.
    dvw = mh + mh * 0.05

    # One tick at the center of each leaf's 10-unit slot.
    iv_ticks = np.arange(5, len(ivl) * 10 + 5, 10)
    if orientation in ('top', 'bottom'):
        # 'bottom' flips the dependent axis so the root is at the bottom.
        if orientation == 'top':
            ax.set_ylim([0, dvw])
            ax.set_xlim([0, ivw])
        else:
            ax.set_ylim([dvw, 0])
            ax.set_xlim([0, ivw])

        xlines = icoords
        ylines = dcoords
        if no_labels:
            ax.set_xticks([])
            ax.set_xticklabels([])
        else:
            ax.set_xticks(iv_ticks)

            if orientation == 'top':
                ax.xaxis.set_ticks_position('bottom')
            else:
                ax.xaxis.set_ticks_position('top')

            # Make the tick marks invisible because they cover up the links
            for line in ax.get_xticklines():
                line.set_visible(False)

            # Fall back to size/rotation based on leaf count when the caller
            # did not specify them explicitly.
            leaf_rot = (float(_get_tick_rotation(len(ivl)))
                        if (leaf_rotation is None) else leaf_rotation)
            leaf_font = (float(_get_tick_text_size(len(ivl)))
                         if (leaf_font_size is None) else leaf_font_size)
            ax.set_xticklabels(ivl, rotation=leaf_rot, size=leaf_font)

    elif orientation in ('left', 'right'):
        # For horizontal orientations the coordinate roles are swapped:
        # distances go on x, leaf positions on y.
        if orientation == 'left':
            ax.set_xlim([dvw, 0])
            ax.set_ylim([0, ivw])
        else:
            ax.set_xlim([0, dvw])
            ax.set_ylim([0, ivw])

        xlines = dcoords
        ylines = icoords
        if no_labels:
            ax.set_yticks([])
            ax.set_yticklabels([])
        else:
            ax.set_yticks(iv_ticks)

            if orientation == 'left':
                ax.yaxis.set_ticks_position('right')
            else:
                ax.yaxis.set_ticks_position('left')

            # Make the tick marks invisible because they cover up the links
            for line in ax.get_yticklines():
                line.set_visible(False)

            leaf_font = (float(_get_tick_text_size(len(ivl)))
                         if (leaf_font_size is None) else leaf_font_size)

            if leaf_rotation is not None:
                ax.set_yticklabels(ivl, rotation=leaf_rotation, size=leaf_font)
            else:
                ax.set_yticklabels(ivl, size=leaf_font)

    # Let's use collections instead. This way there is a separate legend item
    # for each tree grouping, rather than stupidly one for each line segment.
    colors_used = _remove_dups(color_list)
    color_to_lines = {}
    for color in colors_used:
        color_to_lines[color] = []
    for (xline, yline, color) in zip(xlines, ylines, color_list):
        color_to_lines[color].append(list(zip(xline, yline)))

    colors_to_collections = {}
    # Construct the collections.
    for color in colors_used:
        coll = matplotlib.collections.LineCollection(color_to_lines[color],
                                                     colors=(color,))
        colors_to_collections[color] = coll

    # Add all the groupings below the color threshold.
    for color in colors_used:
        if color != above_threshold_color:
            ax.add_collection(colors_to_collections[color])
    # If there's a grouping of links above the color threshold, it goes last.
    if above_threshold_color in colors_to_collections:
        ax.add_collection(colors_to_collections[above_threshold_color])

    # Mark contracted (truncated) subtrees with small translucent ellipses
    # along the link of the leaf they were collapsed into.
    if contraction_marks is not None:
        Ellipse = matplotlib.patches.Ellipse
        for (x, y) in contraction_marks:
            if orientation in ('left', 'right'):
                e = Ellipse((y, x), width=dvw / 100, height=1.0)
            else:
                e = Ellipse((x, y), width=1.0, height=dvw / 100)
            ax.add_artist(e)
            e.set_clip_box(ax.bbox)
            e.set_alpha(0.5)
            e.set_facecolor('k')

    if trigger_redraw:
        matplotlib.pylab.draw_if_interactive()
# Default color palette cycled through when coloring dendrogram links below
# ``color_threshold``; mutated in place by `set_link_color_palette`.
_link_line_colors = ['g', 'r', 'c', 'm', 'y', 'k']
  2403. def set_link_color_palette(palette):
  2404. """
  2405. Set list of matplotlib color codes for use by dendrogram.
  2406. Note that this palette is global (i.e. setting it once changes the colors
  2407. for all subsequent calls to `dendrogram`) and that it affects only the
  2408. the colors below ``color_threshold``.
  2409. Note that `dendrogram` also accepts a custom coloring function through its
  2410. ``link_color_func`` keyword, which is more flexible and non-global.
  2411. Parameters
  2412. ----------
  2413. palette : list of str or None
  2414. A list of matplotlib color codes. The order of the color codes is the
  2415. order in which the colors are cycled through when color thresholding in
  2416. the dendrogram.
  2417. If ``None``, resets the palette to its default (which is
  2418. ``['g', 'r', 'c', 'm', 'y', 'k']``).
  2419. Returns
  2420. -------
  2421. None
  2422. See Also
  2423. --------
  2424. dendrogram
  2425. Notes
  2426. -----
  2427. Ability to reset the palette with ``None`` added in Scipy 0.17.0.
  2428. Examples
  2429. --------
  2430. >>> from scipy.cluster import hierarchy
  2431. >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
  2432. ... 400., 754., 564., 138., 219., 869., 669.])
  2433. >>> Z = hierarchy.linkage(ytdist, 'single')
  2434. >>> dn = hierarchy.dendrogram(Z, no_plot=True)
  2435. >>> dn['color_list']
  2436. ['g', 'b', 'b', 'b', 'b']
  2437. >>> hierarchy.set_link_color_palette(['c', 'm', 'y', 'k'])
  2438. >>> dn = hierarchy.dendrogram(Z, no_plot=True)
  2439. >>> dn['color_list']
  2440. ['c', 'b', 'b', 'b', 'b']
  2441. >>> dn = hierarchy.dendrogram(Z, no_plot=True, color_threshold=267,
  2442. ... above_threshold_color='k')
  2443. >>> dn['color_list']
  2444. ['c', 'm', 'm', 'k', 'k']
  2445. Now reset the color palette to its default:
  2446. >>> hierarchy.set_link_color_palette(None)
  2447. """
  2448. if palette is None:
  2449. # reset to its default
  2450. palette = ['g', 'r', 'c', 'm', 'y', 'k']
  2451. elif type(palette) not in (list, tuple):
  2452. raise TypeError("palette must be a list or tuple")
  2453. _ptypes = [isinstance(p, string_types) for p in palette]
  2454. if False in _ptypes:
  2455. raise TypeError("all palette list elements must be color strings")
  2456. for i in list(_link_line_colors):
  2457. _link_line_colors.remove(i)
  2458. _link_line_colors.extend(list(palette))
  2459. def dendrogram(Z, p=30, truncate_mode=None, color_threshold=None,
  2460. get_leaves=True, orientation='top', labels=None,
  2461. count_sort=False, distance_sort=False, show_leaf_counts=True,
  2462. no_plot=False, no_labels=False, leaf_font_size=None,
  2463. leaf_rotation=None, leaf_label_func=None,
  2464. show_contracted=False, link_color_func=None, ax=None,
  2465. above_threshold_color='b'):
  2466. """
  2467. Plot the hierarchical clustering as a dendrogram.
  2468. The dendrogram illustrates how each cluster is
  2469. composed by drawing a U-shaped link between a non-singleton
  2470. cluster and its children. The top of the U-link indicates a
  2471. cluster merge. The two legs of the U-link indicate which clusters
  2472. were merged. The length of the two legs of the U-link represents
  2473. the distance between the child clusters. It is also the
  2474. cophenetic distance between original observations in the two
  2475. children clusters.
  2476. Parameters
  2477. ----------
  2478. Z : ndarray
  2479. The linkage matrix encoding the hierarchical clustering to
  2480. render as a dendrogram. See the ``linkage`` function for more
  2481. information on the format of ``Z``.
  2482. p : int, optional
  2483. The ``p`` parameter for ``truncate_mode``.
  2484. truncate_mode : str, optional
  2485. The dendrogram can be hard to read when the original
  2486. observation matrix from which the linkage is derived is
  2487. large. Truncation is used to condense the dendrogram. There
  2488. are several modes:
  2489. ``None``
  2490. No truncation is performed (default).
  2491. Note: ``'none'`` is an alias for ``None`` that's kept for
  2492. backward compatibility.
  2493. ``'lastp'``
  2494. The last ``p`` non-singleton clusters formed in the linkage are the
  2495. only non-leaf nodes in the linkage; they correspond to rows
  2496. ``Z[n-p-2:end]`` in ``Z``. All other non-singleton clusters are
  2497. contracted into leaf nodes.
  2498. ``'level'``
  2499. No more than ``p`` levels of the dendrogram tree are displayed.
  2500. A "level" includes all nodes with ``p`` merges from the last merge.
  2501. Note: ``'mtica'`` is an alias for ``'level'`` that's kept for
  2502. backward compatibility.
  2503. color_threshold : double, optional
  2504. For brevity, let :math:`t` be the ``color_threshold``.
  2505. Colors all the descendent links below a cluster node
  2506. :math:`k` the same color if :math:`k` is the first node below
  2507. the cut threshold :math:`t`. All links connecting nodes with
  2508. distances greater than or equal to the threshold are colored
  2509. blue. If :math:`t` is less than or equal to zero, all nodes
  2510. are colored blue. If ``color_threshold`` is None or
  2511. 'default', corresponding with MATLAB(TM) behavior, the
  2512. threshold is set to ``0.7*max(Z[:,2])``.
  2513. get_leaves : bool, optional
  2514. Includes a list ``R['leaves']=H`` in the result
  2515. dictionary. For each :math:`i`, ``H[i] == j``, cluster node
  2516. ``j`` appears in position ``i`` in the left-to-right traversal
  2517. of the leaves, where :math:`j < 2n-1` and :math:`i < n`.
  2518. orientation : str, optional
  2519. The direction to plot the dendrogram, which can be any
  2520. of the following strings:
  2521. ``'top'``
  2522. Plots the root at the top, and plot descendent links going downwards.
  2523. (default).
  2524. ``'bottom'``
  2525. Plots the root at the bottom, and plot descendent links going
  2526. upwards.
  2527. ``'left'``
  2528. Plots the root at the left, and plot descendent links going right.
  2529. ``'right'``
  2530. Plots the root at the right, and plot descendent links going left.
  2531. labels : ndarray, optional
  2532. By default ``labels`` is None so the index of the original observation
  2533. is used to label the leaf nodes. Otherwise, this is an :math:`n`
  2534. -sized list (or tuple). The ``labels[i]`` value is the text to put
  2535. under the :math:`i` th leaf node only if it corresponds to an original
  2536. observation and not a non-singleton cluster.
  2537. count_sort : str or bool, optional
  2538. For each node n, the order (visually, from left-to-right) n's
  2539. two descendent links are plotted is determined by this
  2540. parameter, which can be any of the following values:
  2541. ``False``
  2542. Nothing is done.
  2543. ``'ascending'`` or ``True``
  2544. The child with the minimum number of original objects in its cluster
  2545. is plotted first.
  2546. ``'descending'``
  2547. The child with the maximum number of original objects in its cluster
  2548. is plotted first.
  2549. Note ``distance_sort`` and ``count_sort`` cannot both be True.
  2550. distance_sort : str or bool, optional
  2551. For each node n, the order (visually, from left-to-right) n's
  2552. two descendent links are plotted is determined by this
  2553. parameter, which can be any of the following values:
  2554. ``False``
  2555. Nothing is done.
  2556. ``'ascending'`` or ``True``
  2557. The child with the minimum distance between its direct descendents is
  2558. plotted first.
  2559. ``'descending'``
  2560. The child with the maximum distance between its direct descendents is
  2561. plotted first.
  2562. Note ``distance_sort`` and ``count_sort`` cannot both be True.
  2563. show_leaf_counts : bool, optional
  2564. When True, leaf nodes representing :math:`k>1` original
  2565. observation are labeled with the number of observations they
  2566. contain in parentheses.
  2567. no_plot : bool, optional
  2568. When True, the final rendering is not performed. This is
  2569. useful if only the data structures computed for the rendering
  2570. are needed or if matplotlib is not available.
  2571. no_labels : bool, optional
  2572. When True, no labels appear next to the leaf nodes in the
  2573. rendering of the dendrogram.
  2574. leaf_rotation : double, optional
  2575. Specifies the angle (in degrees) to rotate the leaf
  2576. labels. When unspecified, the rotation is based on the number of
  2577. nodes in the dendrogram (default is 0).
  2578. leaf_font_size : int, optional
  2579. Specifies the font size (in points) of the leaf labels. When
  2580. unspecified, the size based on the number of nodes in the
  2581. dendrogram.
  2582. leaf_label_func : lambda or function, optional
  2583. When leaf_label_func is a callable function, for each
  2584. leaf with cluster index :math:`k < 2n-1`. The function
  2585. is expected to return a string with the label for the
  2586. leaf.
  2587. Indices :math:`k < n` correspond to original observations
  2588. while indices :math:`k \\geq n` correspond to non-singleton
  2589. clusters.
  2590. For example, to label singletons with their node id and
  2591. non-singletons with their id, count, and inconsistency
  2592. coefficient, simply do::
  2593. # First define the leaf label function.
  2594. def llf(id):
  2595. if id < n:
  2596. return str(id)
  2597. else:
  2598. return '[%d %d %1.2f]' % (id, count, R[n-id,3])
  2599. # The text for the leaf nodes is going to be big so force
  2600. # a rotation of 90 degrees.
  2601. dendrogram(Z, leaf_label_func=llf, leaf_rotation=90)
  2602. show_contracted : bool, optional
  2603. When True the heights of non-singleton nodes contracted
  2604. into a leaf node are plotted as crosses along the link
  2605. connecting that leaf node. This really is only useful when
  2606. truncation is used (see ``truncate_mode`` parameter).
  2607. link_color_func : callable, optional
  2608. If given, `link_color_function` is called with each non-singleton id
  2609. corresponding to each U-shaped link it will paint. The function is
  2610. expected to return the color to paint the link, encoded as a matplotlib
  2611. color string code. For example::
  2612. dendrogram(Z, link_color_func=lambda k: colors[k])
  2613. colors the direct links below each untruncated non-singleton node
  2614. ``k`` using ``colors[k]``.
  2615. ax : matplotlib Axes instance, optional
  2616. If None and `no_plot` is not True, the dendrogram will be plotted
  2617. on the current axes. Otherwise if `no_plot` is not True the
  2618. dendrogram will be plotted on the given ``Axes`` instance. This can be
  2619. useful if the dendrogram is part of a more complex figure.
  2620. above_threshold_color : str, optional
  2621. This matplotlib color string sets the color of the links above the
  2622. color_threshold. The default is 'b'.
  2623. Returns
  2624. -------
  2625. R : dict
  2626. A dictionary of data structures computed to render the
  2627. dendrogram. Its has the following keys:
  2628. ``'color_list'``
  2629. A list of color names. The k'th element represents the color of the
  2630. k'th link.
  2631. ``'icoord'`` and ``'dcoord'``
  2632. Each of them is a list of lists. Let ``icoord = [I1, I2, ..., Ip]``
  2633. where ``Ik = [xk1, xk2, xk3, xk4]`` and ``dcoord = [D1, D2, ..., Dp]``
  2634. where ``Dk = [yk1, yk2, yk3, yk4]``, then the k'th link painted is
  2635. ``(xk1, yk1)`` - ``(xk2, yk2)`` - ``(xk3, yk3)`` - ``(xk4, yk4)``.
  2636. ``'ivl'``
  2637. A list of labels corresponding to the leaf nodes.
  2638. ``'leaves'``
  2639. For each i, ``H[i] == j``, cluster node ``j`` appears in position
  2640. ``i`` in the left-to-right traversal of the leaves, where
  2641. :math:`j < 2n-1` and :math:`i < n`. If ``j`` is less than ``n``, the
  2642. ``i``-th leaf node corresponds to an original observation.
  2643. Otherwise, it corresponds to a non-singleton cluster.
  2644. See Also
  2645. --------
  2646. linkage, set_link_color_palette
  2647. Notes
  2648. -----
  2649. It is expected that the distances in ``Z[:,2]`` be monotonic, otherwise
  2650. crossings appear in the dendrogram.
  2651. Examples
  2652. --------
  2653. >>> from scipy.cluster import hierarchy
  2654. >>> import matplotlib.pyplot as plt
  2655. A very basic example:
  2656. >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
  2657. ... 400., 754., 564., 138., 219., 869., 669.])
  2658. >>> Z = hierarchy.linkage(ytdist, 'single')
  2659. >>> plt.figure()
  2660. >>> dn = hierarchy.dendrogram(Z)
  2661. Now plot in given axes, improve the color scheme and use both vertical and
  2662. horizontal orientations:
  2663. >>> hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
  2664. >>> fig, axes = plt.subplots(1, 2, figsize=(8, 3))
  2665. >>> dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
  2666. ... orientation='top')
  2667. >>> dn2 = hierarchy.dendrogram(Z, ax=axes[1],
  2668. ... above_threshold_color='#bcbddc',
  2669. ... orientation='right')
  2670. >>> hierarchy.set_link_color_palette(None) # reset to default after use
  2671. >>> plt.show()
  2672. """
  2673. # This feature was thought about but never implemented (still useful?):
  2674. #
  2675. # ... = dendrogram(..., leaves_order=None)
  2676. #
  2677. # Plots the leaves in the order specified by a vector of
  2678. # original observation indices. If the vector contains duplicates
  2679. # or results in a crossing, an exception will be thrown. Passing
  2680. # None orders leaf nodes based on the order they appear in the
  2681. # pre-order traversal.
  2682. Z = np.asarray(Z, order='c')
  2683. if orientation not in ["top", "left", "bottom", "right"]:
  2684. raise ValueError("orientation must be one of 'top', 'left', "
  2685. "'bottom', or 'right'")
  2686. is_valid_linkage(Z, throw=True, name='Z')
  2687. Zs = Z.shape
  2688. n = Zs[0] + 1
  2689. if type(p) in (int, float):
  2690. p = int(p)
  2691. else:
  2692. raise TypeError('The second argument must be a number')
  2693. if truncate_mode not in ('lastp', 'mlab', 'mtica', 'level', 'none', None):
  2694. # 'mlab' and 'mtica' are kept working for backwards compat.
  2695. raise ValueError('Invalid truncation mode.')
  2696. if truncate_mode == 'lastp' or truncate_mode == 'mlab':
  2697. if p > n or p == 0:
  2698. p = n
  2699. if truncate_mode == 'mtica':
  2700. # 'mtica' is an alias
  2701. truncate_mode = 'level'
  2702. if truncate_mode == 'level':
  2703. if p <= 0:
  2704. p = np.inf
  2705. if get_leaves:
  2706. lvs = []
  2707. else:
  2708. lvs = None
  2709. icoord_list = []
  2710. dcoord_list = []
  2711. color_list = []
  2712. current_color = [0]
  2713. currently_below_threshold = [False]
  2714. ivl = [] # list of leaves
  2715. if color_threshold is None or (isinstance(color_threshold, string_types) and
  2716. color_threshold == 'default'):
  2717. color_threshold = max(Z[:, 2]) * 0.7
  2718. R = {'icoord': icoord_list, 'dcoord': dcoord_list, 'ivl': ivl,
  2719. 'leaves': lvs, 'color_list': color_list}
  2720. # Empty list will be filled in _dendrogram_calculate_info
  2721. contraction_marks = [] if show_contracted else None
  2722. _dendrogram_calculate_info(
  2723. Z=Z, p=p,
  2724. truncate_mode=truncate_mode,
  2725. color_threshold=color_threshold,
  2726. get_leaves=get_leaves,
  2727. orientation=orientation,
  2728. labels=labels,
  2729. count_sort=count_sort,
  2730. distance_sort=distance_sort,
  2731. show_leaf_counts=show_leaf_counts,
  2732. i=2*n - 2,
  2733. iv=0.0,
  2734. ivl=ivl,
  2735. n=n,
  2736. icoord_list=icoord_list,
  2737. dcoord_list=dcoord_list,
  2738. lvs=lvs,
  2739. current_color=current_color,
  2740. color_list=color_list,
  2741. currently_below_threshold=currently_below_threshold,
  2742. leaf_label_func=leaf_label_func,
  2743. contraction_marks=contraction_marks,
  2744. link_color_func=link_color_func,
  2745. above_threshold_color=above_threshold_color)
  2746. if not no_plot:
  2747. mh = max(Z[:, 2])
  2748. _plot_dendrogram(icoord_list, dcoord_list, ivl, p, n, mh, orientation,
  2749. no_labels, color_list,
  2750. leaf_font_size=leaf_font_size,
  2751. leaf_rotation=leaf_rotation,
  2752. contraction_marks=contraction_marks,
  2753. ax=ax,
  2754. above_threshold_color=above_threshold_color)
  2755. return R
  2756. def _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
  2757. i, labels):
  2758. # If the leaf id structure is not None and is a list then the caller
  2759. # to dendrogram has indicated that cluster id's corresponding to the
  2760. # leaf nodes should be recorded.
  2761. if lvs is not None:
  2762. lvs.append(int(i))
  2763. # If leaf node labels are to be displayed...
  2764. if ivl is not None:
  2765. # If a leaf_label_func has been provided, the label comes from the
  2766. # string returned from the leaf_label_func, which is a function
  2767. # passed to dendrogram.
  2768. if leaf_label_func:
  2769. ivl.append(leaf_label_func(int(i)))
  2770. else:
  2771. # Otherwise, if the dendrogram caller has passed a labels list
  2772. # for the leaf nodes, use it.
  2773. if labels is not None:
  2774. ivl.append(labels[int(i - n)])
  2775. else:
  2776. # Otherwise, use the id as the label for the leaf.x
  2777. ivl.append(str(int(i)))
  2778. def _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
  2779. i, labels, show_leaf_counts):
  2780. # If the leaf id structure is not None and is a list then the caller
  2781. # to dendrogram has indicated that cluster id's corresponding to the
  2782. # leaf nodes should be recorded.
  2783. if lvs is not None:
  2784. lvs.append(int(i))
  2785. if ivl is not None:
  2786. if leaf_label_func:
  2787. ivl.append(leaf_label_func(int(i)))
  2788. else:
  2789. if show_leaf_counts:
  2790. ivl.append("(" + str(int(Z[i - n, 3])) + ")")
  2791. else:
  2792. ivl.append("")
  2793. def _append_contraction_marks(Z, iv, i, n, contraction_marks):
  2794. _append_contraction_marks_sub(Z, iv, int(Z[i - n, 0]), n, contraction_marks)
  2795. _append_contraction_marks_sub(Z, iv, int(Z[i - n, 1]), n, contraction_marks)
  2796. def _append_contraction_marks_sub(Z, iv, i, n, contraction_marks):
  2797. if i >= n:
  2798. contraction_marks.append((iv, Z[i - n, 2]))
  2799. _append_contraction_marks_sub(Z, iv, int(Z[i - n, 0]), n, contraction_marks)
  2800. _append_contraction_marks_sub(Z, iv, int(Z[i - n, 1]), n, contraction_marks)
  2801. def _dendrogram_calculate_info(Z, p, truncate_mode,
  2802. color_threshold=np.inf, get_leaves=True,
  2803. orientation='top', labels=None,
  2804. count_sort=False, distance_sort=False,
  2805. show_leaf_counts=False, i=-1, iv=0.0,
  2806. ivl=[], n=0, icoord_list=[], dcoord_list=[],
  2807. lvs=None, mhr=False,
  2808. current_color=[], color_list=[],
  2809. currently_below_threshold=[],
  2810. leaf_label_func=None, level=0,
  2811. contraction_marks=None,
  2812. link_color_func=None,
  2813. above_threshold_color='b'):
  2814. """
  2815. Calculate the endpoints of the links as well as the labels for the
  2816. the dendrogram rooted at the node with index i. iv is the independent
  2817. variable value to plot the left-most leaf node below the root node i
  2818. (if orientation='top', this would be the left-most x value where the
  2819. plotting of this root node i and its descendents should begin).
  2820. ivl is a list to store the labels of the leaf nodes. The leaf_label_func
  2821. is called whenever ivl != None, labels == None, and
  2822. leaf_label_func != None. When ivl != None and labels != None, the
  2823. labels list is used only for labeling the leaf nodes. When
  2824. ivl == None, no labels are generated for leaf nodes.
  2825. When get_leaves==True, a list of leaves is built as they are visited
  2826. in the dendrogram.
  2827. Returns a tuple with l being the independent variable coordinate that
  2828. corresponds to the midpoint of cluster to the left of cluster i if
  2829. i is non-singleton, otherwise the independent coordinate of the leaf
  2830. node if i is a leaf node.
  2831. Returns
  2832. -------
  2833. A tuple (left, w, h, md), where:
  2834. * left is the independent variable coordinate of the center of the
  2835. the U of the subtree
  2836. * w is the amount of space used for the subtree (in independent
  2837. variable units)
  2838. * h is the height of the subtree in dependent variable units
  2839. * md is the ``max(Z[*,2]``) for all nodes ``*`` below and including
  2840. the target node.
  2841. """
  2842. if n == 0:
  2843. raise ValueError("Invalid singleton cluster count n.")
  2844. if i == -1:
  2845. raise ValueError("Invalid root cluster index i.")
  2846. if truncate_mode == 'lastp':
  2847. # If the node is a leaf node but corresponds to a non-singleton
  2848. # cluster, its label is either the empty string or the number of
  2849. # original observations belonging to cluster i.
  2850. if 2*n - p > i >= n:
  2851. d = Z[i - n, 2]
  2852. _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
  2853. leaf_label_func, i, labels,
  2854. show_leaf_counts)
  2855. if contraction_marks is not None:
  2856. _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks)
  2857. return (iv + 5.0, 10.0, 0.0, d)
  2858. elif i < n:
  2859. _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
  2860. leaf_label_func, i, labels)
  2861. return (iv + 5.0, 10.0, 0.0, 0.0)
  2862. elif truncate_mode == 'level':
  2863. if i > n and level > p:
  2864. d = Z[i - n, 2]
  2865. _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
  2866. leaf_label_func, i, labels,
  2867. show_leaf_counts)
  2868. if contraction_marks is not None:
  2869. _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks)
  2870. return (iv + 5.0, 10.0, 0.0, d)
  2871. elif i < n:
  2872. _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
  2873. leaf_label_func, i, labels)
  2874. return (iv + 5.0, 10.0, 0.0, 0.0)
  2875. elif truncate_mode in ('mlab',):
  2876. msg = "Mode 'mlab' is deprecated in scipy 0.19.0 (it never worked)."
  2877. warnings.warn(msg, DeprecationWarning)
  2878. # Otherwise, only truncate if we have a leaf node.
  2879. #
  2880. # Only place leaves if they correspond to original observations.
  2881. if i < n:
  2882. _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
  2883. leaf_label_func, i, labels)
  2884. return (iv + 5.0, 10.0, 0.0, 0.0)
  2885. # !!! Otherwise, we don't have a leaf node, so work on plotting a
  2886. # non-leaf node.
  2887. # Actual indices of a and b
  2888. aa = int(Z[i - n, 0])
  2889. ab = int(Z[i - n, 1])
  2890. if aa > n:
  2891. # The number of singletons below cluster a
  2892. na = Z[aa - n, 3]
  2893. # The distance between a's two direct children.
  2894. da = Z[aa - n, 2]
  2895. else:
  2896. na = 1
  2897. da = 0.0
  2898. if ab > n:
  2899. nb = Z[ab - n, 3]
  2900. db = Z[ab - n, 2]
  2901. else:
  2902. nb = 1
  2903. db = 0.0
  2904. if count_sort == 'ascending' or count_sort:
  2905. # If a has a count greater than b, it and its descendents should
  2906. # be drawn to the right. Otherwise, to the left.
  2907. if na > nb:
  2908. # The cluster index to draw to the left (ua) will be ab
  2909. # and the one to draw to the right (ub) will be aa
  2910. ua = ab
  2911. ub = aa
  2912. else:
  2913. ua = aa
  2914. ub = ab
  2915. elif count_sort == 'descending':
  2916. # If a has a count less than or equal to b, it and its
  2917. # descendents should be drawn to the left. Otherwise, to
  2918. # the right.
  2919. if na > nb:
  2920. ua = aa
  2921. ub = ab
  2922. else:
  2923. ua = ab
  2924. ub = aa
  2925. elif distance_sort == 'ascending' or distance_sort:
  2926. # If a has a distance greater than b, it and its descendents should
  2927. # be drawn to the right. Otherwise, to the left.
  2928. if da > db:
  2929. ua = ab
  2930. ub = aa
  2931. else:
  2932. ua = aa
  2933. ub = ab
  2934. elif distance_sort == 'descending':
  2935. # If a has a distance less than or equal to b, it and its
  2936. # descendents should be drawn to the left. Otherwise, to
  2937. # the right.
  2938. if da > db:
  2939. ua = aa
  2940. ub = ab
  2941. else:
  2942. ua = ab
  2943. ub = aa
  2944. else:
  2945. ua = aa
  2946. ub = ab
  2947. # Updated iv variable and the amount of space used.
  2948. (uiva, uwa, uah, uamd) = \
  2949. _dendrogram_calculate_info(
  2950. Z=Z, p=p,
  2951. truncate_mode=truncate_mode,
  2952. color_threshold=color_threshold,
  2953. get_leaves=get_leaves,
  2954. orientation=orientation,
  2955. labels=labels,
  2956. count_sort=count_sort,
  2957. distance_sort=distance_sort,
  2958. show_leaf_counts=show_leaf_counts,
  2959. i=ua, iv=iv, ivl=ivl, n=n,
  2960. icoord_list=icoord_list,
  2961. dcoord_list=dcoord_list, lvs=lvs,
  2962. current_color=current_color,
  2963. color_list=color_list,
  2964. currently_below_threshold=currently_below_threshold,
  2965. leaf_label_func=leaf_label_func,
  2966. level=level + 1, contraction_marks=contraction_marks,
  2967. link_color_func=link_color_func,
  2968. above_threshold_color=above_threshold_color)
  2969. h = Z[i - n, 2]
  2970. if h >= color_threshold or color_threshold <= 0:
  2971. c = above_threshold_color
  2972. if currently_below_threshold[0]:
  2973. current_color[0] = (current_color[0] + 1) % len(_link_line_colors)
  2974. currently_below_threshold[0] = False
  2975. else:
  2976. currently_below_threshold[0] = True
  2977. c = _link_line_colors[current_color[0]]
  2978. (uivb, uwb, ubh, ubmd) = \
  2979. _dendrogram_calculate_info(
  2980. Z=Z, p=p,
  2981. truncate_mode=truncate_mode,
  2982. color_threshold=color_threshold,
  2983. get_leaves=get_leaves,
  2984. orientation=orientation,
  2985. labels=labels,
  2986. count_sort=count_sort,
  2987. distance_sort=distance_sort,
  2988. show_leaf_counts=show_leaf_counts,
  2989. i=ub, iv=iv + uwa, ivl=ivl, n=n,
  2990. icoord_list=icoord_list,
  2991. dcoord_list=dcoord_list, lvs=lvs,
  2992. current_color=current_color,
  2993. color_list=color_list,
  2994. currently_below_threshold=currently_below_threshold,
  2995. leaf_label_func=leaf_label_func,
  2996. level=level + 1, contraction_marks=contraction_marks,
  2997. link_color_func=link_color_func,
  2998. above_threshold_color=above_threshold_color)
  2999. max_dist = max(uamd, ubmd, h)
  3000. icoord_list.append([uiva, uiva, uivb, uivb])
  3001. dcoord_list.append([uah, h, h, ubh])
  3002. if link_color_func is not None:
  3003. v = link_color_func(int(i))
  3004. if not isinstance(v, string_types):
  3005. raise TypeError("link_color_func must return a matplotlib "
  3006. "color string!")
  3007. color_list.append(v)
  3008. else:
  3009. color_list.append(c)
  3010. return (((uiva + uivb) / 2), uwa + uwb, h, max_dist)
  3011. def is_isomorphic(T1, T2):
  3012. """
  3013. Determine if two different cluster assignments are equivalent.
  3014. Parameters
  3015. ----------
  3016. T1 : array_like
  3017. An assignment of singleton cluster ids to flat cluster ids.
  3018. T2 : array_like
  3019. An assignment of singleton cluster ids to flat cluster ids.
  3020. Returns
  3021. -------
  3022. b : bool
  3023. Whether the flat cluster assignments `T1` and `T2` are
  3024. equivalent.
  3025. See Also
  3026. --------
  3027. linkage: for a description of what a linkage matrix is.
  3028. fcluster: for the creation of flat cluster assignments.
  3029. Examples
  3030. --------
  3031. >>> from scipy.cluster.hierarchy import fcluster, is_isomorphic
  3032. >>> from scipy.cluster.hierarchy import single, complete
  3033. >>> from scipy.spatial.distance import pdist
  3034. Two flat cluster assignments can be isomorphic if they represent the same
  3035. cluster assignment, with different labels.
  3036. For example, we can use the `scipy.cluster.hierarchy.single`: method
  3037. and flatten the output to four clusters:
  3038. >>> X = [[0, 0], [0, 1], [1, 0],
  3039. ... [0, 4], [0, 3], [1, 4],
  3040. ... [4, 0], [3, 0], [4, 1],
  3041. ... [4, 4], [3, 4], [4, 3]]
  3042. >>> Z = single(pdist(X))
  3043. >>> T = fcluster(Z, 1, criterion='distance')
  3044. >>> T
  3045. array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)
  3046. We can then do the same using the
  3047. `scipy.cluster.hierarchy.complete`: method:
  3048. >>> Z = complete(pdist(X))
  3049. >>> T_ = fcluster(Z, 1.5, criterion='distance')
  3050. >>> T_
  3051. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  3052. As we can see, in both cases we obtain four clusters and all the data
  3053. points are distributed in the same way - the only thing that changes
  3054. are the flat cluster labels (3 => 1, 4 =>2, 2 =>3 and 4 =>1), so both
  3055. cluster assignments are isomorphic:
  3056. >>> is_isomorphic(T, T_)
  3057. True
  3058. """
  3059. T1 = np.asarray(T1, order='c')
  3060. T2 = np.asarray(T2, order='c')
  3061. if type(T1) != np.ndarray:
  3062. raise TypeError('T1 must be a numpy array.')
  3063. if type(T2) != np.ndarray:
  3064. raise TypeError('T2 must be a numpy array.')
  3065. T1S = T1.shape
  3066. T2S = T2.shape
  3067. if len(T1S) != 1:
  3068. raise ValueError('T1 must be one-dimensional.')
  3069. if len(T2S) != 1:
  3070. raise ValueError('T2 must be one-dimensional.')
  3071. if T1S[0] != T2S[0]:
  3072. raise ValueError('T1 and T2 must have the same number of elements.')
  3073. n = T1S[0]
  3074. d1 = {}
  3075. d2 = {}
  3076. for i in xrange(0, n):
  3077. if T1[i] in d1:
  3078. if not T2[i] in d2:
  3079. return False
  3080. if d1[T1[i]] != T2[i] or d2[T2[i]] != T1[i]:
  3081. return False
  3082. elif T2[i] in d2:
  3083. return False
  3084. else:
  3085. d1[T1[i]] = T2[i]
  3086. d2[T2[i]] = T1[i]
  3087. return True
  3088. def maxdists(Z):
  3089. """
  3090. Return the maximum distance between any non-singleton cluster.
  3091. Parameters
  3092. ----------
  3093. Z : ndarray
  3094. The hierarchical clustering encoded as a matrix. See
  3095. ``linkage`` for more information.
  3096. Returns
  3097. -------
  3098. maxdists : ndarray
  3099. A ``(n-1)`` sized numpy array of doubles; ``MD[i]`` represents
  3100. the maximum distance between any cluster (including
  3101. singletons) below and including the node with index i. More
  3102. specifically, ``MD[i] = Z[Q(i)-n, 2].max()`` where ``Q(i)`` is the
  3103. set of all node indices below and including node i.
  3104. See Also
  3105. --------
  3106. linkage: for a description of what a linkage matrix is.
  3107. is_monotonic: for testing for monotonicity of a linkage matrix.
  3108. Examples
  3109. --------
  3110. >>> from scipy.cluster.hierarchy import median, maxdists
  3111. >>> from scipy.spatial.distance import pdist
  3112. Given a linkage matrix ``Z``, `scipy.cluster.hierarchy.maxdists`
  3113. computes for each new cluster generated (i.e. for each row of the linkage
  3114. matrix) what is the maximum distance between any two child clusters.
  3115. Due to the nature of hierarchical clustering, in many cases this is going
  3116. to be just the distance between the two child clusters that were merged
  3117. to form the current one - that is, Z[:,2].
  3118. However, for non-monotonic cluster assignments such as
  3119. `scipy.cluster.hierarchy.median` clustering this is not always the
  3120. case: There may be cluster formations were the distance between the two
  3121. clusters merged is smaller than the distance between their children.
  3122. We can see this in an example:
  3123. >>> X = [[0, 0], [0, 1], [1, 0],
  3124. ... [0, 4], [0, 3], [1, 4],
  3125. ... [4, 0], [3, 0], [4, 1],
  3126. ... [4, 4], [3, 4], [4, 3]]
  3127. >>> Z = median(pdist(X))
  3128. >>> Z
  3129. array([[ 0. , 1. , 1. , 2. ],
  3130. [ 3. , 4. , 1. , 2. ],
  3131. [ 9. , 10. , 1. , 2. ],
  3132. [ 6. , 7. , 1. , 2. ],
  3133. [ 2. , 12. , 1.11803399, 3. ],
  3134. [ 5. , 13. , 1.11803399, 3. ],
  3135. [ 8. , 15. , 1.11803399, 3. ],
  3136. [11. , 14. , 1.11803399, 3. ],
  3137. [18. , 19. , 3. , 6. ],
  3138. [16. , 17. , 3.5 , 6. ],
  3139. [20. , 21. , 3.25 , 12. ]])
  3140. >>> maxdists(Z)
  3141. array([1. , 1. , 1. , 1. , 1.11803399,
  3142. 1.11803399, 1.11803399, 1.11803399, 3. , 3.5 ,
  3143. 3.5 ])
  3144. Note that while the distance between the two clusters merged when creating the
  3145. last cluster is 3.25, there are two children (clusters 16 and 17) whose distance
  3146. is larger (3.5). Thus, `scipy.cluster.hierarchy.maxdists` returns 3.5 in
  3147. this case.
  3148. """
  3149. Z = np.asarray(Z, order='c', dtype=np.double)
  3150. is_valid_linkage(Z, throw=True, name='Z')
  3151. n = Z.shape[0] + 1
  3152. MD = np.zeros((n - 1,))
  3153. [Z] = _copy_arrays_if_base_present([Z])
  3154. _hierarchy.get_max_dist_for_each_cluster(Z, MD, int(n))
  3155. return MD
  3156. def maxinconsts(Z, R):
  3157. """
  3158. Return the maximum inconsistency coefficient for each
  3159. non-singleton cluster and its children.
  3160. Parameters
  3161. ----------
  3162. Z : ndarray
  3163. The hierarchical clustering encoded as a matrix. See
  3164. `linkage` for more information.
  3165. R : ndarray
  3166. The inconsistency matrix.
  3167. Returns
  3168. -------
  3169. MI : ndarray
  3170. A monotonic ``(n-1)``-sized numpy array of doubles.
  3171. See Also
  3172. --------
  3173. linkage: for a description of what a linkage matrix is.
  3174. inconsistent: for the creation of a inconsistency matrix.
  3175. Examples
  3176. --------
  3177. >>> from scipy.cluster.hierarchy import median, inconsistent, maxinconsts
  3178. >>> from scipy.spatial.distance import pdist
  3179. Given a data set ``X``, we can apply a clustering method to obtain a
  3180. linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
  3181. be also used to obtain the inconsistency matrix ``R`` associated to
  3182. this clustering process:
  3183. >>> X = [[0, 0], [0, 1], [1, 0],
  3184. ... [0, 4], [0, 3], [1, 4],
  3185. ... [4, 0], [3, 0], [4, 1],
  3186. ... [4, 4], [3, 4], [4, 3]]
  3187. >>> Z = median(pdist(X))
  3188. >>> R = inconsistent(Z)
  3189. >>> Z
  3190. array([[ 0. , 1. , 1. , 2. ],
  3191. [ 3. , 4. , 1. , 2. ],
  3192. [ 9. , 10. , 1. , 2. ],
  3193. [ 6. , 7. , 1. , 2. ],
  3194. [ 2. , 12. , 1.11803399, 3. ],
  3195. [ 5. , 13. , 1.11803399, 3. ],
  3196. [ 8. , 15. , 1.11803399, 3. ],
  3197. [11. , 14. , 1.11803399, 3. ],
  3198. [18. , 19. , 3. , 6. ],
  3199. [16. , 17. , 3.5 , 6. ],
  3200. [20. , 21. , 3.25 , 12. ]])
  3201. >>> R
  3202. array([[1. , 0. , 1. , 0. ],
  3203. [1. , 0. , 1. , 0. ],
  3204. [1. , 0. , 1. , 0. ],
  3205. [1. , 0. , 1. , 0. ],
  3206. [1.05901699, 0.08346263, 2. , 0.70710678],
  3207. [1.05901699, 0.08346263, 2. , 0.70710678],
  3208. [1.05901699, 0.08346263, 2. , 0.70710678],
  3209. [1.05901699, 0.08346263, 2. , 0.70710678],
  3210. [1.74535599, 1.08655358, 3. , 1.15470054],
  3211. [1.91202266, 1.37522872, 3. , 1.15470054],
  3212. [3.25 , 0.25 , 3. , 0. ]])
  3213. Here `scipy.cluster.hierarchy.maxinconsts` can be used to compute
  3214. the maximum value of the inconsistency statistic (the last column of
  3215. ``R``) for each non-singleton cluster and its children:
  3216. >>> maxinconsts(Z, R)
  3217. array([0. , 0. , 0. , 0. , 0.70710678,
  3218. 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
  3219. 1.15470054])
  3220. """
  3221. Z = np.asarray(Z, order='c')
  3222. R = np.asarray(R, order='c')
  3223. is_valid_linkage(Z, throw=True, name='Z')
  3224. is_valid_im(R, throw=True, name='R')
  3225. n = Z.shape[0] + 1
  3226. if Z.shape[0] != R.shape[0]:
  3227. raise ValueError("The inconsistency matrix and linkage matrix each "
  3228. "have a different number of rows.")
  3229. MI = np.zeros((n - 1,))
  3230. [Z, R] = _copy_arrays_if_base_present([Z, R])
  3231. _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MI, int(n), 3)
  3232. return MI
  3233. def maxRstat(Z, R, i):
  3234. """
  3235. Return the maximum statistic for each non-singleton cluster and its
  3236. children.
  3237. Parameters
  3238. ----------
  3239. Z : array_like
  3240. The hierarchical clustering encoded as a matrix. See `linkage` for more
  3241. information.
  3242. R : array_like
  3243. The inconsistency matrix.
  3244. i : int
  3245. The column of `R` to use as the statistic.
  3246. Returns
  3247. -------
  3248. MR : ndarray
  3249. Calculates the maximum statistic for the i'th column of the
  3250. inconsistency matrix `R` for each non-singleton cluster
  3251. node. ``MR[j]`` is the maximum over ``R[Q(j)-n, i]`` where
  3252. ``Q(j)`` the set of all node ids corresponding to nodes below
  3253. and including ``j``.
  3254. See Also
  3255. --------
  3256. linkage: for a description of what a linkage matrix is.
  3257. inconsistent: for the creation of a inconsistency matrix.
  3258. Examples
  3259. --------
  3260. >>> from scipy.cluster.hierarchy import median, inconsistent, maxRstat
  3261. >>> from scipy.spatial.distance import pdist
  3262. Given a data set ``X``, we can apply a clustering method to obtain a
  3263. linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
  3264. be also used to obtain the inconsistency matrix ``R`` associated to
  3265. this clustering process:
  3266. >>> X = [[0, 0], [0, 1], [1, 0],
  3267. ... [0, 4], [0, 3], [1, 4],
  3268. ... [4, 0], [3, 0], [4, 1],
  3269. ... [4, 4], [3, 4], [4, 3]]
  3270. >>> Z = median(pdist(X))
  3271. >>> R = inconsistent(Z)
  3272. >>> R
  3273. array([[1. , 0. , 1. , 0. ],
  3274. [1. , 0. , 1. , 0. ],
  3275. [1. , 0. , 1. , 0. ],
  3276. [1. , 0. , 1. , 0. ],
  3277. [1.05901699, 0.08346263, 2. , 0.70710678],
  3278. [1.05901699, 0.08346263, 2. , 0.70710678],
  3279. [1.05901699, 0.08346263, 2. , 0.70710678],
  3280. [1.05901699, 0.08346263, 2. , 0.70710678],
  3281. [1.74535599, 1.08655358, 3. , 1.15470054],
  3282. [1.91202266, 1.37522872, 3. , 1.15470054],
  3283. [3.25 , 0.25 , 3. , 0. ]])
  3284. `scipy.cluster.hierarchy.maxRstat` can be used to compute
  3285. the maximum value of each column of ``R``, for each non-singleton
  3286. cluster and its children:
  3287. >>> maxRstat(Z, R, 0)
  3288. array([1. , 1. , 1. , 1. , 1.05901699,
  3289. 1.05901699, 1.05901699, 1.05901699, 1.74535599, 1.91202266,
  3290. 3.25 ])
  3291. >>> maxRstat(Z, R, 1)
  3292. array([0. , 0. , 0. , 0. , 0.08346263,
  3293. 0.08346263, 0.08346263, 0.08346263, 1.08655358, 1.37522872,
  3294. 1.37522872])
  3295. >>> maxRstat(Z, R, 3)
  3296. array([0. , 0. , 0. , 0. , 0.70710678,
  3297. 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
  3298. 1.15470054])
  3299. """
  3300. Z = np.asarray(Z, order='c')
  3301. R = np.asarray(R, order='c')
  3302. is_valid_linkage(Z, throw=True, name='Z')
  3303. is_valid_im(R, throw=True, name='R')
  3304. if type(i) is not int:
  3305. raise TypeError('The third argument must be an integer.')
  3306. if i < 0 or i > 3:
  3307. raise ValueError('i must be an integer between 0 and 3 inclusive.')
  3308. if Z.shape[0] != R.shape[0]:
  3309. raise ValueError("The inconsistency matrix and linkage matrix each "
  3310. "have a different number of rows.")
  3311. n = Z.shape[0] + 1
  3312. MR = np.zeros((n - 1,))
  3313. [Z, R] = _copy_arrays_if_base_present([Z, R])
  3314. _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MR, int(n), i)
  3315. return MR
  3316. def leaders(Z, T):
  3317. """
  3318. Return the root nodes in a hierarchical clustering.
  3319. Returns the root nodes in a hierarchical clustering corresponding
  3320. to a cut defined by a flat cluster assignment vector ``T``. See
  3321. the ``fcluster`` function for more information on the format of ``T``.
  3322. For each flat cluster :math:`j` of the :math:`k` flat clusters
  3323. represented in the n-sized flat cluster assignment vector ``T``,
  3324. this function finds the lowest cluster node :math:`i` in the linkage
  3325. tree Z such that:
  3326. * leaf descendants belong only to flat cluster j
  3327. (i.e. ``T[p]==j`` for all :math:`p` in :math:`S(i)` where
  3328. :math:`S(i)` is the set of leaf ids of descendant leaf nodes
  3329. with cluster node :math:`i`)
  3330. * there does not exist a leaf that is not a descendant with
  3331. :math:`i` that also belongs to cluster :math:`j`
  3332. (i.e. ``T[q]!=j`` for all :math:`q` not in :math:`S(i)`). If
  3333. this condition is violated, ``T`` is not a valid cluster
  3334. assignment vector, and an exception will be thrown.
  3335. Parameters
  3336. ----------
  3337. Z : ndarray
  3338. The hierarchical clustering encoded as a matrix. See
  3339. `linkage` for more information.
  3340. T : ndarray
  3341. The flat cluster assignment vector.
  3342. Returns
  3343. -------
  3344. L : ndarray
  3345. The leader linkage node id's stored as a k-element 1-D array
  3346. where ``k`` is the number of flat clusters found in ``T``.
  3347. ``L[j]=i`` is the linkage cluster node id that is the
  3348. leader of flat cluster with id M[j]. If ``i < n``, ``i``
  3349. corresponds to an original observation, otherwise it
  3350. corresponds to a non-singleton cluster.
  3351. M : ndarray
  3352. The leader linkage node id's stored as a k-element 1-D array where
  3353. ``k`` is the number of flat clusters found in ``T``. This allows the
  3354. set of flat cluster ids to be any arbitrary set of ``k`` integers.
  3355. For example: if ``L[3]=2`` and ``M[3]=8``, the flat cluster with
  3356. id 8's leader is linkage node 2.
  3357. See Also
  3358. --------
  3359. fcluster: for the creation of flat cluster assignments.
  3360. Examples
  3361. --------
  3362. >>> from scipy.cluster.hierarchy import ward, fcluster, leaders
  3363. >>> from scipy.spatial.distance import pdist
  3364. Given a linkage matrix ``Z`` - obtained after apply a clustering method
  3365. to a dataset ``X`` - and a flat cluster assignment array ``T``:
  3366. >>> X = [[0, 0], [0, 1], [1, 0],
  3367. ... [0, 4], [0, 3], [1, 4],
  3368. ... [4, 0], [3, 0], [4, 1],
  3369. ... [4, 4], [3, 4], [4, 3]]
  3370. >>> Z = ward(pdist(X))
  3371. >>> Z
  3372. array([[ 0. , 1. , 1. , 2. ],
  3373. [ 3. , 4. , 1. , 2. ],
  3374. [ 6. , 7. , 1. , 2. ],
  3375. [ 9. , 10. , 1. , 2. ],
  3376. [ 2. , 12. , 1.29099445, 3. ],
  3377. [ 5. , 13. , 1.29099445, 3. ],
  3378. [ 8. , 14. , 1.29099445, 3. ],
  3379. [11. , 15. , 1.29099445, 3. ],
  3380. [16. , 17. , 5.77350269, 6. ],
  3381. [18. , 19. , 5.77350269, 6. ],
  3382. [20. , 21. , 8.16496581, 12. ]])
  3383. >>> T = fcluster(Z, 3, criterion='distance')
  3384. >>> T
  3385. array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
  3386. `scipy.cluster.hierarchy.leaders` returns the indexes of the nodes
  3387. in the dendrogram that are the leaders of each flat cluster:
  3388. >>> L, M = leaders(Z, T)
  3389. >>> L
  3390. array([16, 17, 18, 19], dtype=int32)
  3391. (remember that indexes 0-11 point to the 12 data points in ``X``
  3392. whereas indexes 12-22 point to the 11 rows of ``Z``)
  3393. `scipy.cluster.hierarchy.leaders` also returns the indexes of
  3394. the flat clusters in ``T``:
  3395. >>> M
  3396. array([1, 2, 3, 4], dtype=int32)
  3397. """
  3398. Z = np.asarray(Z, order='c')
  3399. T = np.asarray(T, order='c')
  3400. if type(T) != np.ndarray or T.dtype != 'i':
  3401. raise TypeError('T must be a one-dimensional numpy array of integers.')
  3402. is_valid_linkage(Z, throw=True, name='Z')
  3403. if len(T) != Z.shape[0] + 1:
  3404. raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.')
  3405. Cl = np.unique(T)
  3406. kk = len(Cl)
  3407. L = np.zeros((kk,), dtype='i')
  3408. M = np.zeros((kk,), dtype='i')
  3409. n = Z.shape[0] + 1
  3410. [Z, T] = _copy_arrays_if_base_present([Z, T])
  3411. s = _hierarchy.leaders(Z, T, L, M, int(kk), int(n))
  3412. if s >= 0:
  3413. raise ValueError(('T is not a valid assignment vector. Error found '
  3414. 'when examining linkage node %d (< 2n-1).') % s)
  3415. return (L, M)