"""
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption
"""
from __future__ import division

from textwrap import dedent
from warnings import catch_warnings, simplefilter, warn

import numpy as np

from pandas._libs import algos, hashtable as htable, lib
from pandas._libs.tslib import iNaT
from pandas.util._decorators import Appender, Substitution, deprecate_kwarg

from pandas.core.dtypes.cast import (
    construct_1d_object_array_from_listlike, maybe_promote)
from pandas.core.dtypes.common import (
    ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
    ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
    is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
    is_datetimelike, is_extension_array_dtype, is_float_dtype,
    is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
    is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
    is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
    needs_i8_conversion)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, na_value_for_dtype

from pandas.core import common as com

_shared_docs = {}


# --------------- #
# dtype access    #
# --------------- #

def _ensure_data(values, dtype=None):
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint64 (TODO this should be uint8)
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : array-like
    dtype : pandas_dtype, optional
        coerce to this dtype

    Returns
    -------
    (ndarray, pandas_dtype, algo dtype as a string)
    """

    # we check some simple dtypes first
    try:
        if is_object_dtype(dtype):
            return ensure_object(np.asarray(values)), 'object', 'object'
        if is_bool_dtype(values) or is_bool_dtype(dtype):
            # we are actually coercing to uint64
            # until our algos support uint8 directly (see TODO)
            return np.asarray(values).astype('uint64'), 'bool', 'uint64'
        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
            return ensure_int64(values), 'int64', 'int64'
        elif (is_unsigned_integer_dtype(values) or
              is_unsigned_integer_dtype(dtype)):
            return ensure_uint64(values), 'uint64', 'uint64'
        elif is_float_dtype(values) or is_float_dtype(dtype):
            return ensure_float64(values), 'float64', 'float64'
        elif is_object_dtype(values) and dtype is None:
            return ensure_object(np.asarray(values)), 'object', 'object'
        elif is_complex_dtype(values) or is_complex_dtype(dtype):

            # ignore the fact that we are casting to float
            # which discards complex parts
            with catch_warnings():
                simplefilter("ignore", np.ComplexWarning)
                values = ensure_float64(values)
            return values, 'float64', 'float64'

    except (TypeError, ValueError, OverflowError):
        # if we are trying to coerce to a dtype
        # and it is incompatible, this will fall through to here
        return ensure_object(values), 'object', 'object'

    # datetimelike
    if (needs_i8_conversion(values) or
            is_period_dtype(dtype) or
            is_datetime64_any_dtype(dtype) or
            is_timedelta64_dtype(dtype)):
        if is_period_dtype(values) or is_period_dtype(dtype):
            from pandas import PeriodIndex
            values = PeriodIndex(values)
            dtype = values.dtype
        elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex
            values = TimedeltaIndex(values)
            dtype = values.dtype
        else:
            # Datetime
            from pandas import DatetimeIndex
            values = DatetimeIndex(values)
            dtype = values.dtype

        return values.asi8, dtype, 'int64'

    elif (is_categorical_dtype(values) and
          (is_categorical_dtype(dtype) or dtype is None)):
        values = getattr(values, 'values', values)
        values = values.codes
        dtype = 'category'

        # we are actually coercing to int64
        # until our algos support int* directly (not all do)
        values = ensure_int64(values)

        return values, dtype, 'int64'

    # we have failed, return object
    values = np.asarray(values, dtype=np.object)
    return ensure_object(values), 'object', 'object'
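

# Illustrative sketch of the coercions performed by _ensure_data (expected
# results shown as doctest-style comments, not executed here):
#
# >>> _ensure_data(np.array([True, False]))
# (array([1, 0], dtype=uint64), 'bool', 'uint64')
# >>> _ensure_data(np.array([1, 2], dtype='int32'))
# (array([1, 2]), 'int64', 'int64')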


def _reconstruct_data(values, dtype, original):
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : ndarray
    dtype : pandas_dtype
    original : ndarray-like

    Returns
    -------
    Index for extension types, otherwise ndarray cast to dtype
    """
    from pandas import Index
    if is_extension_array_dtype(dtype):
        values = dtype.construct_array_type()._from_sequence(values)
    elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
        values = Index(original)._shallow_copy(values, name=None)
    elif is_bool_dtype(dtype):
        values = values.astype(dtype)

        # we only support object dtype bool Index
        if isinstance(original, Index):
            values = values.astype(object)
    elif dtype is not None:
        values = values.astype(dtype)

    return values


def _ensure_arraylike(values):
    """
    ensure that we are arraylike if not already
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ['mixed', 'string', 'unicode']:
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


_hashtables = {
    'float64': (htable.Float64HashTable, htable.Float64Vector),
    'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
    'int64': (htable.Int64HashTable, htable.Int64Vector),
    'string': (htable.StringHashTable, htable.ObjectVector),
    'object': (htable.PyObjectHashTable, htable.ObjectVector)
}


def _get_hashtable_algo(values):
    """
    Parameters
    ----------
    values : arraylike

    Returns
    -------
    tuple of (hashtable class,
              vector class,
              values,
              dtype,
              ndtype)
    """
    values, dtype, ndtype = _ensure_data(values)

    if ndtype == 'object':

        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ['string']:
            ndtype = 'string'
        else:
            ndtype = 'object'

    htable, table = _hashtables[ndtype]
    return (htable, table, values, dtype, ndtype)


def _get_data_algo(values, func_map):

    if is_categorical_dtype(values):
        values = values._values_for_rank()

    values, dtype, ndtype = _ensure_data(values)
    if ndtype == 'object':

        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ['string']:
            ndtype = 'string'

    f = func_map.get(ndtype, func_map['object'])

    return f, values


# --------------- #
# top-level algos #
# --------------- #


def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com.asarray_tuplesafe(values)
    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
    to_match, _, _ = _ensure_data(to_match, dtype)
    table = htable(min(len(to_match), 1000000))
    table.map_locations(values)
    result = table.lookup(to_match)

    if na_sentinel != -1:

        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas import Series
        result = Series(result.ravel()).replace(-1, na_sentinel)
        result = result.values.reshape(result.shape)

    return result
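

# Illustrative sketch for match (expected output, not executed here): the
# result holds, for each element of `to_match`, its position in `values`,
# with the sentinel marking "not found".
#
# >>> match(np.array(['b', 'x', 'a']), np.array(['a', 'b', 'c']))
# array([ 1, -1,  0])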


def unique(values):
    """
    Hash table-based unique. Uniques are returned in order
    of appearance. This does NOT sort.

    Significantly faster than numpy.unique. Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    unique values.
      - If the input is an Index, the return is an Index
      - If the input is a Categorical dtype, the return is a Categorical
      - If the input is a Series/ndarray, the return will be an ndarray

    See Also
    --------
    pandas.Index.unique
    pandas.Series.unique

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
    ...                      pd.Timestamp('20160101')]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                      pd.Timestamp('20160101', tz='US/Eastern')]))
    array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
          dtype=object)

    >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                     pd.Timestamp('20160101', tz='US/Eastern')]))
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
    ...           dtype='datetime64[ns, US/Eastern]', freq=None)

    >>> pd.unique(list('baabc'))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'),
    ...                                    ordered=True)))
    [b, a, c]
    Categories (3, object): [a < b < c]

    An array of tuples

    >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """

    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

    table = htable(len(values))
    uniques = table.unique(values)
    uniques = _reconstruct_data(uniques, dtype, original)
    return uniques


unique1d = unique


def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    boolean array same length as comps
    """

    if not is_list_like(comps):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a [{comps_type}]"
                        .format(comps_type=type(comps).__name__))
    if not is_list_like(values):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a [{values_type}]"
                        .format(values_type=type(values).__name__))

    if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
        values = construct_1d_object_array_from_listlike(list(values))

    if is_categorical_dtype(comps):
        # TODO(extension)
        # handle categoricals
        return comps._values.isin(values)

    comps = com.values_from_object(comps)

    comps, dtype, _ = _ensure_data(comps)
    values, _, _ = _ensure_data(values, dtype=dtype)

    # faster for larger cases to use np.in1d
    f = lambda x, y: htable.ismember_object(x, values)

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    if len(comps) > 1000000 and not is_object_dtype(comps):
        f = lambda x, y: np.in1d(x, y)
    elif is_integer_dtype(comps):
        try:
            values = values.astype('int64', copy=False)
            comps = comps.astype('int64', copy=False)
            f = lambda x, y: htable.ismember_int64(x, y)
        except (TypeError, ValueError, OverflowError):
            values = values.astype(object)
            comps = comps.astype(object)
    elif is_float_dtype(comps):
        try:
            values = values.astype('float64', copy=False)
            comps = comps.astype('float64', copy=False)
            f = lambda x, y: htable.ismember_float64(x, y)
        except (TypeError, ValueError):
            values = values.astype(object)
            comps = comps.astype(object)

    return f(comps, values)
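

# Illustrative sketch for isin (expected output, not executed here): both
# inputs are coerced to a common dtype before the membership check.
#
# >>> isin(np.array([1, 2, 3]), [2, 4])
# array([False,  True, False])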


def _factorize_array(values, na_sentinel=-1, size_hint=None,
                     na_value=None):
    """Factorize an array-like to labels and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).

    Returns
    -------
    labels, uniques : ndarray
    """
    (hash_klass, _), values = _get_data_algo(values, _hashtables)

    table = hash_klass(size_hint or len(values))
    uniques, labels = table.factorize(values, na_sentinel=na_sentinel,
                                      na_value=na_value)

    labels = ensure_platform_int(labels)
    return labels, uniques
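

# Illustrative sketch for _factorize_array (expected output, not executed
# here): labels index into uniques, with -1 for missing values.
#
# >>> _factorize_array(np.array(['b', 'b', 'a', None], dtype=object))
# (array([ 0,  0,  1, -1]), array(['b', 'a'], dtype=object))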


_shared_docs['factorize'] = """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    %(values)s%(sort)s%(order)s
    na_sentinel : int, default -1
        Value to mark "not found".
    %(size_hint)s\

    Returns
    -------
    labels : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(labels)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note ::

           Even if there's a missing value in `values`, `uniques` will
           *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique value in an array.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> labels
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `labels` will be
    shuffled so that the relationship is maintained.

    >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> labels
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    Missing values are indicated in `labels` with `na_sentinel`
    (``-1`` by default). Note that missing values are never
    included in `uniques`.

    >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> labels
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> labels, uniques = pd.factorize(cat)
    >>> labels
    array([0, 0, 1])
    >>> uniques
    [a, c]
    Categories (3, object): [a, b, c]

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> labels, uniques = pd.factorize(cat)
    >>> labels
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')
    """


@Substitution(
    values=dedent("""\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """),
    order=dedent("""\
    order
        .. deprecated:: 0.23.0

           This parameter has no effect and is deprecated.
    """),
    sort=dedent("""\
    sort : bool, default False
        Sort `uniques` and shuffle `labels` to maintain the
        relationship.
    """),
    size_hint=dedent("""\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """),
)
@Appender(_shared_docs['factorize'])
@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing labels and uniques
    # 3.) Maybe boxing the output in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and
    # boxing should happen here.

    values = _ensure_arraylike(values)
    original = values

    if is_extension_array_dtype(values):
        values = getattr(values, '_values', values)
        labels, uniques = values.factorize(na_sentinel=na_sentinel)
        dtype = original.dtype
    else:
        values, dtype, _ = _ensure_data(values)

        if (is_datetime64_any_dtype(original) or
                is_timedelta64_dtype(original) or
                is_period_dtype(original)):
            na_value = na_value_for_dtype(original.dtype)
        else:
            na_value = None

        labels, uniques = _factorize_array(values,
                                           na_sentinel=na_sentinel,
                                           size_hint=size_hint,
                                           na_value=na_value)

    if sort and len(uniques) > 0:
        from pandas.core.sorting import safe_sort
        if na_sentinel == -1:
            # GH-25409 take_1d only works for na_sentinels of -1
            try:
                order = uniques.argsort()
                order2 = order.argsort()
                labels = take_1d(order2, labels, fill_value=na_sentinel)
                uniques = uniques.take(order)
            except TypeError:
                # Mixed types, where uniques.argsort fails.
                uniques, labels = safe_sort(uniques, labels,
                                            na_sentinel=na_sentinel,
                                            assume_unique=True)
        else:
            uniques, labels = safe_sort(uniques, labels,
                                        na_sentinel=na_sentinel,
                                        assume_unique=True)

    uniques = _reconstruct_data(uniques, dtype, original)

    # return original tenor
    if isinstance(original, ABCIndexClass):
        uniques = original._shallow_copy(uniques, name=None)
    elif isinstance(original, ABCSeries):
        from pandas import Index
        uniques = Index(uniques)

    return labels, uniques
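

# A minimal sketch (not executed here) of the relabeling trick in the sort
# branch above: `order` sorts `uniques`, and `order.argsort()` maps each old
# label to the position of its unique in the sorted order; take_1d then
# applies that mapping while also propagating the -1 sentinel.
#
# >>> uniques = np.array(['b', 'a', 'c'], dtype=object)
# >>> labels = np.array([0, 0, 1, 2, 0])
# >>> order = uniques.argsort()   # array([1, 0, 2])
# >>> order2 = order.argsort()    # array([1, 0, 2]): old label -> new label
# >>> order2.take(labels)         # same as take_1d when no -1 is present
# array([1, 1, 0, 2, 1])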


def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        a convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series, Index
    name = getattr(values, 'name', None)

    if bins is not None:
        try:
            from pandas.core.reshape.tile import cut
            values = Series(values)
            ii = cut(values, bins, include_lowest=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")

        # count, remove nulls (from the index), and use the bins
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype('interval')
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result.values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values) or is_sparse(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result.values

        else:
            keys, counts = _value_counts_arraylike(values, dropna)

            if not isinstance(keys, Index):
                keys = Index(keys)
            result = Series(counts, index=keys, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result
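

# Illustrative sketch for value_counts (expected output, not executed here):
#
# >>> value_counts(np.array([1, 1, 2]))
# 1    2
# 2    1
# dtype: int64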


def _value_counts_arraylike(values, dropna):
    """
    Parameters
    ----------
    values : arraylike
    dropna : boolean

    Returns
    -------
    (uniques, counts)
    """
    values = _ensure_arraylike(values)
    original = values
    values, dtype, ndtype = _ensure_data(values)

    if needs_i8_conversion(dtype):
        # i8
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

    else:
        # ndarray like

        # TODO: handle uint8
        f = getattr(htable, "value_count_{dtype}".format(dtype=ndtype))
        keys, counts = f(values, dropna)

        mask = isna(values)
        if not dropna and mask.any():
            if not isna(keys).any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

    keys = _reconstruct_data(keys, original.dtype, original)

    return keys, counts


def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    values, dtype, ndtype = _ensure_data(values)
    f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
    return f(values, keep=keep)
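

# Illustrative sketch for duplicated (expected output, not executed here):
#
# >>> duplicated(np.array([1, 2, 1, 1]))
# array([False, False,  True,  True])
# >>> duplicated(np.array([1, 2, 1, 1]), keep='last')
# array([ True, False,  True, False])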


def mode(values, dropna=True):
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    dropna : boolean, default True
        Don't consider counts of NaN/NaT.

        .. versionadded:: 0.24.0

    Returns
    -------
    mode : Series
    """
    from pandas import Series

    values = _ensure_arraylike(values)
    original = values

    # categorical is a fast-path
    if is_categorical_dtype(values):
        if isinstance(values, Series):
            return Series(values.values.mode(dropna=dropna), name=values.name)
        return values.mode(dropna=dropna)

    if dropna and is_datetimelike(values):
        mask = values.isnull()
        values = values[~mask]

    values, dtype, ndtype = _ensure_data(values)

    f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
    result = f(values, dropna=dropna)
    try:
        result = np.sort(result)
    except TypeError as e:
        warn("Unable to sort modes: {error}".format(error=e))

    result = _reconstruct_data(result, original.dtype, original)
    return Series(result)
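

# Illustrative sketch for mode (expected output, not executed here): all
# tied modes are returned, sorted when possible.
#
# >>> mode(np.array([1, 2, 2, 3, 3]))
# 0    2
# 1    3
# dtype: int64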


def rank(values, axis=0, method='average', na_option='keep',
         ascending=True, pct=False):
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : array-like
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : boolean, default True
        Whether or not the elements should be ranked in ascending order.
    pct : boolean, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    if values.ndim == 1:
        f, values = _get_data_algo(values, _rank1d_functions)
        ranks = f(values, ties_method=method, ascending=ascending,
                  na_option=na_option, pct=pct)
    elif values.ndim == 2:
        f, values = _get_data_algo(values, _rank2d_functions)
        ranks = f(values, axis=axis, ties_method=method,
                  ascending=ascending, na_option=na_option, pct=pct)
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks
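

# Illustrative sketch for rank (expected output, not executed here): with
# method='average', tied values share the mean of their ranks.
#
# >>> rank(np.array([3., 1., 3.]))
# array([2.5, 1. , 2.5])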


def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : array addend.
    b : array or scalar addend.
    arr_mask : boolean array or None
        array indicating which elements to exclude from checking
    b_mask : boolean array or boolean or None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        not_nan = np.logical_not(b2_mask)
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
    else:
        to_raise = (((np.iinfo(np.int64).max -
                      b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
                    ((np.iinfo(np.int64).min -
                      b2[mask2] > arr[mask2]) & not_nan[mask2]).any())

    if to_raise:
        raise OverflowError("Overflow in int64 addition")
    return arr + b
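

# Illustrative sketch for checked_add_with_arr (expected behavior, not
# executed here): a sum that would wrap around int64 raises instead of
# overflowing silently.
#
# >>> checked_add_with_arr(np.array([1, 2], dtype='int64'), 1)
# array([2, 3])
# >>> checked_add_with_arr(np.array([np.iinfo(np.int64).max]), 1)
# Traceback (most recent call last):
#     ...
# OverflowError: Overflow in int64 addition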


_rank1d_functions = {
    'float64': algos.rank_1d_float64,
    'int64': algos.rank_1d_int64,
    'uint64': algos.rank_1d_uint64,
    'object': algos.rank_1d_object
}

_rank2d_functions = {
    'float64': algos.rank_2d_float64,
    'int64': algos.rank_2d_int64,
    'uint64': algos.rank_2d_uint64,
    'object': algos.rank_2d_object
}


def quantile(x, q, interpolation_method='fraction'):
    """
    Compute sample quantile or quantiles of the input array. For example, q=0.5
    computes the median.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score.
    q : scalar or array
        Percentile at which to extract score.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i) * fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(100)
    >>> stats.scoreatpercentile(a, 50)
    49.5
    """
    x = np.asarray(x)
    mask = isna(x)

    x = x[~mask]

    values = np.sort(x)

    def _interpolate(a, b, fraction):
        """Returns the point at the given fraction between a and b, where
        'fraction' must be between 0 and 1.
        """
        return a + (b - a) * fraction

    def _get_score(at):
        if len(values) == 0:
            return np.nan

        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            score = values[int(idx)]
        else:
            if interpolation_method == 'fraction':
                score = _interpolate(values[int(idx)], values[int(idx) + 1],
                                     idx % 1)
            elif interpolation_method == 'lower':
                # cast to int: np.floor returns a float, which is not a
                # valid ndarray index
                score = values[int(np.floor(idx))]
            elif interpolation_method == 'higher':
                score = values[int(np.ceil(idx))]
            else:
                raise ValueError("interpolation_method can only be "
                                 "'fraction', 'lower' or 'higher'")

        return score

    if is_scalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        return algos.arrmap_float64(q, _get_score)
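

# Illustrative sketch for quantile (expected output, not executed here):
# with the default 'fraction' method, q=0.5 interpolates between the two
# middle values.
#
# >>> quantile(np.array([1., 2., 3., 4.]), 0.5)
# 2.5
# >>> quantile(np.array([1., 2., 3., 4.]), [0, 1])
# array([1., 4.])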


# --------------- #
# select n        #
# --------------- #

class SelectN(object):

    def __init__(self, obj, n, keep):
        self.obj = obj
        self.n = n
        self.keep = keep

        if self.keep not in ('first', 'last', 'all'):
            raise ValueError('keep must be either "first", "last" or "all"')

    def nlargest(self):
        return self.compute('nlargest')

    def nsmallest(self):
        return self.compute('nsmallest')

    @staticmethod
    def is_valid_dtype_n_method(dtype):
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or
                needs_i8_conversion(dtype))


class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """

    def compute(self, method):

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError("Cannot use method '{method}' with "
                            "dtype {dtype}".format(method=method,
                                                   dtype=dtype))

        if n <= 0:
            return self.obj[[]]

        dropped = self.obj.dropna()

        # slow method
        if n >= len(self.obj):
            reverse_it = (self.keep == 'last' or method == 'nlargest')
            ascending = method == 'nsmallest'
            slc = np.s_[::-1] if reverse_it else np.s_[:]
            return dropped[slc].sort_values(ascending=ascending).head(n)

        # fast method
        arr, pandas_dtype, _ = _ensure_data(dropped.values)
        if method == 'nlargest':
            arr = -arr
            if is_integer_dtype(pandas_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

        if self.keep == 'last':
            arr = arr[::-1]

        narr = len(arr)
        n = min(n, narr)

        kth_val = algos.kth_smallest(arr.copy(), n - 1)
        ns, = np.nonzero(arr <= kth_val)
        inds = ns[arr[ns].argsort(kind='mergesort')]

        if self.keep != 'all':
            inds = inds[:n]

        if self.keep == 'last':
            # reverse indices
            inds = narr - 1 - inds

        return dropped.iloc[inds]
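

# A minimal sketch (not executed here) of the partial-selection idea in the
# fast path above, using np.partition as a stand-in for algos.kth_smallest:
# find the n-th smallest value, keep everything at or below it, and
# mergesort only that subset.
#
# >>> arr = np.array([5, 1, 4, 2, 3])
# >>> kth_val = np.partition(arr.copy(), 2)[2]  # 3rd smallest value
# >>> ns, = np.nonzero(arr <= kth_val)
# >>> ns[arr[ns].argsort(kind='mergesort')][:3]  # positions of 3 smallest
# array([1, 3, 4])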


class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj, n, keep, columns):
        super(SelectNFrame, self).__init__(obj, n, keep)
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]
        columns = list(columns)
        self.columns = columns

    def compute(self, method):

        from pandas import Int64Index
        n = self.n
        frame = self.obj
        columns = self.columns

        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError((
                    "Column {column!r} has dtype {dtype}, cannot use method "
                    "{method!r} with this dtype"
                ).format(column=column, dtype=dtype, method=method))

        def get_indexer(current_indexer, other_indexer):
            """Helper function to concat `current_indexer` and
            `other_indexer` depending on `method`
            """
            if method == 'nsmallest':
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):

            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            values = getattr(series, method)(
                cur_n,
                keep=self.keep if is_last_column else 'all')

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == 'nsmallest'

        return frame.sort_values(
            columns,
            ascending=ascending,
            kind='mergesort')
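

# Illustrative sketch for SelectNFrame (expected output, not executed here,
# assuming pandas is imported as pd): ties in the first column are resolved
# by the next column via the keep='all' recursion above.
#
# >>> df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 1, 0]})
# >>> SelectNFrame(df, n=2, keep='first', columns=['a', 'b']).nsmallest()
#    a  b
# 1  1  1
# 0  1  3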


# ---- #
# take #
# ---- #

def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        if arr_dtype is not None:
            arr = arr.view(arr_dtype)
        if out_dtype is not None:
            out = out.view(out_dtype)
        if fill_wrap is not None:
            fill_value = fill_wrap(fill_value)
        f(arr, indexer, out, fill_value=fill_value)
    return wrapper


def _convert_wrapper(f, conv_dtype):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        arr = arr.astype(conv_dtype)
        f(arr, indexer, out, fill_value=fill_value)
    return wrapper


def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
    # this is not ideal, performance-wise, but it's better than raising
    # an exception (best to optimize in Cython to avoid getting here)
    row_idx, col_idx = indexer
    if mask_info is not None:
        (row_mask, col_mask), (row_needs, col_needs) = mask_info
    else:
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
    if fill_value is not None:
        if row_needs:
            out[row_mask, :] = fill_value
        if col_needs:
            out[:, col_mask] = fill_value
    for i in range(len(row_idx)):
        u_ = row_idx[i]
        for j in range(len(col_idx)):
            v = col_idx[j]
            out[i, j] = arr[u_, v]


def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value


_take_1d_dict = {
    ('int8', 'int8'): algos.take_1d_int8_int8,
    ('int8', 'int32'): algos.take_1d_int8_int32,
    ('int8', 'int64'): algos.take_1d_int8_int64,
    ('int8', 'float64'): algos.take_1d_int8_float64,
    ('int16', 'int16'): algos.take_1d_int16_int16,
    ('int16', 'int32'): algos.take_1d_int16_int32,
    ('int16', 'int64'): algos.take_1d_int16_int64,
    ('int16', 'float64'): algos.take_1d_int16_float64,
    ('int32', 'int32'): algos.take_1d_int32_int32,
    ('int32', 'int64'): algos.take_1d_int32_int64,
    ('int32', 'float64'): algos.take_1d_int32_float64,
    ('int64', 'int64'): algos.take_1d_int64_int64,
    ('int64', 'float64'): algos.take_1d_int64_float64,
    ('float32', 'float32'): algos.take_1d_float32_float32,
    ('float32', 'float64'): algos.take_1d_float32_float64,
    ('float64', 'float64'): algos.take_1d_float64_float64,
    ('object', 'object'): algos.take_1d_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8,
                                      None),
    ('datetime64[ns]', 'datetime64[ns]'):
        _view_wrapper(algos.take_1d_int64_int64, np.int64, np.int64,
                      fill_wrap=np.int64)
}

_take_2d_axis0_dict = {
    ('int8', 'int8'): algos.take_2d_axis0_int8_int8,
    ('int8', 'int32'): algos.take_2d_axis0_int8_int32,
    ('int8', 'int64'): algos.take_2d_axis0_int8_int64,
    ('int8', 'float64'): algos.take_2d_axis0_int8_float64,
    ('int16', 'int16'): algos.take_2d_axis0_int16_int16,
    ('int16', 'int32'): algos.take_2d_axis0_int16_int32,
    ('int16', 'int64'): algos.take_2d_axis0_int16_int64,
    ('int16', 'float64'): algos.take_2d_axis0_int16_float64,
    ('int32', 'int32'): algos.take_2d_axis0_int32_int32,
    ('int32', 'int64'): algos.take_2d_axis0_int32_int64,
    ('int32', 'float64'): algos.take_2d_axis0_int32_float64,
    ('int64', 'int64'): algos.take_2d_axis0_int64_int64,
    ('int64', 'float64'): algos.take_2d_axis0_int64_float64,
    ('float32', 'float32'): algos.take_2d_axis0_float32_float32,
    ('float32', 'float64'): algos.take_2d_axis0_float32_float64,
    ('float64', 'float64'): algos.take_2d_axis0_float64_float64,
    ('object', 'object'): algos.take_2d_axis0_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'):
        _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64,
                      fill_wrap=np.int64)
}

_take_2d_axis1_dict = {
    ('int8', 'int8'): algos.take_2d_axis1_int8_int8,
    ('int8', 'int32'): algos.take_2d_axis1_int8_int32,
    ('int8', 'int64'): algos.take_2d_axis1_int8_int64,
    ('int8', 'float64'): algos.take_2d_axis1_int8_float64,
    ('int16', 'int16'): algos.take_2d_axis1_int16_int16,
    ('int16', 'int32'): algos.take_2d_axis1_int16_int32,
    ('int16', 'int64'): algos.take_2d_axis1_int16_int64,
    ('int16', 'float64'): algos.take_2d_axis1_int16_float64,
    ('int32', 'int32'): algos.take_2d_axis1_int32_int32,
    ('int32', 'int64'): algos.take_2d_axis1_int32_int64,
    ('int32', 'float64'): algos.take_2d_axis1_int32_float64,
    ('int64', 'int64'): algos.take_2d_axis1_int64_int64,
    ('int64', 'float64'): algos.take_2d_axis1_int64_float64,
    ('float32', 'float32'): algos.take_2d_axis1_float32_float32,
    ('float32', 'float64'): algos.take_2d_axis1_float32_float64,
    ('float64', 'float64'): algos.take_2d_axis1_float64_float64,
    ('object', 'object'): algos.take_2d_axis1_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'):
        _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64,
                      fill_wrap=np.int64)
}

_take_2d_multi_dict = {
    ('int8', 'int8'): algos.take_2d_multi_int8_int8,
    ('int8', 'int32'): algos.take_2d_multi_int8_int32,
    ('int8', 'int64'): algos.take_2d_multi_int8_int64,
    ('int8', 'float64'): algos.take_2d_multi_int8_float64,
    ('int16', 'int16'): algos.take_2d_multi_int16_int16,
    ('int16', 'int32'): algos.take_2d_multi_int16_int32,
    ('int16', 'int64'): algos.take_2d_multi_int16_int64,
    ('int16', 'float64'): algos.take_2d_multi_int16_float64,
    ('int32', 'int32'): algos.take_2d_multi_int32_int32,
    ('int32', 'int64'): algos.take_2d_multi_int32_int64,
    ('int32', 'float64'): algos.take_2d_multi_int32_float64,
    ('int64', 'int64'): algos.take_2d_multi_int64_int64,
    ('int64', 'float64'): algos.take_2d_multi_int64_float64,
    ('float32', 'float32'): algos.take_2d_multi_float32_float32,
    ('float32', 'float64'): algos.take_2d_multi_float32_float64,
    ('float64', 'float64'): algos.take_2d_multi_float64_float64,
    ('object', 'object'): algos.take_2d_multi_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'):
        _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64,
                      fill_wrap=np.int64)
}


def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
    if ndim <= 2:
        tup = (arr_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            return func

        tup = (out_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            func = _convert_wrapper(func, out_dtype)
            return func

    def func(arr, indexer, out, fill_value=np.nan):
        indexer = ensure_int64(indexer)
        _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value,
                        mask_info=mask_info)

    return func


def take(arr, indices, axis=0, allow_fill=False, fill_value=None):
    """
    Take elements from an array.

    .. versionadded:: 0.23.0

    Parameters
    ----------
    arr : sequence
        Non array-likes (sequences without a dtype) are coerced
        to an ndarray.
    indices : sequence of integers
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any
          other negative values raise a ``ValueError``.
    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take

    Examples
    --------
    >>> from pandas.api.extensions import take

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...      fill_value=-10)
    array([ 10,  10, -10])
    """
    from pandas.core.indexing import validate_indices

    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, len(arr))
        result = take_1d(arr, indices, axis=axis, allow_fill=True,
                         fill_value=fill_value)
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result
def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
            allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Parameters
    ----------
    arr : array-like
        Input array.
    indexer : ndarray
        1-D array of indices to take; subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        _maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    mask_info : tuple of (ndarray, boolean)
        If provided, value should correspond to:
            (indexer == -1, (indexer == -1).any())
        If not provided, it will be computed internally if necessary
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    subarray : array-like
        May be the same type as the input, or cast to an ndarray.
    """
    # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs
    # dispatch to internal type takes
    if is_extension_array_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
    elif is_datetime64tz_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
    elif is_interval_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if is_sparse(arr):
        arr = arr.get_values()
    elif isinstance(arr, (ABCIndexClass, ABCSeries)):
        arr = arr.values

    arr = np.asarray(arr)

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_int64(indexer, copy=False)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is not None:
                    mask, needs_masking = mask_info
                else:
                    mask = indexer == -1
                    needs_masking = mask.any()
                    mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError('Incompatible type for fill_value')
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape = list(arr.shape)
        out_shape[axis] = len(indexer)
        out_shape = tuple(out_shape)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order='F')
        else:
            out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
                                 mask_info=mask_info)
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out


take_1d = take_nd
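
# Illustrative use of take_nd (a sketch, not part of the original module):
# an int64 input whose indexer contains -1 is promoted to float64 so the
# default np.nan fill_value can be represented.
#
#     take_nd(np.array([1, 2, 3], dtype=np.int64), np.array([0, -1, 2]))
#     # -> array([ 1., nan,  3.])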
def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,
                  allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass
    """
    if indexer is None or (indexer[0] is None and indexer[1] is None):
        row_idx = np.arange(arr.shape[0], dtype=np.int64)
        col_idx = np.arange(arr.shape[1], dtype=np.int64)
        indexer = row_idx, col_idx
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        row_idx, col_idx = indexer
        if row_idx is None:
            row_idx = np.arange(arr.shape[0], dtype=np.int64)
        else:
            row_idx = ensure_int64(row_idx)
        if col_idx is None:
            col_idx = np.arange(arr.shape[1], dtype=np.int64)
        else:
            col_idx = ensure_int64(col_idx)
        indexer = row_idx, col_idx

        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is not None:
                    (row_mask, col_mask), (row_needs, col_needs) = mask_info
                else:
                    row_mask = row_idx == -1
                    col_mask = col_idx == -1
                    row_needs = row_mask.any()
                    col_needs = col_mask.any()
                    mask_info = (row_mask, col_mask), (row_needs, col_needs)
                if row_needs or col_needs:
                    if out is not None and out.dtype != dtype:
                        raise TypeError('Incompatible type for fill_value')
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape = len(row_idx), len(col_idx)
        out = np.empty(out_shape, dtype=dtype)

    func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
    if func is None and arr.dtype != out.dtype:
        func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
        if func is not None:
            func = _convert_wrapper(func, out.dtype)
    if func is None:
        def func(arr, indexer, out, fill_value=np.nan):
            _take_2d_multi_object(arr, indexer, out, fill_value=fill_value,
                                  mask_info=mask_info)
    func(arr, indexer, out=out, fill_value=fill_value)
    return out
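
# Illustrative use of take_2d_multi (a sketch, not part of the original
# module): rows and columns are indexed independently, and a -1 on either
# axis marks positions to fill, again promoting int64 to float64 for the
# np.nan default.
#
#     arr = np.array([[1, 2], [3, 4]], dtype=np.int64)
#     take_2d_multi(arr, (np.array([0, -1]), np.array([1, 0])))
#     # -> array([[ 2.,  1.],
#     #           [nan, nan]])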
# ---- #
# diff #
# ---- #

_diff_special = {
    'float64': algos.diff_2d_float64,
    'float32': algos.diff_2d_float32,
    'int64': algos.diff_2d_int64,
    'int32': algos.diff_2d_int32,
    'int16': algos.diff_2d_int16,
    'int8': algos.diff_2d_int8,
}
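
# The table above maps a dtype name to its 2-D Cython diff kernel; ``diff``
# below consults it only for 2-D inputs with one of these dtypes and
# otherwise falls back to plain sliced subtraction.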
def diff(arr, n, axis=0):
    """
    Compute the difference over `n` periods, analogous to ``s - s.shift(n)``.

    Parameters
    ----------
    arr : ndarray
    n : int
        number of periods
    axis : int
        axis to shift on

    Returns
    -------
    shifted
    """
    n = int(n)
    na = np.nan
    dtype = arr.dtype

    is_timedelta = False
    if needs_i8_conversion(arr):
        dtype = np.float64
        arr = arr.view('i8')
        na = iNaT
        is_timedelta = True

    elif is_bool_dtype(dtype):
        dtype = np.object_

    elif is_integer_dtype(dtype):
        dtype = np.float64

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * arr.ndim
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.ndim == 2 and arr.dtype.name in _diff_special:
        f = _diff_special[arr.dtype.name]
        f(arr, out_arr, n, axis)
    else:
        res_indexer = [slice(None)] * arr.ndim
        res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(res_indexer)

        lag_indexer = [slice(None)] * arr.ndim
        lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(lag_indexer)

        # need to make sure that we account for na for datelike/timedelta
        # we don't actually want to subtract these i8 numbers
        if is_timedelta:
            res = arr[res_indexer]
            lag = arr[lag_indexer]

            mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
            if mask.any():
                res = res.copy()
                res[mask] = 0
                lag = lag.copy()
                lag[mask] = 0

            result = res - lag
            result[mask] = na
            out_arr[res_indexer] = result
        else:
            out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    if is_timedelta:
        from pandas import TimedeltaIndex
        out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(
            out_arr.shape).astype('timedelta64[ns]')

    return out_arr
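
# Illustrative use of diff (a sketch, not part of the original module):
# integer input is upcast to float64 so the leading NaN slots can be
# represented; for n=1 this matches s - s.shift(1).
#
#     diff(np.array([1, 3, 6], dtype=np.int64), 1)
#     # -> array([nan,  2.,  3.])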