tile.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. """
  2. Quantilization functions and related stuff
  3. """
  4. from functools import partial
  5. import numpy as np
  6. from pandas._libs.lib import infer_dtype
  7. from pandas.core.dtypes.common import (
  8. _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype,
  9. is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer,
  10. is_scalar, is_timedelta64_dtype)
  11. from pandas.core.dtypes.missing import isna
  12. from pandas import (
  13. Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp,
  14. to_datetime, to_timedelta)
  15. import pandas.core.algorithms as algos
  16. import pandas.core.nanops as nanops
  def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
          include_lowest=False, duplicates='raise'):
      """
      Bin values into discrete intervals.

      Use `cut` when you need to segment and sort data values into bins. This
      function is also useful for going from a continuous variable to a
      categorical variable. For example, `cut` could convert ages to groups of
      age ranges. Supports binning into an equal number of bins, or a
      pre-specified array of bins.

      Parameters
      ----------
      x : array-like
          The input array to be binned. Must be 1-dimensional.
      bins : int, sequence of scalars, or pandas.IntervalIndex
          The criteria to bin by.

          * int : Defines the number of equal-width bins in the range of `x`.
            The range of `x` is extended by .1% on one side (the left side
            when ``right=True``, otherwise the right side) so that the
            minimum or maximum value of `x` is included.
          * sequence of scalars : Defines the bin edges allowing for
            non-uniform width. No extension of the range of `x` is done.
          * IntervalIndex : Defines the exact bins to be used. Note that
            IntervalIndex for `bins` must be non-overlapping.

      right : bool, default True
          Indicates whether `bins` includes the rightmost edge or not. If
          ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
          indicate (1,2], (2,3], (3,4]. This argument is ignored when
          `bins` is an IntervalIndex.
      labels : array or False, default None
          Specifies the labels for the returned bins. Must be the same length
          as the resulting bins. If False, returns only integer indicators of
          the bins. This affects the type of the output container (see
          below). This argument is ignored when `bins` is an IntervalIndex.
      retbins : bool, default False
          Whether to return the bins or not. Useful when bins is provided
          as a scalar.
      precision : int, default 3
          The precision at which to store and display the bins labels.
      include_lowest : bool, default False
          Whether the first interval should be left-inclusive or not.
      duplicates : {default 'raise', 'drop'}, optional
          If bin edges are not unique, raise ValueError or drop non-uniques.

          .. versionadded:: 0.23.0

      Returns
      -------
      out : pandas.Categorical, Series, or ndarray
          An array-like object representing the respective bin for each value
          of `x`. The type depends on the value of `labels`.

          * None (default) : returns a Series for Series `x` or a
            pandas.Categorical for all other inputs. The values stored within
            are Interval dtype.
          * sequence of scalars : returns a Series for Series `x` or a
            pandas.Categorical for all other inputs. The values stored within
            are whatever the type in the sequence is.
          * False : returns an ndarray of integers.

      bins : numpy.ndarray or IntervalIndex.
          The computed or specified bins. Only returned when `retbins=True`.
          For scalar or sequence `bins`, this is an ndarray with the computed
          bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
          an IntervalIndex `bins`, this is equal to `bins`.

      Raises
      ------
      ValueError
          If `bins` is a scalar smaller than 1, if `x` is empty or contains
          infinity while `bins` is not iterable, if an IntervalIndex passed
          as `bins` is overlapping, or if a sequence of bin edges is not
          monotonically increasing.

      See Also
      --------
      qcut : Discretize variable into equal-sized buckets based on rank
          or based on sample quantiles.
      pandas.Categorical : Array type for storing data that come from a
          fixed set of values.
      Series : One-dimensional array with axis labels (including time series).
      pandas.IntervalIndex : Immutable Index implementing an ordered,
          sliceable set.

      Notes
      -----
      Any NA values will be NA in the result. Out of bounds values will be NA
      in the resulting Series or pandas.Categorical object.

      Examples
      --------
      Discretize into three equal-sized bins.

      >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
      ... # doctest: +ELLIPSIS
      [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
      Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...

      >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
      ... # doctest: +ELLIPSIS
      ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
      Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
      array([0.994, 3.   , 5.   , 7.   ]))

      Discovers the same bins, but assign them specific labels. Notice that
      the returned Categorical's categories are `labels` and is ordered.

      >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
      ...        3, labels=["bad", "medium", "good"])
      [bad, good, medium, medium, good, bad]
      Categories (3, object): [bad < medium < good]

      ``labels=False`` implies you just want the bins back.

      >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
      array([0, 1, 1, 3])

      Passing a Series as an input returns a Series with categorical dtype:

      >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
      ...               index=['a', 'b', 'c', 'd', 'e'])
      >>> pd.cut(s, 3)
      ... # doctest: +ELLIPSIS
      a    (1.992, 4.667]
      b    (1.992, 4.667]
      c    (4.667, 7.333]
      d     (7.333, 10.0]
      e     (7.333, 10.0]
      dtype: category
      Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...

      Passing a Series as an input returns a Series with mapping value.
      It is used to map numerically to intervals based on bins. With
      ``right=False`` the last edge is excluded, so 10 is out of bounds.

      >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
      ...               index=['a', 'b', 'c', 'd', 'e'])
      >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True,
      ...        right=False)
      ... # doctest: +ELLIPSIS
      (a    1.0
       b    2.0
       c    3.0
       d    4.0
       e    NaN
       dtype: float64,
       array([ 0,  2,  4,  6,  8, 10]))

      Use `drop` optional when bins is not unique

      >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
      ...        right=False, duplicates='drop')
      ... # doctest: +ELLIPSIS
      (a    1.0
       b    2.0
       c    3.0
       d    3.0
       e    NaN
       dtype: float64,
       array([ 0,  2,  4,  6, 10]))

      Passing an IntervalIndex for `bins` results in those categories exactly.
      Notice that values not covered by the IntervalIndex are set to NaN. 0
      is to the left of the first bin (which is closed on the right), and 1.5
      falls between two bins.

      >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
      >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
      [NaN, (0, 1], NaN, (2, 3], (4, 5]]
      Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
      """
      # NOTE: this binning code is changed a bit from histogram for var(x) == 0

      # for handling the cut for datetime and timedelta objects
      x_is_series, series_index, name, x = _preprocess_for_cut(x)
      x, dtype = _coerce_to_type(x)

      if not np.iterable(bins):
          if is_scalar(bins) and bins < 1:
              raise ValueError("`bins` should be a positive integer.")

          try:  # for array-like
              sz = x.size
          except AttributeError:
              x = np.asarray(x)
              sz = x.size

          if sz == 0:
              raise ValueError('Cannot cut empty array')

          rng = (nanops.nanmin(x), nanops.nanmax(x))
          # "+ 0.0" forces float arithmetic for the edge adjustments below
          mn, mx = [mi + 0.0 for mi in rng]

          if np.isinf(mn) or np.isinf(mx):
              # GH 24314
              raise ValueError('cannot specify integer `bins` when input data '
                               'contains infinity')
          elif mn == mx:  # adjust end points before binning
              mn -= .001 * abs(mn) if mn != 0 else .001
              mx += .001 * abs(mx) if mx != 0 else .001
              bins = np.linspace(mn, mx, bins + 1, endpoint=True)
          else:  # adjust end points after binning
              bins = np.linspace(mn, mx, bins + 1, endpoint=True)
              adj = (mx - mn) * 0.001  # 0.1% of the range
              if right:
                  bins[0] -= adj
              else:
                  bins[-1] += adj

      elif isinstance(bins, IntervalIndex):
          if bins.is_overlapping:
              raise ValueError('Overlapping IntervalIndex is not accepted.')

      else:
          if is_datetime64tz_dtype(bins):
              bins = np.asarray(bins, dtype=_NS_DTYPE)
          else:
              bins = np.asarray(bins)
          bins = _convert_bin_to_numeric_type(bins, dtype)

          # Compare in float64: the difference of two fixed-width integers can
          # wrap around (e.g. int64 edges spanning more than 2**63), which
          # would make increasing edges look decreasing.
          if (np.diff(bins.astype('float64')) < 0).any():
              raise ValueError('bins must increase monotonically.')

      fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
                                precision=precision,
                                include_lowest=include_lowest,
                                dtype=dtype,
                                duplicates=duplicates)

      return _postprocess_for_cut(fac, bins, retbins, x_is_series,
                                  series_index, name, dtype)
  def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
      """
      Quantile-based discretization function.

      Discretize variable into equal-sized buckets based on rank or based on
      sample quantiles. For example 1000 values for 10 quantiles would produce
      a Categorical object indicating quantile membership for each data point.

      Parameters
      ----------
      x : 1d ndarray or Series
          The values to discretize.
      q : integer or array of quantiles
          Number of quantiles. 10 for deciles, 4 for quartiles, etc.
          Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for
          quartiles. An integer ``q`` is expanded to ``q + 1`` evenly spaced
          quantiles between 0 and 1.
      labels : array or boolean, default None
          Used as labels for the resulting bins. Must be of the same length as
          the resulting bins. If False, return only integer indicators of the
          bins.
      retbins : bool, optional
          Whether to also return the bin edges. Can be useful if `q` is given
          as a scalar.
      precision : int, optional
          The precision at which to store and display the bins labels.
      duplicates : {default 'raise', 'drop'}, optional
          If bin edges are not unique, raise ValueError or drop non-uniques.

          .. versionadded:: 0.20.0

      Returns
      -------
      out : Categorical or Series or array of integers if labels is False
          The return type (Categorical or Series) depends on the input: a
          Series of type category if input is a Series else Categorical. Bins
          are represented as categories when categorical data is returned.
      bins : ndarray of floats
          Returned only if `retbins` is True.

      Notes
      -----
      Out of bounds values will be NA in the resulting Categorical object

      Examples
      --------
      >>> pd.qcut(range(5), 4)
      ... # doctest: +ELLIPSIS
      [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
      Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

      >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
      ... # doctest: +SKIP
      [good, good, medium, bad, bad]
      Categories (3, object): [good < medium < bad]

      >>> pd.qcut(range(5), 4, labels=False)
      array([0, 0, 1, 2, 3])
      """
      is_series, original_index, original_name, values = \
          _preprocess_for_cut(x)
      values, dtype = _coerce_to_type(values)

      # An integer q is shorthand for q equal-probability buckets.
      quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
      edges = algos.quantile(values, quantiles)

      # The lowest edge is always included so the minimum lands in a bucket.
      cut_kwargs = dict(labels=labels, precision=precision,
                        include_lowest=True, dtype=dtype,
                        duplicates=duplicates)
      fac, edges = _bins_to_cuts(values, edges, **cut_kwargs)

      return _postprocess_for_cut(fac, edges, retbins, is_series,
                                  original_index, original_name, dtype)
  def _bins_to_cuts(x, bins, right=True, labels=None,
                    precision=3, include_lowest=False,
                    dtype=None, duplicates='raise'):
      """
      Assign each value of `x` to one of the intervals delimited by `bins`.

      Parameters
      ----------
      x : array-like
          Values to bin (already converted to numeric by the caller).
      bins : ndarray of edges or IntervalIndex
          Bin edges, or the exact intervals to use.
      right : bool, default True
          Whether intervals are closed on the right.
      labels : array, False or None, default None
          None builds interval labels from the edges, False returns integer
          bin positions, an array supplies one label per bin.
      precision : int, default 3
          Precision passed on when building interval labels.
      include_lowest : bool, default False
          Whether values equal to the first edge fall in the first bin.
      dtype : dtype, optional
          Original datelike dtype of `x`, passed on when building labels.
      duplicates : {'raise', 'drop'}, default 'raise'
          What to do when bin edges are not unique.

      Returns
      -------
      result : Categorical or ndarray
          The bin of every value; ndarray of bin positions when `labels` is
          False (float with NaN when some value has no bin).
      bins : ndarray or IntervalIndex
          The edges actually used (duplicates dropped if requested).

      Raises
      ------
      ValueError
          If `duplicates` is not a valid option, if the edges are not unique
          and ``duplicates='raise'``, if `labels` is a scalar other than
          False/None, or if the number of labels does not match the bins.
      """
      if duplicates not in ['raise', 'drop']:
          raise ValueError("invalid value for 'duplicates' parameter, "
                           "valid options are: raise, drop")

      if isinstance(bins, IntervalIndex):
          # we have a fast-path here
          ids = bins.get_indexer(x)
          result = algos.take_nd(bins, ids)
          result = Categorical(result, categories=bins, ordered=True)
          return result, bins

      unique_bins = algos.unique(bins)
      if len(unique_bins) < len(bins) and len(bins) != 2:
          if duplicates == 'raise':
              raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                               "can drop duplicate edges by setting "
                               "the 'duplicates' kwarg".format(bins=bins))
          else:
              bins = unique_bins

      # For right-closed intervals a value equal to an edge must get that
      # edge's position, hence searching from the left (and vice versa).
      side = 'left' if right else 'right'
      ids = ensure_int64(bins.searchsorted(x, side=side))

      if include_lowest:
          ids[x == bins[0]] = 1

      # Position 0 is below the first edge, len(bins) is above the last one.
      na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
      has_nas = na_mask.any()

      if labels is not False:
          if labels is None:
              labels = _format_labels(bins, precision, right=right,
                                      include_lowest=include_lowest,
                                      dtype=dtype)
          elif is_scalar(labels):
              # e.g. labels=True: fail with a clear message instead of the
              # TypeError that len() would raise on a scalar.
              raise ValueError('Bin labels must either be False, None or '
                               'passed in as a list-like argument')
          else:
              if len(labels) != len(bins) - 1:
                  raise ValueError('Bin labels must be one fewer than '
                                   'the number of bin edges')
          if not is_categorical_dtype(labels):
              labels = Categorical(labels, categories=labels, ordered=True)

          # Masked positions become -1 below, i.e. "missing" for take_nd.
          np.putmask(ids, na_mask, 0)
          result = algos.take_nd(labels, ids - 1)

      else:
          result = ids - 1
          if has_nas:
              result = result.astype(np.float64)
              np.putmask(result, na_mask, np.nan)

      return result, bins
  def _trim_zeros(x):
      """
      Drop trailing '0' characters, then one trailing '.', from `x`.

      At least one character is always kept, so an all-zero input collapses
      to its first character instead of becoming empty.
      """
      end = len(x)
      # Walk the cut-off point left past every trailing zero.
      while end > 1 and x[end - 1] == '0':
          end -= 1
      # A decimal point left dangling at the end is removed as well.
      if end > 1 and x[end - 1] == '.':
          end -= 1
      return x[:end]
  def _coerce_to_type(x):
      """
      Convert datetime/timedelta data to numeric so that cut can handle it.

      Returns
      -------
      x : the input unchanged, or an array of its int64 view with missing
          entries replaced by NaN when the input is datelike
      dtype : the original datelike dtype, or None for any other input
      """
      if is_datetime64tz_dtype(x):
          dtype = x.dtype
      elif is_datetime64_dtype(x):
          x, dtype = to_datetime(x), np.dtype('datetime64[ns]')
      elif is_timedelta64_dtype(x):
          x, dtype = to_timedelta(x), np.dtype('timedelta64[ns]')
      else:
          # Not datelike: nothing to convert.
          return x, None

      # GH 19768: force NaT to NaN during integer conversion
      present = x.notna()
      as_integers = x.view(np.int64)
      return np.where(present, as_integers, np.nan), dtype
  def _convert_bin_to_numeric_type(bins, dtype):
      """
      Convert datetime/timedelta bins to their integer representation.

      Bins are returned untouched when `dtype` is not datelike.

      Parameters
      ----------
      bins : list-like of bins
      dtype : dtype of data

      Raises
      ------
      ValueError if bins are not of a compat dtype to dtype
      """
      inferred = infer_dtype(bins, skipna=False)

      if is_timedelta64_dtype(dtype):
          if inferred not in ('timedelta', 'timedelta64'):
              raise ValueError("bins must be of timedelta64 dtype")
          return to_timedelta(bins).view(np.int64)

      if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
          if inferred not in ('datetime', 'datetime64'):
              raise ValueError("bins must be of datetime64 dtype")
          return to_datetime(bins).view(np.int64)

      return bins
  def _convert_bin_to_datelike_type(bins, dtype):
      """
      Convert bins back to a datelike index when the original dtype was
      datelike; otherwise hand them back unchanged.

      Parameters
      ----------
      bins : list-like of bins
      dtype : dtype of data

      Returns
      -------
      bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
             datelike
      """
      tz_aware = is_datetime64tz_dtype(dtype)
      if not tz_aware and not is_datetime_or_timedelta_dtype(dtype):
          return bins

      as_int = bins.astype(np.int64)
      if tz_aware:
          # Read the integers as UTC, then move to the data's time zone.
          return to_datetime(as_int, utc=True).tz_convert(dtype.tz)
      return Index(as_int, dtype=dtype)
  def _format_labels(bins, precision, right=True,
                     include_lowest=False, dtype=None):
      """
      Build the interval labels for `bins`, based on the dtype.

      Datelike dtypes turn every edge into a Timestamp or Timedelta; any
      other dtype rounds the edges to an inferred precision.
      """
      numeric = False
      if is_datetime64tz_dtype(dtype):
          to_break = partial(Timestamp, tz=dtype.tz)
      elif is_datetime64_dtype(dtype):
          to_break = Timestamp
      elif is_timedelta64_dtype(dtype):
          to_break = Timedelta
      else:
          numeric = True
          precision = _infer_precision(precision, bins)
          to_break = partial(_round_frac, precision=precision)

      breaks = [to_break(edge) for edge in bins]
      closed = 'right' if right else 'left'
      labels = IntervalIndex.from_breaks(breaks, closed=closed)

      if not (right and include_lowest):
          return labels

      # we will adjust the left hand side by precision to
      # account that we are all right closed
      step = 10 ** (-precision) if numeric else Timedelta('1ns')
      first = labels[0]
      widened = Interval(first.left - step, first.right, closed='right')
      return IntervalIndex([widened]).append(labels[1:])
  def _preprocess_for_cut(x):
      """
      Prepare the input of cut: remember the index and name of a Series and
      make sure the data is a 1-dimensional array-like.

      Returns
      -------
      tuple of (x_is_series, series_index, name, x)

      Raises
      ------
      ValueError if the input is not 1-dimensional
      """
      if isinstance(x, Series):
          x_is_series, series_index, name = True, x.index, x.name
      else:
          x_is_series, series_index, name = False, None, None

      # Check that the passed array is a Pandas or Numpy object
      # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
      if getattr(x, 'ndim', None) is None:
          x = np.asarray(x)

      if x.ndim != 1:
          raise ValueError("Input array must be 1 dimensional")

      return x_is_series, series_index, name, x
  def _postprocess_for_cut(fac, bins, retbins, x_is_series,
                           series_index, name, dtype):
      """
      Finish up cut: restore the index and name if the original input was a
      Series and, when `retbins` is set, return the bins next to the result
      (converted back to a datelike type where applicable).
      """
      result = Series(fac, index=series_index, name=name) if x_is_series \
          else fac

      if retbins:
          return result, _convert_bin_to_datelike_type(bins, dtype)
      return result
  def _round_frac(x, precision):
      """
      Round the fractional part of the given number.

      Zero and non-finite values are returned as they are. For numbers
      without an integer part, `precision` counts digits after the leading
      zeros of the fraction.
      """
      if np.isfinite(x) and x != 0:
          frac, whole = np.modf(x)
          digits = precision
          if whole == 0:
              # Skip over the zeros that follow the decimal point.
              digits = precision - 1 - int(np.floor(np.log10(abs(frac))))
          return np.around(x, digits)
      return x
  def _infer_precision(base_precision, bins):
      """
      Infer an appropriate precision for _round_frac.

      Returns the first precision, counting up from `base_precision` and
      staying below 20, at which all rounded bins are distinct; falls back
      to `base_precision` when there is none.
      """
      def _keeps_bins_distinct(precision):
          levels = [_round_frac(edge, precision) for edge in bins]
          return algos.unique(levels).size == bins.size

      candidates = (precision for precision in range(base_precision, 20)
                    if _keeps_bins_distinct(precision))
      return next(candidates, base_precision)  # default