  1. """
  2. Additional statistics functions with support for masked arrays.
  3. """
  4. # Original author (2007): Pierre GF Gerard-Marchant
  5. from __future__ import division, print_function, absolute_import
  6. __all__ = ['compare_medians_ms',
  7. 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
  8. 'idealfourths',
  9. 'median_cihs','mjci','mquantiles_cimj',
  10. 'rsh',
  11. 'trimmed_mean_ci',]
  12. import numpy as np
  13. from numpy import float_, int_, ndarray
  14. import numpy.ma as ma
  15. from numpy.ma import MaskedArray
  16. from . import mstats_basic as mstats
  17. from scipy.stats.distributions import norm, beta, t, binom


def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    See Also
    --------
    hdquantiles_sd
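
    Examples
    --------
    A small illustrative sketch; the sample values below are arbitrary:

    >>> import numpy as np
    >>> from scipy.stats.mstats import hdquantiles
    >>> data = np.array([1., 2., 3., 5., 8., 13., 21., 34.])
    >>> hdquantiles(data, prob=[0.25, 0.5, 0.75]).shape
    (3,)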
  43. """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        hd = np.empty((2,len(prob)), float_)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]

        v = np.arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i,p) in enumerate(prob):
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0,i] = hd_mean
            hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)

        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)


def hdmedian(data, axis=-1, var=False):
    """
    Returns the Harrell-Davis estimate of the median along the given axis.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdmedian : MaskedArray
        The median values. If ``var=True``, the variance is returned inside
        the masked array. E.g. for a 1-D array the shape changes from (1,)
        to (2,).
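
    Examples
    --------
    A minimal sketch on arbitrary sample data, also requesting the
    variance of the estimate:

    >>> import numpy as np
    >>> from scipy.stats.mstats import hdmedian
    >>> data = np.array([2., 4., 4., 5., 6., 7., 9., 12.])
    >>> est = hdmedian(data, var=True)  # est[0]: median, est[1]: its variance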
  101. """
    result = hdquantiles(data, [0.5], axis=axis, var=var)
    return result.squeeze()


def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates.

    See Also
    --------
    hdquantiles
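
    Examples
    --------
    A sketch estimating jackknife standard errors for the three quartile
    estimates, on arbitrary sample data:

    >>> import numpy as np
    >>> from scipy.stats.mstats import hdquantiles_sd
    >>> data = np.array([1., 2., 4., 8., 16., 32., 64.])
    >>> se = hdquantiles_sd(data, prob=[0.25, 0.5, 0.75])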
  123. """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float_)
        if n < 2:
            # Not enough data: return NaNs (mirrors _hd_1D above)
            hdsd.flat = np.nan
            return hdsd

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i,p) in enumerate(prob):
            _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            # Jackknife: recompute the weighted estimate with the k-th
            # observation left out, for k = 0 .. n-1
            mx_ = np.fromiter([np.dot(w, xsorted[np.r_[list(range(0,k)),
                                                       list(range(k+1,n))].astype(int_)])
                               for k in range(n)], dtype=float_)
            mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
            hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()


def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two-item tuple.
        Tuple of the percentages to cut on each side of the array, with respect
        to the number of unmasked data, as floats between 0. and 1. If ``n``
        is the number of unmasked data before trimming, then the
        (``n * limits[0]``)th smallest data and the (``n * limits[1]``)th
        largest data are masked. The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open interval.
        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        If relative==False, tuple indicating whether values exactly equal to
        the absolute limits are allowed.
        If relative==True, tuple indicating whether the number of data being
        masked on each side should be rounded (True) or truncated (False).
        Defaults to (True, True).
    alpha : float, optional
        Confidence level of the intervals.
        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.
        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence intervals of the trimmed data.
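
    Examples
    --------
    An illustrative 95% interval for a 20%-trimmed mean; the sample
    values (including the outlier) are arbitrary:

    >>> import numpy as np
    >>> from scipy.stats.mstats import trimmed_mean_ci
    >>> data = np.array([5., 6., 7., 8., 9., 10., 11., 12., 80.])
    >>> lower, upper = trimmed_mean_ci(data, limits=(0.2, 0.2), alpha=0.05)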
  188. """
    data = ma.array(data, copy=False)
    trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
    tmean = trimmed.mean(axis)
    tstde = mstats.trimmed_stde(data, limits=limits, inclusive=inclusive, axis=axis)
    df = trimmed.count(axis) - 1
    tppf = t.ppf(1 - alpha/2., df)
    return np.array((tmean - tppf*tstde, tmean + tppf*tstde))


def mjci(data, prob=[0.25,0.5,0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    mjci : ndarray
        The Maritz-Jarrett standard errors, one per entry of `prob`.
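
    Examples
    --------
    A sketch of the standard errors of the three quartiles, computed on
    arbitrary sample data:

    >>> import numpy as np
    >>> from scipy.stats.mstats import mjci
    >>> data = np.array([3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
    >>> mjci(data, prob=[0.25, 0.5, 0.75]).shape
    (3,)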
  209. """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf

        mj = np.empty(len(prob), float_)
        x = np.arange(1, n+1, dtype=float_) / n
        y = x - 1./n
        for (i,m) in enumerate(prob):
            W = betacdf(x, m-1, n-m) - betacdf(y, m-1, n-m)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)


def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval. Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval. Of the same length as
        `prob`.
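
    Examples
    --------
    Illustrative 95% Maritz-Jarrett intervals around the quartiles of
    arbitrary sample data:

    >>> import numpy as np
    >>> from scipy.stats.mstats import mquantiles_cimj
    >>> data = np.array([4., 5., 6., 7., 8., 9., 10., 11., 12., 13.])
    >>> ci_low, ci_high = mquantiles_cimj(data, prob=[0.25, 0.5, 0.75])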
  257. """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)


def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.
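
    Examples
    --------
    A sketch computing a 95% interval for the median of arbitrary data:

    >>> import numpy as np
    >>> from scipy.stats.mstats import median_cihs
    >>> data = np.array([2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
    >>> low, high = median_cihs(data, alpha=0.05)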
  281. """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1-alpha)
        k = int(binom._ppf(alpha/2., n, 0.5))
        gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
        if gk < 1-alpha:
            k -= 1
            gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
        gkk = binom.cdf(n-k-1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lims = (lambd*data[k] + (1-lambd)*data[k-1],
                lambd*data[n-k-1] + (1-lambd)*data[n-k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

    return result


def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset. Has to be of size >= 7.
    group_2 : array_like
        Second dataset. Has to be of size >= 7.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened. If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.
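
    Examples
    --------
    An illustrative comparison of two small groups (each needs at least
    seven observations); the values are arbitrary:

    >>> import numpy as np
    >>> from scipy.stats.mstats import compare_medians_ms
    >>> group_1 = np.array([4., 5., 6., 6., 7., 8., 9.])
    >>> group_2 = np.array([6., 7., 8., 8., 9., 10., 11.])
    >>> p = compare_medians_ms(group_1, group_2)  # approximate p-value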
  328. """
    (med_1, med_2) = (ma.median(group_1, axis=axis), ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)


def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        Returns the two internal values that divide `data` into four parts
        using the ideal fourths algorithm either along the flattened array
        (if `axis` is None) or along `axis` of `data`.
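
    Examples
    --------
    A minimal sketch on evenly spaced sample data:

    >>> import numpy as np
    >>> from scipy.stats.mstats import idealfourths
    >>> qlo, qup = idealfourths(np.arange(20.))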
  351. """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan, np.nan]
        (j, h) = divmod(n/4. + 5/12., 1)
        j = int(j)
        qlo = (1-h)*x[j-1] + h*x[j]
        k = n - j
        qup = (1-h)*x[k] + h*x[k-1]
        return [qlo, qup]

    data = ma.sort(data, axis=axis).view(MaskedArray)
    if (axis is None):
        return _idf(data)
    else:
        return ma.apply_along_axis(_idf, axis, data)


def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to the
    derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D. Masked values are ignored.
    points : sequence or None, optional
        Sequence of points at which to evaluate Rosenblatt's shifted histogram.
        If None, use the data.

    Returns
    -------
    rsh : ndarray
        The estimated density values at `points`.
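
    Examples
    --------
    An illustrative density estimate at a few evaluation points, using
    arbitrary sample data:

    >>> import numpy as np
    >>> from scipy.stats.mstats import rsh
    >>> data = np.array([1., 2., 2., 3., 3., 3., 4., 4., 5., 7.])
    >>> dens = rsh(data, points=[2., 3., 4.])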
  380. """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        points = np.array(points, copy=False, ndmin=1)

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only!")

    n = data.count()
    r = idealfourths(data, axis=None)
    h = 1.2 * (r[-1]-r[0]) / n**(1./5)
    nhi = (data[:,None] <= points[None,:] + h).sum(0)
    nlo = (data[:,None] < points[None,:] - h).sum(0)
    return (nhi-nlo) / (2.*n*h)