_stats_mstats_common.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. from collections import namedtuple
  2. import numpy as np
  3. from . import distributions
  4. __all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
  5. LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
  6. 'rvalue', 'pvalue',
  7. 'stderr'))
  8. def linregress(x, y=None):
  9. """
  10. Calculate a linear least-squares regression for two sets of measurements.
  11. Parameters
  12. ----------
  13. x, y : array_like
  14. Two sets of measurements. Both arrays should have the same length.
  15. If only x is given (and y=None), then it must be a two-dimensional
  16. array where one dimension has length 2. The two sets of measurements
  17. are then found by splitting the array along the length-2 dimension.
  18. Returns
  19. -------
  20. slope : float
  21. slope of the regression line
  22. intercept : float
  23. intercept of the regression line
  24. rvalue : float
  25. correlation coefficient
  26. pvalue : float
  27. two-sided p-value for a hypothesis test whose null hypothesis is
  28. that the slope is zero, using Wald Test with t-distribution of
  29. the test statistic.
  30. stderr : float
  31. Standard error of the estimated gradient.
  32. See also
  33. --------
  34. :func:`scipy.optimize.curve_fit` : Use non-linear
  35. least squares to fit a function to data.
  36. :func:`scipy.optimize.leastsq` : Minimize the sum of
  37. squares of a set of equations.
  38. Examples
  39. --------
  40. >>> import matplotlib.pyplot as plt
  41. >>> from scipy import stats
  42. Generate some data:
  43. >>> np.random.seed(12345678)
  44. >>> x = np.random.random(10)
  45. >>> y = 1.6*x + np.random.random(10)
  46. Perform the linear regression:
  47. >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
  48. >>> print("slope: %f intercept: %f" % (slope, intercept))
  49. slope: 1.944864 intercept: 0.268578
  50. To get coefficient of determination (r_squared):
  51. >>> print("r-squared: %f" % r_value**2)
  52. r-squared: 0.735498
  53. Plot the data along with the fitted line:
  54. >>> plt.plot(x, y, 'o', label='original data')
  55. >>> plt.plot(x, intercept + slope*x, 'r', label='fitted line')
  56. >>> plt.legend()
  57. >>> plt.show()
  58. """
  59. TINY = 1.0e-20
  60. if y is None: # x is a (2, N) or (N, 2) shaped array_like
  61. x = np.asarray(x)
  62. if x.shape[0] == 2:
  63. x, y = x
  64. elif x.shape[1] == 2:
  65. x, y = x.T
  66. else:
  67. msg = ("If only `x` is given as input, it has to be of shape "
  68. "(2, N) or (N, 2), provided shape was %s" % str(x.shape))
  69. raise ValueError(msg)
  70. else:
  71. x = np.asarray(x)
  72. y = np.asarray(y)
  73. if x.size == 0 or y.size == 0:
  74. raise ValueError("Inputs must not be empty.")
  75. n = len(x)
  76. xmean = np.mean(x, None)
  77. ymean = np.mean(y, None)
  78. # average sum of squares:
  79. ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
  80. r_num = ssxym
  81. r_den = np.sqrt(ssxm * ssym)
  82. if r_den == 0.0:
  83. r = 0.0
  84. else:
  85. r = r_num / r_den
  86. # test for numerical error propagation
  87. if r > 1.0:
  88. r = 1.0
  89. elif r < -1.0:
  90. r = -1.0
  91. df = n - 2
  92. slope = r_num / ssxm
  93. intercept = ymean - slope*xmean
  94. if n == 2:
  95. # handle case when only two points are passed in
  96. if y[0] == y[1]:
  97. prob = 1.0
  98. else:
  99. prob = 0.0
  100. sterrest = 0.0
  101. else:
  102. t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  103. prob = 2 * distributions.t.sf(np.abs(t), df)
  104. sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
  105. return LinregressResult(slope, intercept, r, prob, sterrest)
  106. def theilslopes(y, x=None, alpha=0.95):
  107. r"""
  108. Computes the Theil-Sen estimator for a set of points (x, y).
  109. `theilslopes` implements a method for robust linear regression. It
  110. computes the slope as the median of all slopes between paired values.
  111. Parameters
  112. ----------
  113. y : array_like
  114. Dependent variable.
  115. x : array_like or None, optional
  116. Independent variable. If None, use ``arange(len(y))`` instead.
  117. alpha : float, optional
  118. Confidence degree between 0 and 1. Default is 95% confidence.
  119. Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
  120. interpreted as "find the 90% confidence interval".
  121. Returns
  122. -------
  123. medslope : float
  124. Theil slope.
  125. medintercept : float
  126. Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
  127. lo_slope : float
  128. Lower bound of the confidence interval on `medslope`.
  129. up_slope : float
  130. Upper bound of the confidence interval on `medslope`.
  131. See also
  132. --------
  133. siegelslopes : a similar technique using repeated medians
  134. Notes
  135. -----
  136. The implementation of `theilslopes` follows [1]_. The intercept is
  137. not defined in [1]_, and here it is defined as ``median(y) -
  138. medslope*median(x)``, which is given in [3]_. Other definitions of
  139. the intercept exist in the literature. A confidence interval for
  140. the intercept is not given as this question is not addressed in
  141. [1]_.
  142. References
  143. ----------
  144. .. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau",
  145. J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
  146. .. [2] H. Theil, "A rank-invariant method of linear and polynomial
  147. regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
  148. 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
  149. .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
  150. John Wiley and Sons, New York, pp. 493.
  151. Examples
  152. --------
  153. >>> from scipy import stats
  154. >>> import matplotlib.pyplot as plt
  155. >>> x = np.linspace(-5, 5, num=150)
  156. >>> y = x + np.random.normal(size=x.size)
  157. >>> y[11:15] += 10 # add outliers
  158. >>> y[-5:] -= 7
  159. Compute the slope, intercept and 90% confidence interval. For comparison,
  160. also compute the least-squares fit with `linregress`:
  161. >>> res = stats.theilslopes(y, x, 0.90)
  162. >>> lsq_res = stats.linregress(x, y)
  163. Plot the results. The Theil-Sen regression line is shown in red, with the
  164. dashed red lines illustrating the confidence interval of the slope (note
  165. that the dashed red lines are not the confidence interval of the regression
  166. as the confidence interval of the intercept is not included). The green
  167. line shows the least-squares fit for comparison.
  168. >>> fig = plt.figure()
  169. >>> ax = fig.add_subplot(111)
  170. >>> ax.plot(x, y, 'b.')
  171. >>> ax.plot(x, res[1] + res[0] * x, 'r-')
  172. >>> ax.plot(x, res[1] + res[2] * x, 'r--')
  173. >>> ax.plot(x, res[1] + res[3] * x, 'r--')
  174. >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
  175. >>> plt.show()
  176. """
  177. # We copy both x and y so we can use _find_repeats.
  178. y = np.array(y).flatten()
  179. if x is None:
  180. x = np.arange(len(y), dtype=float)
  181. else:
  182. x = np.array(x, dtype=float).flatten()
  183. if len(x) != len(y):
  184. raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
  185. # Compute sorted slopes only when deltax > 0
  186. deltax = x[:, np.newaxis] - x
  187. deltay = y[:, np.newaxis] - y
  188. slopes = deltay[deltax > 0] / deltax[deltax > 0]
  189. slopes.sort()
  190. medslope = np.median(slopes)
  191. medinter = np.median(y) - medslope * np.median(x)
  192. # Now compute confidence intervals
  193. if alpha > 0.5:
  194. alpha = 1. - alpha
  195. z = distributions.norm.ppf(alpha / 2.)
  196. # This implements (2.6) from Sen (1968)
  197. _, nxreps = _find_repeats(x)
  198. _, nyreps = _find_repeats(y)
  199. nt = len(slopes) # N in Sen (1968)
  200. ny = len(y) # n in Sen (1968)
  201. # Equation 2.6 in Sen (1968):
  202. sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
  203. sum(k * (k-1) * (2*k + 5) for k in nxreps) -
  204. sum(k * (k-1) * (2*k + 5) for k in nyreps))
  205. # Find the confidence interval indices in `slopes`
  206. sigma = np.sqrt(sigsq)
  207. Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
  208. Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
  209. delta = slopes[[Rl, Ru]]
  210. return medslope, medinter, delta[0], delta[1]
  211. def _find_repeats(arr):
  212. # This function assumes it may clobber its input.
  213. if len(arr) == 0:
  214. return np.array(0, np.float64), np.array(0, np.intp)
  215. # XXX This cast was previously needed for the Fortran implementation,
  216. # should we ditch it?
  217. arr = np.asarray(arr, np.float64).ravel()
  218. arr.sort()
  219. # Taken from NumPy 1.9's np.unique.
  220. change = np.concatenate(([True], arr[1:] != arr[:-1]))
  221. unique = arr[change]
  222. change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
  223. freq = np.diff(change_idx)
  224. atleast2 = freq > 1
  225. return unique[atleast2], freq[atleast2]
  226. def siegelslopes(y, x=None, method="hierarchical"):
  227. r"""
  228. Computes the Siegel estimator for a set of points (x, y).
  229. `siegelslopes` implements a method for robust linear regression
  230. using repeated medians (see [1]_) to fit a line to the points (x, y).
  231. The method is robust to outliers with an asymptotic breakdown point
  232. of 50%.
  233. Parameters
  234. ----------
  235. y : array_like
  236. Dependent variable.
  237. x : array_like or None, optional
  238. Independent variable. If None, use ``arange(len(y))`` instead.
  239. method : {'hierarchical', 'separate'}
  240. If 'hierarchical', estimate the intercept using the estimated
  241. slope ``medslope`` (default option).
  242. If 'separate', estimate the intercept independent of the estimated
  243. slope. See Notes for details.
  244. Returns
  245. -------
  246. medslope : float
  247. Estimate of the slope of the regression line.
  248. medintercept : float
  249. Estimate of the intercept of the regression line.
  250. See also
  251. --------
  252. theilslopes : a similar technique without repeated medians
  253. Notes
  254. -----
  255. With ``n = len(y)``, compute ``m_j`` as the median of
  256. the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
  257. ``medslope`` is then the median of all slopes ``m_j``.
  258. Two ways are given to estimate the intercept in [1]_ which can be chosen
  259. via the parameter ``method``.
  260. The hierarchical approach uses the estimated slope ``medslope``
  261. and computes ``medintercept`` as the median of ``y - medslope*x``.
  262. The other approach estimates the intercept separately as follows: for
  263. each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
  264. lines through the remaining points and take the median ``i_j``.
  265. ``medintercept`` is the median of the ``i_j``.
  266. The implementation computes `n` times the median of a vector of size `n`
  267. which can be slow for large vectors. There are more efficient algorithms
  268. (see [2]_) which are not implemented here.
  269. References
  270. ----------
  271. .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
  272. Biometrika, Vol. 69, pp. 242-244, 1982.
  273. .. [2] A. Stein and M. Werman, "Finding the repeated median regression
  274. line", Proceedings of the Third Annual ACM-SIAM Symposium on
  275. Discrete Algorithms, pp. 409-413, 1992.
  276. Examples
  277. --------
  278. >>> from scipy import stats
  279. >>> import matplotlib.pyplot as plt
  280. >>> x = np.linspace(-5, 5, num=150)
  281. >>> y = x + np.random.normal(size=x.size)
  282. >>> y[11:15] += 10 # add outliers
  283. >>> y[-5:] -= 7
  284. Compute the slope and intercept. For comparison, also compute the
  285. least-squares fit with `linregress`:
  286. >>> res = stats.siegelslopes(y, x)
  287. >>> lsq_res = stats.linregress(x, y)
  288. Plot the results. The Siegel regression line is shown in red. The green
  289. line shows the least-squares fit for comparison.
  290. >>> fig = plt.figure()
  291. >>> ax = fig.add_subplot(111)
  292. >>> ax.plot(x, y, 'b.')
  293. >>> ax.plot(x, res[1] + res[0] * x, 'r-')
  294. >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
  295. >>> plt.show()
  296. """
  297. if method not in ['hierarchical', 'separate']:
  298. raise ValueError("method can only be 'hierarchical' or 'separate'")
  299. y = np.asarray(y).ravel()
  300. if x is None:
  301. x = np.arange(len(y), dtype=float)
  302. else:
  303. x = np.asarray(x, dtype=float).ravel()
  304. if len(x) != len(y):
  305. raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
  306. deltax = x[:, np.newaxis] - x
  307. deltay = y[:, np.newaxis] - y
  308. slopes, intercepts = [], []
  309. for j in range(len(x)):
  310. id_nonzero = deltax[j, :] != 0
  311. slopes_j = deltay[j, id_nonzero] / deltax[j, id_nonzero]
  312. medslope_j = np.median(slopes_j)
  313. slopes.append(medslope_j)
  314. if method == 'separate':
  315. z = y*x[j] - y[j]*x
  316. medintercept_j = np.median(z[id_nonzero] / deltax[j, id_nonzero])
  317. intercepts.append(medintercept_j)
  318. medslope = np.median(np.asarray(slopes))
  319. if method == "separate":
  320. medinter = np.median(np.asarray(intercepts))
  321. else:
  322. medinter = np.median(y - medslope*x)
  323. return medslope, medinter