kde.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625
  1. #-------------------------------------------------------------------------------
  2. #
  3. # Define classes for (uni/multi)-variate kernel density estimation.
  4. #
  5. # Currently, only Gaussian kernels are implemented.
  6. #
  7. # Written by: Robert Kern
  8. #
  9. # Date: 2004-08-09
  10. #
  11. # Modified: 2005-02-10 by Robert Kern.
  12. # Contributed to Scipy
  13. # 2005-10-07 by Robert Kern.
  14. # Some fixes to match the new scipy_core
  15. #
  16. # Copyright 2004-2005 by Enthought, Inc.
  17. #
  18. #-------------------------------------------------------------------------------
  19. from __future__ import division, print_function, absolute_import
  20. # Standard library imports.
  21. import warnings
  22. # Scipy imports.
  23. from scipy._lib.six import callable, string_types
  24. from scipy import linalg, special
  25. from scipy.special import logsumexp
  26. from scipy._lib._numpy_compat import cov
  27. from numpy import (atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt,
  28. ravel, power, atleast_1d, squeeze, sum, transpose, ones)
  29. import numpy as np
  30. from numpy.random import choice, multivariate_normal
  31. # Local imports.
  32. from . import mvn
  33. __all__ = ['gaussian_kde']
  34. class gaussian_kde(object):
  35. """Representation of a kernel-density estimate using Gaussian kernels.
  36. Kernel density estimation is a way to estimate the probability density
  37. function (PDF) of a random variable in a non-parametric way.
  38. `gaussian_kde` works for both uni-variate and multi-variate data. It
  39. includes automatic bandwidth determination. The estimation works best for
  40. a unimodal distribution; bimodal or multi-modal distributions tend to be
  41. oversmoothed.
  42. Parameters
  43. ----------
  44. dataset : array_like
  45. Datapoints to estimate from. In case of univariate data this is a 1-D
  46. array, otherwise a 2-D array with shape (# of dims, # of data).
  47. bw_method : str, scalar or callable, optional
  48. The method used to calculate the estimator bandwidth. This can be
  49. 'scott', 'silverman', a scalar constant or a callable. If a scalar,
  50. this will be used directly as `kde.factor`. If a callable, it should
  51. take a `gaussian_kde` instance as only parameter and return a scalar.
  52. If None (default), 'scott' is used. See Notes for more details.
  53. weights : array_like, optional
  54. weights of datapoints. This must be the same shape as dataset.
  55. If None (default), the samples are assumed to be equally weighted
  56. Attributes
  57. ----------
  58. dataset : ndarray
  59. The dataset with which `gaussian_kde` was initialized.
  60. d : int
  61. Number of dimensions.
  62. n : int
  63. Number of datapoints.
  64. neff : int
  65. Effective number of datapoints.
  66. .. versionadded:: 1.2.0
  67. factor : float
  68. The bandwidth factor, obtained from `kde.covariance_factor`, with which
  69. the covariance matrix is multiplied.
  70. covariance : ndarray
  71. The covariance matrix of `dataset`, scaled by the calculated bandwidth
  72. (`kde.factor`).
  73. inv_cov : ndarray
  74. The inverse of `covariance`.
  75. Methods
  76. -------
  77. evaluate
  78. __call__
  79. integrate_gaussian
  80. integrate_box_1d
  81. integrate_box
  82. integrate_kde
  83. pdf
  84. logpdf
  85. resample
  86. set_bandwidth
  87. covariance_factor
  88. Notes
  89. -----
  90. Bandwidth selection strongly influences the estimate obtained from the KDE
  91. (much more so than the actual shape of the kernel). Bandwidth selection
  92. can be done by a "rule of thumb", by cross-validation, by "plug-in
  93. methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
  94. uses a rule of thumb, the default is Scott's Rule.
  95. Scott's Rule [1]_, implemented as `scotts_factor`, is::
  96. n**(-1./(d+4)),
  97. with ``n`` the number of data points and ``d`` the number of dimensions.
  98. In the case of unequally weighted points, `scotts_factor` becomes::
  99. neff**(-1./(d+4)),
  100. with ``neff`` the effective number of datapoints.
  101. Silverman's Rule [2]_, implemented as `silverman_factor`, is::
  102. (n * (d + 2) / 4.)**(-1. / (d + 4)).
  103. or in the case of unequally weighted points::
  104. (neff * (d + 2) / 4.)**(-1. / (d + 4)).
  105. Good general descriptions of kernel density estimation can be found in [1]_
  106. and [2]_, the mathematics for this multi-dimensional implementation can be
  107. found in [1]_.
  108. With a set of weighted samples, the effective number of datapoints ``neff``
  109. is defined by::
  110. neff = sum(weights)^2 / sum(weights^2)
  111. as detailed in [5]_.
  112. References
  113. ----------
  114. .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
  115. Visualization", John Wiley & Sons, New York, Chicester, 1992.
  116. .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
  117. Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
  118. Chapman and Hall, London, 1986.
  119. .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
  120. Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
  121. .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
  122. conditional density estimation", Computational Statistics & Data
  123. Analysis, Vol. 36, pp. 279-298, 2001.
  124. .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
  125. Series A (General), 132, 272
  126. Examples
  127. --------
  128. Generate some random two-dimensional data:
  129. >>> from scipy import stats
  130. >>> def measure(n):
  131. ... "Measurement model, return two coupled measurements."
  132. ... m1 = np.random.normal(size=n)
  133. ... m2 = np.random.normal(scale=0.5, size=n)
  134. ... return m1+m2, m1-m2
  135. >>> m1, m2 = measure(2000)
  136. >>> xmin = m1.min()
  137. >>> xmax = m1.max()
  138. >>> ymin = m2.min()
  139. >>> ymax = m2.max()
  140. Perform a kernel density estimate on the data:
  141. >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
  142. >>> positions = np.vstack([X.ravel(), Y.ravel()])
  143. >>> values = np.vstack([m1, m2])
  144. >>> kernel = stats.gaussian_kde(values)
  145. >>> Z = np.reshape(kernel(positions).T, X.shape)
  146. Plot the results:
  147. >>> import matplotlib.pyplot as plt
  148. >>> fig, ax = plt.subplots()
  149. >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
  150. ... extent=[xmin, xmax, ymin, ymax])
  151. >>> ax.plot(m1, m2, 'k.', markersize=2)
  152. >>> ax.set_xlim([xmin, xmax])
  153. >>> ax.set_ylim([ymin, ymax])
  154. >>> plt.show()
  155. """
  156. def __init__(self, dataset, bw_method=None, weights=None):
  157. self.dataset = atleast_2d(dataset)
  158. if not self.dataset.size > 1:
  159. raise ValueError("`dataset` input should have multiple elements.")
  160. self.d, self.n = self.dataset.shape
  161. if weights is not None:
  162. self._weights = atleast_1d(weights).astype(float)
  163. self._weights /= sum(self._weights)
  164. if self.weights.ndim != 1:
  165. raise ValueError("`weights` input should be one-dimensional.")
  166. if len(self._weights) != self.n:
  167. raise ValueError("`weights` input should be of length n")
  168. self._neff = 1/sum(self._weights**2)
  169. self.set_bandwidth(bw_method=bw_method)
  170. def evaluate(self, points):
  171. """Evaluate the estimated pdf on a set of points.
  172. Parameters
  173. ----------
  174. points : (# of dimensions, # of points)-array
  175. Alternatively, a (# of dimensions,) vector can be passed in and
  176. treated as a single point.
  177. Returns
  178. -------
  179. values : (# of points,)-array
  180. The values at each point.
  181. Raises
  182. ------
  183. ValueError : if the dimensionality of the input points is different than
  184. the dimensionality of the KDE.
  185. """
  186. points = atleast_2d(points)
  187. d, m = points.shape
  188. if d != self.d:
  189. if d == 1 and m == self.d:
  190. # points was passed in as a row vector
  191. points = reshape(points, (self.d, 1))
  192. m = 1
  193. else:
  194. msg = "points have dimension %s, dataset has dimension %s" % (d,
  195. self.d)
  196. raise ValueError(msg)
  197. result = zeros((m,), dtype=float)
  198. whitening = linalg.cholesky(self.inv_cov)
  199. scaled_dataset = dot(whitening, self.dataset)
  200. scaled_points = dot(whitening, points)
  201. if m >= self.n:
  202. # there are more points than data, so loop over data
  203. for i in range(self.n):
  204. diff = scaled_dataset[:, i, newaxis] - scaled_points
  205. energy = sum(diff * diff, axis=0) / 2.0
  206. result += self.weights[i]*exp(-energy)
  207. else:
  208. # loop over points
  209. for i in range(m):
  210. diff = scaled_dataset - scaled_points[:, i, newaxis]
  211. energy = sum(diff * diff, axis=0) / 2.0
  212. result[i] = sum(exp(-energy)*self.weights, axis=0)
  213. result = result * self.n / self._norm_factor
  214. return result
  215. __call__ = evaluate
  216. def integrate_gaussian(self, mean, cov):
  217. """
  218. Multiply estimated density by a multivariate Gaussian and integrate
  219. over the whole space.
  220. Parameters
  221. ----------
  222. mean : aray_like
  223. A 1-D array, specifying the mean of the Gaussian.
  224. cov : array_like
  225. A 2-D array, specifying the covariance matrix of the Gaussian.
  226. Returns
  227. -------
  228. result : scalar
  229. The value of the integral.
  230. Raises
  231. ------
  232. ValueError
  233. If the mean or covariance of the input Gaussian differs from
  234. the KDE's dimensionality.
  235. """
  236. mean = atleast_1d(squeeze(mean))
  237. cov = atleast_2d(cov)
  238. if mean.shape != (self.d,):
  239. raise ValueError("mean does not have dimension %s" % self.d)
  240. if cov.shape != (self.d, self.d):
  241. raise ValueError("covariance does not have dimension %s" % self.d)
  242. # make mean a column vector
  243. mean = mean[:, newaxis]
  244. sum_cov = self.covariance + cov
  245. # This will raise LinAlgError if the new cov matrix is not s.p.d
  246. # cho_factor returns (ndarray, bool) where bool is a flag for whether
  247. # or not ndarray is upper or lower triangular
  248. sum_cov_chol = linalg.cho_factor(sum_cov)
  249. diff = self.dataset - mean
  250. tdiff = linalg.cho_solve(sum_cov_chol, diff)
  251. sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
  252. norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
  253. energies = sum(diff * tdiff, axis=0) / 2.0
  254. result = sum(exp(-energies)*self.weights, axis=0) / norm_const
  255. return result
  256. def integrate_box_1d(self, low, high):
  257. """
  258. Computes the integral of a 1D pdf between two bounds.
  259. Parameters
  260. ----------
  261. low : scalar
  262. Lower bound of integration.
  263. high : scalar
  264. Upper bound of integration.
  265. Returns
  266. -------
  267. value : scalar
  268. The result of the integral.
  269. Raises
  270. ------
  271. ValueError
  272. If the KDE is over more than one dimension.
  273. """
  274. if self.d != 1:
  275. raise ValueError("integrate_box_1d() only handles 1D pdfs")
  276. stdev = ravel(sqrt(self.covariance))[0]
  277. normalized_low = ravel((low - self.dataset) / stdev)
  278. normalized_high = ravel((high - self.dataset) / stdev)
  279. value = np.sum(self.weights*(
  280. special.ndtr(normalized_high) -
  281. special.ndtr(normalized_low)))
  282. return value
  283. def integrate_box(self, low_bounds, high_bounds, maxpts=None):
  284. """Computes the integral of a pdf over a rectangular interval.
  285. Parameters
  286. ----------
  287. low_bounds : array_like
  288. A 1-D array containing the lower bounds of integration.
  289. high_bounds : array_like
  290. A 1-D array containing the upper bounds of integration.
  291. maxpts : int, optional
  292. The maximum number of points to use for integration.
  293. Returns
  294. -------
  295. value : scalar
  296. The result of the integral.
  297. """
  298. if maxpts is not None:
  299. extra_kwds = {'maxpts': maxpts}
  300. else:
  301. extra_kwds = {}
  302. value, inform = mvn.mvnun_weighted(low_bounds, high_bounds,
  303. self.dataset, self.weights,
  304. self.covariance, **extra_kwds)
  305. if inform:
  306. msg = ('An integral in mvn.mvnun requires more points than %s' %
  307. (self.d * 1000))
  308. warnings.warn(msg)
  309. return value
  310. def integrate_kde(self, other):
  311. """
  312. Computes the integral of the product of this kernel density estimate
  313. with another.
  314. Parameters
  315. ----------
  316. other : gaussian_kde instance
  317. The other kde.
  318. Returns
  319. -------
  320. value : scalar
  321. The result of the integral.
  322. Raises
  323. ------
  324. ValueError
  325. If the KDEs have different dimensionality.
  326. """
  327. if other.d != self.d:
  328. raise ValueError("KDEs are not the same dimensionality")
  329. # we want to iterate over the smallest number of points
  330. if other.n < self.n:
  331. small = other
  332. large = self
  333. else:
  334. small = self
  335. large = other
  336. sum_cov = small.covariance + large.covariance
  337. sum_cov_chol = linalg.cho_factor(sum_cov)
  338. result = 0.0
  339. for i in range(small.n):
  340. mean = small.dataset[:, i, newaxis]
  341. diff = large.dataset - mean
  342. tdiff = linalg.cho_solve(sum_cov_chol, diff)
  343. energies = sum(diff * tdiff, axis=0) / 2.0
  344. result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
  345. sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
  346. norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
  347. result /= norm_const
  348. return result
  349. def resample(self, size=None):
  350. """
  351. Randomly sample a dataset from the estimated pdf.
  352. Parameters
  353. ----------
  354. size : int, optional
  355. The number of samples to draw. If not provided, then the size is
  356. the same as the effective number of samples in the underlying
  357. dataset.
  358. Returns
  359. -------
  360. resample : (self.d, `size`) ndarray
  361. The sampled dataset.
  362. """
  363. if size is None:
  364. size = int(self.neff)
  365. norm = transpose(multivariate_normal(zeros((self.d,), float),
  366. self.covariance, size=size))
  367. indices = choice(self.n, size=size, p=self.weights)
  368. means = self.dataset[:, indices]
  369. return means + norm
  370. def scotts_factor(self):
  371. return power(self.neff, -1./(self.d+4))
  372. def silverman_factor(self):
  373. return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
  374. # Default method to calculate bandwidth, can be overwritten by subclass
  375. covariance_factor = scotts_factor
  376. covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
  377. multiplies the data covariance matrix to obtain the kernel covariance
  378. matrix. The default is `scotts_factor`. A subclass can overwrite this
  379. method to provide a different method, or set it through a call to
  380. `kde.set_bandwidth`."""
  381. def set_bandwidth(self, bw_method=None):
  382. """Compute the estimator bandwidth with given method.
  383. The new bandwidth calculated after a call to `set_bandwidth` is used
  384. for subsequent evaluations of the estimated density.
  385. Parameters
  386. ----------
  387. bw_method : str, scalar or callable, optional
  388. The method used to calculate the estimator bandwidth. This can be
  389. 'scott', 'silverman', a scalar constant or a callable. If a
  390. scalar, this will be used directly as `kde.factor`. If a callable,
  391. it should take a `gaussian_kde` instance as only parameter and
  392. return a scalar. If None (default), nothing happens; the current
  393. `kde.covariance_factor` method is kept.
  394. Notes
  395. -----
  396. .. versionadded:: 0.11
  397. Examples
  398. --------
  399. >>> import scipy.stats as stats
  400. >>> x1 = np.array([-7, -5, 1, 4, 5.])
  401. >>> kde = stats.gaussian_kde(x1)
  402. >>> xs = np.linspace(-10, 10, num=50)
  403. >>> y1 = kde(xs)
  404. >>> kde.set_bandwidth(bw_method='silverman')
  405. >>> y2 = kde(xs)
  406. >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
  407. >>> y3 = kde(xs)
  408. >>> import matplotlib.pyplot as plt
  409. >>> fig, ax = plt.subplots()
  410. >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
  411. ... label='Data points (rescaled)')
  412. >>> ax.plot(xs, y1, label='Scott (default)')
  413. >>> ax.plot(xs, y2, label='Silverman')
  414. >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
  415. >>> ax.legend()
  416. >>> plt.show()
  417. """
  418. if bw_method is None:
  419. pass
  420. elif bw_method == 'scott':
  421. self.covariance_factor = self.scotts_factor
  422. elif bw_method == 'silverman':
  423. self.covariance_factor = self.silverman_factor
  424. elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
  425. self._bw_method = 'use constant'
  426. self.covariance_factor = lambda: bw_method
  427. elif callable(bw_method):
  428. self._bw_method = bw_method
  429. self.covariance_factor = lambda: self._bw_method(self)
  430. else:
  431. msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
  432. "or a callable."
  433. raise ValueError(msg)
  434. self._compute_covariance()
  435. def _compute_covariance(self):
  436. """Computes the covariance matrix for each Gaussian kernel using
  437. covariance_factor().
  438. """
  439. self.factor = self.covariance_factor()
  440. # Cache covariance and inverse covariance of the data
  441. if not hasattr(self, '_data_inv_cov'):
  442. self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
  443. bias=False,
  444. aweights=self.weights))
  445. self._data_inv_cov = linalg.inv(self._data_covariance)
  446. self.covariance = self._data_covariance * self.factor**2
  447. self.inv_cov = self._data_inv_cov / self.factor**2
  448. self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n
  449. def pdf(self, x):
  450. """
  451. Evaluate the estimated pdf on a provided set of points.
  452. Notes
  453. -----
  454. This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
  455. docstring for more details.
  456. """
  457. return self.evaluate(x)
  458. def logpdf(self, x):
  459. """
  460. Evaluate the log of the estimated pdf on a provided set of points.
  461. """
  462. points = atleast_2d(x)
  463. d, m = points.shape
  464. if d != self.d:
  465. if d == 1 and m == self.d:
  466. # points was passed in as a row vector
  467. points = reshape(points, (self.d, 1))
  468. m = 1
  469. else:
  470. msg = "points have dimension %s, dataset has dimension %s" % (d,
  471. self.d)
  472. raise ValueError(msg)
  473. result = zeros((m,), dtype=float)
  474. if m >= self.n:
  475. # there are more points than data, so loop over data
  476. energy = zeros((self.n, m), dtype=float)
  477. for i in range(self.n):
  478. diff = self.dataset[:, i, newaxis] - points
  479. tdiff = dot(self.inv_cov, diff)
  480. energy[i] = sum(diff*tdiff, axis=0) / 2.0
  481. result = logsumexp(-energy,
  482. b=self.weights[i]*self.n/self._norm_factor,
  483. axis=0)
  484. else:
  485. # loop over points
  486. for i in range(m):
  487. diff = self.dataset - points[:, i, newaxis]
  488. tdiff = dot(self.inv_cov, diff)
  489. energy = sum(diff * tdiff, axis=0) / 2.0
  490. result[i] = logsumexp(-energy,
  491. b=self.weights*self.n/self._norm_factor)
  492. return result
  493. @property
  494. def weights(self):
  495. try:
  496. return self._weights
  497. except AttributeError:
  498. self._weights = ones(self.n)/self.n
  499. return self._weights
  500. @property
  501. def neff(self):
  502. try:
  503. return self._neff
  504. except AttributeError:
  505. self._neff = 1/sum(self.weights**2)
  506. return self._neff