from __future__ import division, print_function, absolute_import

from scipy import stats
import numpy as np
from numpy.testing import (assert_almost_equal, assert_,
                           assert_array_almost_equal,
                           assert_array_almost_equal_nulp, assert_allclose)
import pytest
from pytest import raises as assert_raises


def test_kde_1d():
    # Some basic tests comparing to the normal distribution.
    np.random.seed(8765678)
    n_basesample = 500
    xn = np.random.randn(n_basesample)
    xnmean = xn.mean()
    xnstd = xn.std(ddof=1)

    # Get the KDE for the original sample.
    gkde = stats.gaussian_kde(xn)

    # Evaluate the density function of the KDE on a grid of points.
    xs = np.linspace(-7, 7, 501)
    kdepdf = gkde.evaluate(xs)
    normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
    interval = xs[1] - xs[0]

    assert_(np.sum((kdepdf - normpdf)**2) * interval < 0.01)
    prob1 = gkde.integrate_box_1d(xnmean, np.inf)
    prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
    assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)

    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum() * interval, decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
                        (kdepdf * normpdf).sum() * interval, decimal=2)
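

def test_kde_1d_matches_explicit_sum():
    # Added sketch (not part of the original suite; the test name and
    # tolerance are my own): in 1-D, gaussian_kde should agree with the
    # textbook mixture form
    #     f(x) = (1/n) * sum_i N(x; x_i, h**2),
    # where the fitted kernel variance h**2 is exposed as gkde.covariance.
    np.random.seed(8765678)
    xn = np.random.randn(50)
    gkde = stats.gaussian_kde(xn)
    h = np.sqrt(gkde.covariance[0, 0])  # kernel standard deviation
    xs = np.linspace(-5, 5, 101)
    manual = np.mean([stats.norm.pdf(xs, loc=xi, scale=h) for xi in xn],
                     axis=0)
    assert_allclose(gkde.evaluate(xs), manual, rtol=1e-10)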


def test_kde_1d_weighted():
    # Some basic tests comparing to the normal distribution.
    np.random.seed(8765678)
    n_basesample = 500
    xn = np.random.randn(n_basesample)
    wn = np.random.rand(n_basesample)
    xnmean = np.average(xn, weights=wn)
    xnstd = np.sqrt(np.average((xn - xnmean)**2, weights=wn))

    # Get the weighted KDE for the original sample.
    gkde = stats.gaussian_kde(xn, weights=wn)

    # Evaluate the density function of the KDE on a grid of points.
    xs = np.linspace(-7, 7, 501)
    kdepdf = gkde.evaluate(xs)
    normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
    interval = xs[1] - xs[0]

    assert_(np.sum((kdepdf - normpdf)**2) * interval < 0.01)
    prob1 = gkde.integrate_box_1d(xnmean, np.inf)
    prob2 = gkde.integrate_box_1d(-np.inf, xnmean)
    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_box(xnmean, np.inf), prob1, decimal=13)
    assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), prob2, decimal=13)

    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum() * interval, decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
                        (kdepdf * normpdf).sum() * interval, decimal=2)
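

def test_kde_1d_weighted_matches_explicit_sum():
    # Added sketch (not part of the original suite): with weights, the
    # density should be the weighted mixture
    #     f(x) = sum_i w_i * N(x; x_i, h**2) / sum_i w_i,
    # again with h**2 taken from gkde.covariance.
    np.random.seed(8765678)
    xn = np.random.randn(50)
    wn = np.random.rand(50)
    gkde = stats.gaussian_kde(xn, weights=wn)
    h = np.sqrt(gkde.covariance[0, 0])
    xs = np.linspace(-5, 5, 101)
    w = wn / wn.sum()
    manual = np.sum([wi * stats.norm.pdf(xs, loc=xi, scale=h)
                     for xi, wi in zip(xn, w)], axis=0)
    assert_allclose(gkde.evaluate(xs), manual, rtol=1e-10)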


@pytest.mark.slow
def test_kde_2d():
    # Some basic tests comparing to the normal distribution.
    np.random.seed(8765678)
    n_basesample = 500

    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])

    # The KDE expects shape (d, n), so transpose to (2, 500).
    xn = np.random.multivariate_normal(mean, covariance, size=n_basesample).T

    # Get the KDE for the original sample.
    gkde = stats.gaussian_kde(xn)

    # Evaluate the density function of the KDE on a grid of points.
    x, y = np.mgrid[-7:7:500j, -7:7:500j]
    grid_coords = np.vstack([x.ravel(), y.ravel()])
    kdepdf = gkde.evaluate(grid_coords)
    kdepdf = kdepdf.reshape(500, 500)

    normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]),
                                            mean=mean, cov=covariance)
    interval = y.ravel()[1] - y.ravel()[0]

    assert_(np.sum((kdepdf - normpdf)**2) * (interval**2) < 0.01)

    small = -1e100
    large = 1e100
    prob1 = gkde.integrate_box([small, mean[1]], [large, large])
    prob2 = gkde.integrate_box([small, small], [large, mean[1]])

    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum() * (interval**2), decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
                        (kdepdf * normpdf).sum() * (interval**2), decimal=2)
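

@pytest.mark.slow
def test_kde_2d_normalization():
    # Added sketch (not part of the original suite): a KDE is a proper
    # density, so a Riemann sum over a grid wide enough to capture
    # essentially all of the mass should be close to 1. The grid bounds and
    # tolerance here are ad hoc choices.
    np.random.seed(8765678)
    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
    xn = np.random.multivariate_normal(mean, covariance, size=500).T
    gkde = stats.gaussian_kde(xn)

    xg = np.linspace(-20, 22, 300)
    yg = np.linspace(-20, 26, 300)
    x, y = np.meshgrid(xg, yg)
    vals = gkde.evaluate(np.vstack([x.ravel(), y.ravel()]))
    dx, dy = xg[1] - xg[0], yg[1] - yg[0]
    assert_almost_equal(vals.sum() * dx * dy, 1.0, decimal=2)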


@pytest.mark.slow
def test_kde_2d_weighted():
    # Some basic tests comparing to the normal distribution.
    np.random.seed(8765678)
    n_basesample = 500

    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])

    # The KDE expects shape (d, n), so transpose to (2, 500).
    xn = np.random.multivariate_normal(mean, covariance, size=n_basesample).T
    wn = np.random.rand(n_basesample)

    # Get the weighted KDE for the original sample.
    gkde = stats.gaussian_kde(xn, weights=wn)

    # Evaluate the density function of the KDE on a grid of points.
    x, y = np.mgrid[-7:7:500j, -7:7:500j]
    grid_coords = np.vstack([x.ravel(), y.ravel()])
    kdepdf = gkde.evaluate(grid_coords)
    kdepdf = kdepdf.reshape(500, 500)

    normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]),
                                            mean=mean, cov=covariance)
    interval = y.ravel()[1] - y.ravel()[0]

    assert_(np.sum((kdepdf - normpdf)**2) * (interval**2) < 0.01)

    small = -1e100
    large = 1e100
    prob1 = gkde.integrate_box([small, mean[1]], [large, large])
    prob2 = gkde.integrate_box([small, small], [large, mean[1]])

    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum() * (interval**2), decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
                        (kdepdf * normpdf).sum() * (interval**2), decimal=2)
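

def test_kde_2d_integrate_box_1d_rejected():
    # Added sketch: integrate_box_1d is documented for one-dimensional
    # estimates only, so it is expected to raise ValueError on a
    # multivariate KDE. Assumed behavior based on the scipy documentation;
    # not part of the original suite.
    np.random.seed(8765678)
    xn = np.random.multivariate_normal([0.0, 0.0], np.eye(2), size=100).T
    gkde = stats.gaussian_kde(xn)
    assert_raises(ValueError, gkde.integrate_box_1d, -1.0, 1.0)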


def test_kde_bandwidth_method():

    def scotts_factor(kde_obj):
        """Same as default, just check that it works."""
        return np.power(kde_obj.n, -1. / (kde_obj.d + 4))

    np.random.seed(8765678)
    n_basesample = 50
    xn = np.random.randn(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn)
    # Supply a callable
    gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
    # Supply a scalar
    gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)

    xs = np.linspace(-7, 7, 51)
    kdepdf = gkde.evaluate(xs)
    kdepdf2 = gkde2.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf2)
    kdepdf3 = gkde3.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf3)

    assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
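

def test_kde_bandwidth_factors():
    # Added sketch (not part of the original suite): spot-check the
    # documented bandwidth formulas against the fitted factor. For d
    # dimensions and n points, Scott's rule gives n**(-1/(d+4)) and
    # Silverman's rule gives (n*(d+2)/4)**(-1/(d+4)).
    np.random.seed(8765678)
    n = 50
    xn = np.random.randn(n)

    gkde = stats.gaussian_kde(xn)  # Scott's rule is the default
    assert_almost_equal(gkde.factor, n**(-1. / 5), decimal=13)

    gkde_s = stats.gaussian_kde(xn, bw_method='silverman')
    assert_almost_equal(gkde_s.factor, (n * 3 / 4.)**(-1. / 5), decimal=13)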


def test_kde_bandwidth_method_weighted():

    def scotts_factor(kde_obj):
        """Same as default, just check that it works."""
        return np.power(kde_obj.neff, -1. / (kde_obj.d + 4))

    np.random.seed(8765678)
    n_basesample = 50
    xn = np.random.randn(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn)
    # Supply a callable
    gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor)
    # Supply a scalar
    gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor)

    xs = np.linspace(-7, 7, 51)
    kdepdf = gkde.evaluate(xs)
    kdepdf2 = gkde2.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf2)
    kdepdf3 = gkde3.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf3)

    assert_raises(ValueError, stats.gaussian_kde, xn, bw_method='wrongstring')
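

def test_kde_weighted_neff():
    # Added sketch (not part of the original suite): for weighted samples,
    # the effective sample size used by the bandwidth rules is Kish's
    # formula, neff = 1 / sum(w_i**2), with the weights normalized to sum
    # to one.
    np.random.seed(8765678)
    xn = np.random.randn(50)
    wn = np.random.rand(50)
    gkde = stats.gaussian_kde(xn, weights=wn)
    w = wn / wn.sum()
    assert_almost_equal(gkde.neff, 1.0 / np.sum(w**2), decimal=12)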


# Subclasses that should stay working (extracted from various sources).
# Unfortunately the earlier design of gaussian_kde made it necessary for
# users to create these kinds of subclasses, or to call
# _compute_covariance() directly.

class _kde_subclass1(stats.gaussian_kde):
    def __init__(self, dataset):
        self.dataset = np.atleast_2d(dataset)
        self.d, self.n = self.dataset.shape
        self.covariance_factor = self.scotts_factor
        self._compute_covariance()


class _kde_subclass2(stats.gaussian_kde):
    def __init__(self, dataset):
        self.covariance_factor = self.scotts_factor
        super(_kde_subclass2, self).__init__(dataset)


class _kde_subclass3(stats.gaussian_kde):
    def __init__(self, dataset, covariance):
        self.covariance = covariance
        stats.gaussian_kde.__init__(self, dataset)

    def _compute_covariance(self):
        self.inv_cov = np.linalg.inv(self.covariance)
        self._norm_factor = (np.sqrt(np.linalg.det(2 * np.pi
                                                   * self.covariance))
                             * self.n)


class _kde_subclass4(stats.gaussian_kde):
    def covariance_factor(self):
        return 0.5 * self.silverman_factor()
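

# A callable bw_method reproduces _kde_subclass4 without subclassing: the
# callable receives the gaussian_kde instance, as in the bandwidth tests
# above. This sketch is an addition, not part of the original suite.
def test_kde_subclass4_via_bw_method():
    x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
    kde4 = _kde_subclass4(x1)
    kde_bw = stats.gaussian_kde(
        x1, bw_method=lambda obj: 0.5 * obj.silverman_factor())
    assert_array_almost_equal(kde4(x1), kde_bw(x1), decimal=12)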


def test_gaussian_kde_subclassing():
    x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
    xs = np.linspace(-10, 10, num=50)

    # gaussian_kde itself
    kde = stats.gaussian_kde(x1)
    ys = kde(xs)

    # subclass 1
    kde1 = _kde_subclass1(x1)
    y1 = kde1(xs)
    assert_array_almost_equal_nulp(ys, y1, nulp=10)

    # subclass 2
    kde2 = _kde_subclass2(x1)
    y2 = kde2(xs)
    assert_array_almost_equal_nulp(ys, y2, nulp=10)

    # subclass 3
    kde3 = _kde_subclass3(x1, kde.covariance)
    y3 = kde3(xs)
    assert_array_almost_equal_nulp(ys, y3, nulp=10)

    # subclass 4, evaluated at the data points themselves
    kde4 = _kde_subclass4(x1)
    y4 = kde4(x1)
    y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
    assert_array_almost_equal(y_expected, y4, decimal=6)

    # Not a subclass, but check for use of _compute_covariance()
    kde5 = kde
    kde5.covariance_factor = lambda: kde.factor
    kde5._compute_covariance()
    y5 = kde5(xs)
    assert_array_almost_equal_nulp(ys, y5, nulp=10)


def test_gaussian_kde_covariance_caching():
    x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
    xs = np.linspace(-10, 10, num=5)
    # These expected values are from scipy 0.10, before some changes to
    # gaussian_kde. They were not compared with any external reference.
    y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]

    # Set the bandwidth, then reset it to the default.
    kde = stats.gaussian_kde(x1)
    kde.set_bandwidth(bw_method=0.5)
    kde.set_bandwidth(bw_method='scott')
    y2 = kde(xs)

    assert_array_almost_equal(y_expected, y2, decimal=7)
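

def test_gaussian_kde_set_bandwidth_scalar():
    # Added sketch (not part of the original suite): a scalar bw_method is
    # used directly as the multiplicative factor, and set_bandwidth('scott')
    # restores the default rule.
    x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
    kde = stats.gaussian_kde(x1)
    default_factor = kde.factor

    kde.set_bandwidth(bw_method=0.5)
    assert_almost_equal(kde.factor, 0.5, decimal=14)

    kde.set_bandwidth(bw_method='scott')
    assert_almost_equal(kde.factor, default_factor, decimal=14)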


def test_gaussian_kde_monkeypatch():
    """Ugly, but people may rely on this. See scipy pull request 123,
    specifically the linked ML thread "Width of the Gaussian in stats.kde".
    If it becomes necessary to break this later on, that should be discussed
    on the mailing list first.
    """
    x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
    xs = np.linspace(-10, 10, num=50)

    # The old monkeypatched version to get at Silverman's rule.
    kde = stats.gaussian_kde(x1)
    kde.covariance_factor = kde.silverman_factor
    kde._compute_covariance()
    y1 = kde(xs)

    # The new saner version.
    kde2 = stats.gaussian_kde(x1, bw_method='silverman')
    y2 = kde2(xs)

    assert_array_almost_equal_nulp(y1, y2, nulp=10)


def test_kde_integer_input():
    """Regression test for #1181."""
    x1 = np.arange(5)
    kde = stats.gaussian_kde(x1)
    y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
    assert_array_almost_equal(kde(x1), y_expected, decimal=6)
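

def test_kde_integer_input_matches_float():
    # Added sketch: integer data should give the same estimate as the same
    # data cast to float. This is an assumption consistent with the
    # regression test above, not a documented guarantee.
    x1 = np.arange(5)
    kde_i = stats.gaussian_kde(x1)
    kde_f = stats.gaussian_kde(x1.astype(float))
    assert_allclose(kde_i(x1), kde_f(x1.astype(float)), rtol=1e-14)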


def test_pdf_logpdf():
    np.random.seed(1)
    n_basesample = 50
    xn = np.random.randn(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn)

    xs = np.linspace(-15, 12, 25)
    pdf = gkde.evaluate(xs)
    pdf2 = gkde.pdf(xs)
    assert_almost_equal(pdf, pdf2, decimal=12)

    logpdf = np.log(pdf)
    logpdf2 = gkde.logpdf(xs)
    assert_almost_equal(logpdf, logpdf2, decimal=12)

    # More evaluation points than data points; exercises the other code path.
    gkde = stats.gaussian_kde(xs)
    logpdf = np.log(gkde.evaluate(xn))
    logpdf2 = gkde.logpdf(xn)
    assert_almost_equal(logpdf, logpdf2, decimal=12)
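

def test_logpdf_far_tail():
    # Added sketch (not part of the original suite): logpdf is expected to
    # stay finite far in the tails where pdf underflows to zero and
    # np.log(pdf) would be -inf, because the log density is accumulated in
    # log space (a logsumexp-style evaluation is assumed here; the exact
    # point where pdf underflows is implementation-dependent).
    np.random.seed(1)
    gkde = stats.gaussian_kde(np.random.randn(50))
    far = np.array([60.0])
    assert_(gkde.pdf(far)[0] == 0.0)           # underflows in linear space
    assert_(np.isfinite(gkde.logpdf(far)[0]))  # but log density is finite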


def test_pdf_logpdf_weighted():
    np.random.seed(1)
    n_basesample = 50
    xn = np.random.randn(n_basesample)
    wn = np.random.rand(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn, weights=wn)

    xs = np.linspace(-15, 12, 25)
    pdf = gkde.evaluate(xs)
    pdf2 = gkde.pdf(xs)
    assert_almost_equal(pdf, pdf2, decimal=12)

    logpdf = np.log(pdf)
    logpdf2 = gkde.logpdf(xs)
    assert_almost_equal(logpdf, logpdf2, decimal=12)

    # More evaluation points than data points; exercises the other code path.
    gkde = stats.gaussian_kde(xs)
    logpdf = np.log(gkde.evaluate(xn))
    logpdf2 = gkde.logpdf(xn)
    assert_almost_equal(logpdf, logpdf2, decimal=12)


def test_weights_intact():
    # Regression test for gh-9709: weights are not modified in place.
    np.random.seed(12345)
    vals = np.random.lognormal(size=100)
    weights = np.random.choice([1.0, 10.0, 100], size=vals.size)
    orig_weights = weights.copy()

    stats.gaussian_kde(np.log10(vals), weights=weights)
    assert_allclose(weights, orig_weights, atol=1e-14, rtol=1e-14)


def test_weights_integer():
    # Integer weights are OK, cf. gh-9709 (comment).
    np.random.seed(12345)
    values = [0.2, 13.5, 21.0, 75.0, 99.0]
    weights = [1, 2, 4, 8, 16]  # a list of integers
    pdf_i = stats.gaussian_kde(values, weights=weights)
    pdf_f = stats.gaussian_kde(values, weights=np.array(weights, dtype=float))

    xn = [0.3, 11, 88]
    assert_allclose(pdf_i.evaluate(xn),
                    pdf_f.evaluate(xn), atol=1e-14, rtol=1e-14)
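

def test_weights_scaling_invariant():
    # Added sketch (not part of the original suite): weights are normalized
    # internally, so rescaling all of them by a common factor should leave
    # the estimate unchanged.
    values = [0.2, 13.5, 21.0, 75.0, 99.0]
    weights = np.array([1., 2., 4., 8., 16.])
    pdf_a = stats.gaussian_kde(values, weights=weights)
    pdf_b = stats.gaussian_kde(values, weights=1000 * weights)

    xn = [0.3, 11, 88]
    assert_allclose(pdf_a.evaluate(xn), pdf_b.evaluate(xn),
                    atol=1e-14, rtol=1e-14)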