test_distance.py 78 KB


  1. #
  2. # Author: Damian Eads
  3. # Date: April 17, 2008
  4. #
  5. # Copyright (C) 2008 Damian Eads
  6. #
  7. # Redistribution and use in source and binary forms, with or without
  8. # modification, are permitted provided that the following conditions
  9. # are met:
  10. #
  11. # 1. Redistributions of source code must retain the above copyright
  12. # notice, this list of conditions and the following disclaimer.
  13. #
  14. # 2. Redistributions in binary form must reproduce the above
  15. # copyright notice, this list of conditions and the following
  16. # disclaimer in the documentation and/or other materials provided
  17. # with the distribution.
  18. #
  19. # 3. The name of the author may not be used to endorse or promote
  20. # products derived from this software without specific prior
  21. # written permission.
  22. #
  23. # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  24. # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  25. # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  27. # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  29. # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  30. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  31. # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  32. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  33. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  34. from __future__ import division, print_function, absolute_import
  35. import os.path
  36. from functools import wraps, partial
  37. from scipy._lib.six import xrange, u
  38. import numpy as np
  39. import warnings
  40. from numpy.linalg import norm
  41. from numpy.testing import (verbose, assert_,
  42. assert_array_equal, assert_equal,
  43. assert_almost_equal, assert_allclose)
  44. import pytest
  45. from pytest import raises as assert_raises
  46. from scipy._lib._numpy_compat import suppress_warnings
  47. from scipy.spatial.distance import (squareform, pdist, cdist, num_obs_y,
  48. num_obs_dm, is_valid_dm, is_valid_y,
  49. _validate_vector, _METRICS_NAMES)
  50. # these were missing: chebyshev cityblock kulsinski
  51. from scipy.spatial.distance import (braycurtis, canberra, chebyshev, cityblock,
  52. correlation, cosine, dice, euclidean,
  53. hamming, jaccard, jensenshannon,
  54. kulsinski, mahalanobis, matching,
  55. minkowski, rogerstanimoto, russellrao,
  56. seuclidean, sokalmichener, sokalsneath,
  57. sqeuclidean, yule)
  58. from scipy.spatial.distance import wminkowski as old_wminkowski
# Names of the fixture files that load_testing_files() reads from the
# ``data`` directory located next to this module.
_filenames = [
    "cdist-X1.txt",
    "cdist-X2.txt",
    "iris.txt",
    "pdist-boolean-inp.txt",
    "pdist-chebyshev-ml-iris.txt",
    "pdist-chebyshev-ml.txt",
    "pdist-cityblock-ml-iris.txt",
    "pdist-cityblock-ml.txt",
    "pdist-correlation-ml-iris.txt",
    "pdist-correlation-ml.txt",
    "pdist-cosine-ml-iris.txt",
    "pdist-cosine-ml.txt",
    "pdist-double-inp.txt",
    "pdist-euclidean-ml-iris.txt",
    "pdist-euclidean-ml.txt",
    "pdist-hamming-ml.txt",
    "pdist-jaccard-ml.txt",
    "pdist-jensenshannon-ml-iris.txt",
    "pdist-jensenshannon-ml.txt",
    "pdist-minkowski-3.2-ml-iris.txt",
    "pdist-minkowski-3.2-ml.txt",
    "pdist-minkowski-5.8-ml-iris.txt",
    "pdist-seuclidean-ml-iris.txt",
    "pdist-seuclidean-ml.txt",
    "pdist-spearman-ml.txt",
    "random-bool-data.txt",
    "random-double-data.txt",
    "random-int-data.txt",
    "random-uint-data.txt",
]

# A 6x6 symmetric matrix with a zero diagonal (double precision) ...
_tdist = np.array([[0, 662, 877, 255, 412, 996],
                   [662, 0, 295, 468, 268, 400],
                   [877, 295, 0, 754, 564, 138],
                   [255, 468, 754, 0, 219, 869],
                   [412, 268, 564, 219, 0, 669],
                   [996, 400, 138, 869, 669, 0]], dtype='double')

# ... and the result of passing that matrix through squareform().
_ytdist = squareform(_tdist)
  97. # A hashmap of expected output arrays for the tests. These arrays
  98. # come from a list of text files, which are read prior to testing.
  99. # Each test loads inputs and outputs from this dictionary.
  100. eo = {}
  101. def load_testing_files():
  102. for fn in _filenames:
  103. name = fn.replace(".txt", "").replace("-ml", "")
  104. fqfn = os.path.join(os.path.dirname(__file__), 'data', fn)
  105. fp = open(fqfn)
  106. eo[name] = np.loadtxt(fp)
  107. fp.close()
  108. eo['pdist-boolean-inp'] = np.bool_(eo['pdist-boolean-inp'])
  109. eo['random-bool-data'] = np.bool_(eo['random-bool-data'])
  110. eo['random-float32-data'] = np.float32(eo['random-double-data'])
  111. eo['random-int-data'] = np.int_(eo['random-int-data'])
  112. eo['random-uint-data'] = np.uint(eo['random-uint-data'])
  113. load_testing_files()
  114. def _chk_asarrays(arrays, axis=None):
  115. arrays = [np.asanyarray(a) for a in arrays]
  116. if axis is None:
  117. # np < 1.10 ravel removes subclass from arrays
  118. arrays = [np.ravel(a) if a.ndim != 1 else a
  119. for a in arrays]
  120. axis = 0
  121. arrays = tuple(np.atleast_1d(a) for a in arrays)
  122. if axis < 0:
  123. if not all(a.ndim == arrays[0].ndim for a in arrays):
  124. raise ValueError("array ndim must be the same for neg axis")
  125. axis = range(arrays[0].ndim)[axis]
  126. return arrays + (axis,)
def _chk_weights(arrays, weights=None, axis=None,
                 force_weights=False, simplify_weights=True,
                 pos_only=False, neg_check=False,
                 nan_screen=False, mask_screen=False,
                 ddof=None):
    """Normalise ``arrays``, ``weights`` and ``axis`` for a weighted call.

    Parameters
    ----------
    arrays : sequence of array_like
        Data arrays; converted and flattened as done by ``_chk_asarrays``.
    weights : array_like or None
        Observation weights along ``axis``.
    axis : int or None
        Observation axis, resolved by ``_chk_asarrays``.
    force_weights : bool
        Build a vector of ones when ``weights`` is None; this also disables
        ``simplify_weights``.
    simplify_weights : bool
        Replace weights that are all equal to 1 with None.
    pos_only : bool
        Drop observations whose weight is not strictly positive.
    neg_check : bool
        Raise if any weight is negative.
    nan_screen : bool
        Mask invalid entries in arrays whose sum is NaN and force weights.
    mask_screen : bool
        Adjust the weights with ``_weight_masked`` for masked entries.
    ddof : object
        When truthy, the weights are passed through ``_freq_weights``.

    Returns
    -------
    tuple
        The processed arrays followed by ``weights`` and ``axis``.

    Raises
    ------
    ValueError
        If the weights' shape does not match every array along ``axis``, or
        if ``neg_check`` is set and a weight is negative.
    """
    chked = _chk_asarrays(arrays, axis=axis)
    arrays, axis = chked[:-1], chked[-1]

    simplify_weights = simplify_weights and not force_weights
    if not force_weights and mask_screen:
        # Any array carrying a mask means weights are needed to express it.
        force_weights = any(np.ma.getmask(a) is not np.ma.nomask for a in arrays)

    if nan_screen:
        # A NaN anywhere in an array makes its sum NaN.
        has_nans = [np.isnan(np.sum(a)) for a in arrays]
        if any(has_nans):
            mask_screen = True
            force_weights = True
            arrays = tuple(np.ma.masked_invalid(a) if has_nan else a
                           for a, has_nan in zip(arrays, has_nans))

    if weights is not None:
        weights = np.asanyarray(weights)
    elif force_weights:
        weights = np.ones(arrays[0].shape[axis])
    else:
        # No weights given and none required: nothing more to validate.
        return arrays + (weights, axis)

    if ddof:
        weights = _freq_weights(weights)

    if mask_screen:
        weights = _weight_masked(arrays, weights, axis)

    if not all(weights.shape == (a.shape[axis],) for a in arrays):
        raise ValueError("weights shape must match arrays along axis")
    if neg_check and (weights < 0).any():
        raise ValueError("weights cannot be negative")

    if pos_only:
        pos_weights = np.nonzero(weights > 0)[0]
        if pos_weights.size < weights.size:
            arrays = tuple(np.take(a, pos_weights, axis=axis) for a in arrays)
            weights = weights[pos_weights]
    if simplify_weights and (weights == 1).all():
        weights = None
    return arrays + (weights, axis)
  166. def _freq_weights(weights):
  167. if weights is None:
  168. return weights
  169. int_weights = weights.astype(int)
  170. if (weights != int_weights).any():
  171. raise ValueError("frequency (integer count-type) weights required %s" % weights)
  172. return int_weights
  173. def _weight_masked(arrays, weights, axis):
  174. if axis is None:
  175. axis = 0
  176. weights = np.asanyarray(weights)
  177. for a in arrays:
  178. axis_mask = np.ma.getmask(a)
  179. if axis_mask is np.ma.nomask:
  180. continue
  181. if a.ndim > 1:
  182. not_axes = tuple(i for i in range(a.ndim) if i != axis)
  183. axis_mask = axis_mask.any(axis=not_axes)
  184. weights *= 1 - axis_mask.astype(int)
  185. return weights
  186. def within_tol(a, b, tol):
  187. return np.abs(a - b).max() < tol
  188. def _assert_within_tol(a, b, atol=0, rtol=0, verbose_=False):
  189. if verbose_:
  190. print(np.abs(a - b).max())
  191. assert_allclose(a, b, rtol=rtol, atol=atol)
  192. def _rand_split(arrays, weights, axis, split_per, seed=None):
  193. # inverse operation for stats.collapse_weights
  194. weights = np.array(weights, dtype=np.float64) # modified inplace; need a copy
  195. seeded_rand = np.random.RandomState(seed)
  196. def mytake(a, ix, axis):
  197. record = np.asanyarray(np.take(a, ix, axis=axis))
  198. return record.reshape([a.shape[i] if i != axis else 1
  199. for i in range(a.ndim)])
  200. n_obs = arrays[0].shape[axis]
  201. assert all(a.shape[axis] == n_obs for a in arrays), "data must be aligned on sample axis"
  202. for i in range(int(split_per) * n_obs):
  203. split_ix = seeded_rand.randint(n_obs + i)
  204. prev_w = weights[split_ix]
  205. q = seeded_rand.rand()
  206. weights[split_ix] = q * prev_w
  207. weights = np.append(weights, (1. - q) * prev_w)
  208. arrays = [np.append(a, mytake(a, split_ix, axis=axis),
  209. axis=axis) for a in arrays]
  210. return arrays, weights
  211. def _rough_check(a, b, compare_assert=partial(assert_allclose, atol=1e-5),
  212. key=lambda x: x, w=None):
  213. check_a = key(a)
  214. check_b = key(b)
  215. try:
  216. if np.array(check_a != check_b).any(): # try strict equality for string types
  217. compare_assert(check_a, check_b)
  218. except AttributeError: # masked array
  219. compare_assert(check_a, check_b)
  220. except (TypeError, ValueError): # nested data structure
  221. for a_i, b_i in zip(check_a, check_b):
  222. _rough_check(a_i, b_i, compare_assert=compare_assert)
  223. # diff from test_stats:
  224. # n_args=2, weight_arg='w', default_axis=None
  225. # ma_safe = False, nan_safe = False
  226. def _weight_checked(fn, n_args=2, default_axis=None, key=lambda x: x, weight_arg='w',
  227. squeeze=True, silent=False,
  228. ones_test=True, const_test=True, dup_test=True,
  229. split_test=True, dud_test=True, ma_safe=False, ma_very_safe=False, nan_safe=False,
  230. split_per=1.0, seed=0, compare_assert=partial(assert_allclose, atol=1e-5)):
  231. """runs fn on its arguments 2 or 3 ways, checks that the results are the same,
  232. then returns the same thing it would have returned before"""
  233. @wraps(fn)
  234. def wrapped(*args, **kwargs):
  235. result = fn(*args, **kwargs)
  236. arrays = args[:n_args]
  237. rest = args[n_args:]
  238. weights = kwargs.get(weight_arg, None)
  239. axis = kwargs.get('axis', default_axis)
  240. chked = _chk_weights(arrays, weights=weights, axis=axis, force_weights=True, mask_screen=True)
  241. arrays, weights, axis = chked[:-2], chked[-2], chked[-1]
  242. if squeeze:
  243. arrays = [np.atleast_1d(a.squeeze()) for a in arrays]
  244. try:
  245. # WEIGHTS CHECK 1: EQUAL WEIGHTED OBESERVATIONS
  246. args = tuple(arrays) + rest
  247. if ones_test:
  248. kwargs[weight_arg] = weights
  249. _rough_check(result, fn(*args, **kwargs), key=key)
  250. if const_test:
  251. kwargs[weight_arg] = weights * 101.0
  252. _rough_check(result, fn(*args, **kwargs), key=key)
  253. kwargs[weight_arg] = weights * 0.101
  254. try:
  255. _rough_check(result, fn(*args, **kwargs), key=key)
  256. except Exception as e:
  257. raise type(e)((e, arrays, weights))
  258. # WEIGHTS CHECK 2: ADDL 0-WEIGHTED OBS
  259. if dud_test:
  260. # add randomly resampled rows, weighted at 0
  261. dud_arrays, dud_weights = _rand_split(arrays, weights, axis, split_per=split_per, seed=seed)
  262. dud_weights[:weights.size] = weights # not exactly 1 because of masked arrays
  263. dud_weights[weights.size:] = 0
  264. dud_args = tuple(dud_arrays) + rest
  265. kwargs[weight_arg] = dud_weights
  266. _rough_check(result, fn(*dud_args, **kwargs), key=key)
  267. # increase the value of those 0-weighted rows
  268. for a in dud_arrays:
  269. indexer = [slice(None)] * a.ndim
  270. indexer[axis] = slice(weights.size, None)
  271. indexer = tuple(indexer)
  272. a[indexer] = a[indexer] * 101
  273. dud_args = tuple(dud_arrays) + rest
  274. _rough_check(result, fn(*dud_args, **kwargs), key=key)
  275. # set those 0-weighted rows to NaNs
  276. for a in dud_arrays:
  277. indexer = [slice(None)] * a.ndim
  278. indexer[axis] = slice(weights.size, None)
  279. indexer = tuple(indexer)
  280. a[indexer] = a[indexer] * np.nan
  281. if kwargs.get("nan_policy", None) == "omit" and nan_safe:
  282. dud_args = tuple(dud_arrays) + rest
  283. _rough_check(result, fn(*dud_args, **kwargs), key=key)
  284. # mask out those nan values
  285. if ma_safe:
  286. dud_arrays = [np.ma.masked_invalid(a) for a in dud_arrays]
  287. dud_args = tuple(dud_arrays) + rest
  288. _rough_check(result, fn(*dud_args, **kwargs), key=key)
  289. if ma_very_safe:
  290. kwargs[weight_arg] = None
  291. _rough_check(result, fn(*dud_args, **kwargs), key=key)
  292. del dud_arrays, dud_args, dud_weights
  293. # WEIGHTS CHECK 3: DUPLICATE DATA (DUMB SPLITTING)
  294. if dup_test:
  295. dup_arrays = [np.append(a, a, axis=axis) for a in arrays]
  296. dup_weights = np.append(weights, weights) / 2.0
  297. dup_args = tuple(dup_arrays) + rest
  298. kwargs[weight_arg] = dup_weights
  299. _rough_check(result, fn(*dup_args, **kwargs), key=key)
  300. del dup_args, dup_arrays, dup_weights
  301. # WEIGHT CHECK 3: RANDOM SPLITTING
  302. if split_test and split_per > 0:
  303. split_arrays, split_weights = _rand_split(arrays, weights, axis, split_per=split_per, seed=seed)
  304. split_args = tuple(split_arrays) + rest
  305. kwargs[weight_arg] = split_weights
  306. _rough_check(result, fn(*split_args, **kwargs), key=key)
  307. except NotImplementedError as e:
  308. # when some combination of arguments makes weighting impossible,
  309. # this is the desired response
  310. if not silent:
  311. warnings.warn("%s NotImplemented weights: %s" % (fn.__name__, e))
  312. return result
  313. return wrapped
# Weight-checking wrappers around cdist and pdist.  Observations lie along
# axis 1 of each input, the inputs are not squeezed, and the ``_no_const``
# variants skip the constant-rescaling check (const_test=False).
wcdist = _weight_checked(cdist, default_axis=1, squeeze=False)
wcdist_no_const = _weight_checked(cdist, default_axis=1, squeeze=False, const_test=False)
wpdist = _weight_checked(pdist, default_axis=1, squeeze=False, n_args=1)
wpdist_no_const = _weight_checked(pdist, default_axis=1, squeeze=False, const_test=False, n_args=1)

# Weight-checking wrappers around the individual two-vector distance
# functions, each built with the _weight_checked defaults unless noted.
wrogerstanimoto = _weight_checked(rogerstanimoto)
# Both names refer to the same wrapper of hamming (no zero-weight check).
wmatching = whamming = _weight_checked(hamming, dud_test=False)
wyule = _weight_checked(yule)
wdice = _weight_checked(dice)
wcityblock = _weight_checked(cityblock)
wchebyshev = _weight_checked(chebyshev)
wcosine = _weight_checked(cosine)
wcorrelation = _weight_checked(correlation)
wkulsinski = _weight_checked(kulsinski)
wminkowski = _weight_checked(minkowski, const_test=False)
wjaccard = _weight_checked(jaccard)
weuclidean = _weight_checked(euclidean, const_test=False)
wsqeuclidean = _weight_checked(sqeuclidean, const_test=False)
wbraycurtis = _weight_checked(braycurtis)
wcanberra = _weight_checked(canberra, const_test=False)
wsokalsneath = _weight_checked(sokalsneath)
wsokalmichener = _weight_checked(sokalmichener)
wrussellrao = _weight_checked(russellrao)
  336. class TestCdist(object):
  337. def setup_method(self):
  338. self.rnd_eo_names = ['random-float32-data', 'random-int-data',
  339. 'random-uint-data', 'random-double-data',
  340. 'random-bool-data']
  341. self.valid_upcasts = {'bool': [np.uint, np.int_, np.float32, np.double],
  342. 'uint': [np.int_, np.float32, np.double],
  343. 'int': [np.float32, np.double],
  344. 'float32': [np.double]}
  345. def test_cdist_extra_args(self):
  346. # Tests that args and kwargs are correctly handled
  347. def _my_metric(x, y, arg, kwarg=1, kwarg2=2):
  348. return arg + kwarg + kwarg2
  349. X1 = [[1., 2., 3.], [1.2, 2.3, 3.4], [2.2, 2.3, 4.4]]
  350. X2 = [[7., 5., 8.], [7.5, 5.8, 8.4], [5.5, 5.8, 4.4]]
  351. kwargs = {'N0tV4l1D_p4raM': 3.14, "w":np.arange(3)}
  352. args = [3.14] * 200
  353. with suppress_warnings() as w:
  354. w.filter(DeprecationWarning)
  355. for metric in _METRICS_NAMES:
  356. assert_raises(TypeError, cdist, X1, X2,
  357. metric=metric, **kwargs)
  358. assert_raises(TypeError, cdist, X1, X2,
  359. metric=eval(metric), **kwargs)
  360. assert_raises(TypeError, cdist, X1, X2,
  361. metric="test_" + metric, **kwargs)
  362. assert_raises(TypeError, cdist, X1, X2,
  363. metric=metric, *args)
  364. assert_raises(TypeError, cdist, X1, X2,
  365. metric=eval(metric), *args)
  366. assert_raises(TypeError, cdist, X1, X2,
  367. metric="test_" + metric, *args)
  368. assert_raises(TypeError, cdist, X1, X2, _my_metric)
  369. assert_raises(TypeError, cdist, X1, X2, _my_metric, *args)
  370. assert_raises(TypeError, cdist, X1, X2, _my_metric, **kwargs)
  371. assert_raises(TypeError, cdist, X1, X2, _my_metric,
  372. kwarg=2.2, kwarg2=3.3)
  373. assert_raises(TypeError, cdist, X1, X2, _my_metric, 1, 2, kwarg=2.2)
  374. assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1, 2.2, 3.3)
  375. assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1, 2.2)
  376. assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1)
  377. assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1,
  378. kwarg=2.2, kwarg2=3.3)
  379. # this should work
  380. assert_allclose(cdist(X1, X2, metric=_my_metric,
  381. arg=1.1, kwarg2=3.3), 5.4)
  382. def test_cdist_euclidean_random_unicode(self):
  383. eps = 1e-07
  384. X1 = eo['cdist-X1']
  385. X2 = eo['cdist-X2']
  386. Y1 = wcdist_no_const(X1, X2, u('euclidean'))
  387. Y2 = wcdist_no_const(X1, X2, u('test_euclidean'))
  388. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  389. def test_cdist_minkowski_random_p3d8(self):
  390. eps = 1e-07
  391. X1 = eo['cdist-X1']
  392. X2 = eo['cdist-X2']
  393. Y1 = wcdist_no_const(X1, X2, 'minkowski', p=3.8)
  394. Y2 = wcdist_no_const(X1, X2, 'test_minkowski', p=3.8)
  395. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  396. def test_cdist_minkowski_random_p4d6(self):
  397. eps = 1e-07
  398. X1 = eo['cdist-X1']
  399. X2 = eo['cdist-X2']
  400. Y1 = wcdist_no_const(X1, X2, 'minkowski', p=4.6)
  401. Y2 = wcdist_no_const(X1, X2, 'test_minkowski', p=4.6)
  402. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  403. def test_cdist_minkowski_random_p1d23(self):
  404. eps = 1e-07
  405. X1 = eo['cdist-X1']
  406. X2 = eo['cdist-X2']
  407. Y1 = wcdist_no_const(X1, X2, 'minkowski', p=1.23)
  408. Y2 = wcdist_no_const(X1, X2, 'test_minkowski', p=1.23)
  409. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  410. def test_cdist_cosine_random(self):
  411. eps = 1e-07
  412. X1 = eo['cdist-X1']
  413. X2 = eo['cdist-X2']
  414. Y1 = wcdist(X1, X2, 'cosine')
  415. # Naive implementation
  416. def norms(X):
  417. return np.linalg.norm(X, axis=1).reshape(-1, 1)
  418. Y2 = 1 - np.dot((X1 / norms(X1)), (X2 / norms(X2)).T)
  419. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  420. def test_cdist_mahalanobis(self):
  421. # 1-dimensional observations
  422. x1 = np.array([[2], [3]])
  423. x2 = np.array([[2], [5]])
  424. dist = cdist(x1, x2, metric='mahalanobis')
  425. assert_allclose(dist, [[0.0, np.sqrt(4.5)], [np.sqrt(0.5), np.sqrt(2)]])
  426. # 2-dimensional observations
  427. x1 = np.array([[0, 0], [-1, 0]])
  428. x2 = np.array([[0, 2], [1, 0], [0, -2]])
  429. dist = cdist(x1, x2, metric='mahalanobis')
  430. rt2 = np.sqrt(2)
  431. assert_allclose(dist, [[rt2, rt2, rt2], [2, 2 * rt2, 2]])
  432. # Too few observations
  433. assert_raises(ValueError,
  434. cdist, [[0, 1]], [[2, 3]], metric='mahalanobis')
  435. def test_cdist_custom_notdouble(self):
  436. class myclass(object):
  437. pass
  438. def _my_metric(x, y):
  439. if not isinstance(x[0], myclass) or not isinstance(y[0], myclass):
  440. raise ValueError("Type has been changed")
  441. return 1.123
  442. data = np.array([[myclass()]], dtype=object)
  443. cdist_y = cdist(data, data, metric=_my_metric)
  444. right_y = 1.123
  445. assert_equal(cdist_y, right_y, verbose=verbose > 2)
    def _check_calling_conventions(self, X1, X2, metric, eps=1e-07, **kwargs):
        # helper function for test_cdist_calling_conventions
        #
        # Calls cdist with the metric given three ways: by name, as the
        # object that name evaluates to in this module, and by the "test_"
        # prefixed name.  Either all three results agree to within the
        # relative tolerance ``eps``, or all three calls must raise the class
        # of the first exception seen.
        try:
            y1 = cdist(X1, X2, metric=metric, **kwargs)
            y2 = cdist(X1, X2, metric=eval(metric), **kwargs)
            y3 = cdist(X1, X2, metric="test_" + metric, **kwargs)
        except Exception as e:
            e_cls = e.__class__
            if verbose > 2:
                print(e_cls.__name__)
                print(e)
            # Repeat every call: each form has to fail with the same class.
            assert_raises(e_cls, cdist, X1, X2, metric=metric, **kwargs)
            assert_raises(e_cls, cdist, X1, X2, metric=eval(metric), **kwargs)
            assert_raises(e_cls, cdist, X1, X2, metric="test_" + metric, **kwargs)
        else:
            _assert_within_tol(y1, y2, rtol=eps, verbose_=verbose > 2)
            _assert_within_tol(y1, y3, rtol=eps, verbose_=verbose > 2)
  463. def test_cdist_calling_conventions(self):
  464. # Ensures that specifying the metric with a str or scipy function
  465. # gives the same behaviour (i.e. same result or same exception).
  466. # NOTE: The correctness should be checked within each metric tests.
  467. for eo_name in self.rnd_eo_names:
  468. # subsampling input data to speed-up tests
  469. # NOTE: num samples needs to be > than dimensions for mahalanobis
  470. X1 = eo[eo_name][::5, ::-2]
  471. X2 = eo[eo_name][1::5, ::2]
  472. for metric in _METRICS_NAMES:
  473. if verbose > 2:
  474. print("testing: ", metric, " with: ", eo_name)
  475. if metric == 'wminkowski':
  476. continue
  477. if metric in {'dice', 'yule', 'kulsinski', 'matching',
  478. 'rogerstanimoto', 'russellrao', 'sokalmichener',
  479. 'sokalsneath'} and 'bool' not in eo_name:
  480. # python version permits non-bools e.g. for fuzzy logic
  481. continue
  482. self._check_calling_conventions(X1, X2, metric)
  483. # Testing built-in metrics with extra args
  484. if metric == "seuclidean":
  485. X12 = np.vstack([X1, X2]).astype(np.double)
  486. V = np.var(X12, axis=0, ddof=1)
  487. self._check_calling_conventions(X1, X2, metric, V=V)
  488. elif metric == "mahalanobis":
  489. X12 = np.vstack([X1, X2]).astype(np.double)
  490. V = np.atleast_2d(np.cov(X12.T))
  491. VI = np.array(np.linalg.inv(V).T)
  492. self._check_calling_conventions(X1, X2, metric, VI=VI)
  493. def test_cdist_dtype_equivalence(self):
  494. # Tests that the result is not affected by type up-casting
  495. eps = 1e-07
  496. tests = [(eo['random-bool-data'], self.valid_upcasts['bool']),
  497. (eo['random-uint-data'], self.valid_upcasts['uint']),
  498. (eo['random-int-data'], self.valid_upcasts['int']),
  499. (eo['random-float32-data'], self.valid_upcasts['float32'])]
  500. for metric in _METRICS_NAMES:
  501. for test in tests:
  502. X1 = test[0][::5, ::-2]
  503. X2 = test[0][1::5, ::2]
  504. try:
  505. y1 = cdist(X1, X2, metric=metric)
  506. except Exception as e:
  507. e_cls = e.__class__
  508. if verbose > 2:
  509. print(e_cls.__name__)
  510. print(e)
  511. for new_type in test[1]:
  512. X1new = new_type(X1)
  513. X2new = new_type(X2)
  514. assert_raises(e_cls, cdist, X1new, X2new, metric=metric)
  515. else:
  516. for new_type in test[1]:
  517. y2 = cdist(new_type(X1), new_type(X2), metric=metric)
  518. _assert_within_tol(y1, y2, eps, verbose > 2)
  519. def test_cdist_out(self):
  520. # Test that out parameter works properly
  521. eps = 1e-07
  522. X1 = eo['cdist-X1']
  523. X2 = eo['cdist-X2']
  524. out_r, out_c = X1.shape[0], X2.shape[0]
  525. for metric in _METRICS_NAMES:
  526. kwargs = dict()
  527. if metric in ['minkowski', 'wminkowski']:
  528. kwargs['p'] = 1.23
  529. if metric == 'wminkowski':
  530. kwargs['w'] = 1.0 / X1.std(axis=0)
  531. out1 = np.empty((out_r, out_c), dtype=np.double)
  532. Y1 = cdist(X1, X2, metric, **kwargs)
  533. Y2 = cdist(X1, X2, metric, out=out1, **kwargs)
  534. # test that output is numerically equivalent
  535. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  536. # test that Y_test1 and out1 are the same object
  537. assert_(Y2 is out1)
  538. # test for incorrect shape
  539. out2 = np.empty((out_r-1, out_c+1), dtype=np.double)
  540. assert_raises(ValueError, cdist, X1, X2, metric, out=out2, **kwargs)
  541. # test for C-contiguous order
  542. out3 = np.empty((2 * out_r, 2 * out_c), dtype=np.double)[::2, ::2]
  543. out4 = np.empty((out_r, out_c), dtype=np.double, order='F')
  544. assert_raises(ValueError, cdist, X1, X2, metric, out=out3, **kwargs)
  545. assert_raises(ValueError, cdist, X1, X2, metric, out=out4, **kwargs)
  546. # test for incorrect dtype
  547. out5 = np.empty((out_r, out_c), dtype=np.int64)
  548. assert_raises(ValueError, cdist, X1, X2, metric, out=out5, **kwargs)
  549. def test_striding(self):
  550. # test that striding is handled correct with calls to
  551. # _copy_array_if_base_present
  552. eps = 1e-07
  553. X1 = eo['cdist-X1'][::2, ::2]
  554. X2 = eo['cdist-X2'][::2, ::2]
  555. X1_copy = X1.copy()
  556. X2_copy = X2.copy()
  557. # confirm equivalence
  558. assert_equal(X1, X1_copy)
  559. assert_equal(X2, X2_copy)
  560. # confirm contiguity
  561. assert_(not X1.flags.c_contiguous)
  562. assert_(not X2.flags.c_contiguous)
  563. assert_(X1_copy.flags.c_contiguous)
  564. assert_(X2_copy.flags.c_contiguous)
  565. for metric in _METRICS_NAMES:
  566. kwargs = dict()
  567. if metric in ['minkowski', 'wminkowski']:
  568. kwargs['p'] = 1.23
  569. if metric == 'wminkowski':
  570. kwargs['w'] = 1.0 / X1.std(axis=0)
  571. Y1 = cdist(X1, X2, metric, **kwargs)
  572. Y2 = cdist(X1_copy, X2_copy, metric, **kwargs)
  573. # test that output is numerically equivalent
  574. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  575. class TestPdist(object):
  576. def setup_method(self):
  577. self.rnd_eo_names = ['random-float32-data', 'random-int-data',
  578. 'random-uint-data', 'random-double-data',
  579. 'random-bool-data']
  580. self.valid_upcasts = {'bool': [np.uint, np.int_, np.float32, np.double],
  581. 'uint': [np.int_, np.float32, np.double],
  582. 'int': [np.float32, np.double],
  583. 'float32': [np.double]}
  584. def test_pdist_extra_args(self):
  585. # Tests that args and kwargs are correctly handled
  586. def _my_metric(x, y, arg, kwarg=1, kwarg2=2):
  587. return arg + kwarg + kwarg2
  588. X1 = [[1., 2.], [1.2, 2.3], [2.2, 2.3]]
  589. kwargs = {'N0tV4l1D_p4raM': 3.14, "w":np.arange(2)}
  590. args = [3.14] * 200
  591. with suppress_warnings() as w:
  592. w.filter(DeprecationWarning)
  593. for metric in _METRICS_NAMES:
  594. assert_raises(TypeError, pdist, X1, metric=metric, **kwargs)
  595. assert_raises(TypeError, pdist, X1,
  596. metric=eval(metric), **kwargs)
  597. assert_raises(TypeError, pdist, X1,
  598. metric="test_" + metric, **kwargs)
  599. assert_raises(TypeError, pdist, X1, metric=metric, *args)
  600. assert_raises(TypeError, pdist, X1, metric=eval(metric), *args)
  601. assert_raises(TypeError, pdist, X1,
  602. metric="test_" + metric, *args)
  603. assert_raises(TypeError, pdist, X1, _my_metric)
  604. assert_raises(TypeError, pdist, X1, _my_metric, *args)
  605. assert_raises(TypeError, pdist, X1, _my_metric, **kwargs)
  606. assert_raises(TypeError, pdist, X1, _my_metric,
  607. kwarg=2.2, kwarg2=3.3)
  608. assert_raises(TypeError, pdist, X1, _my_metric, 1, 2, kwarg=2.2)
  609. assert_raises(TypeError, pdist, X1, _my_metric, 1.1, 2.2, 3.3)
  610. assert_raises(TypeError, pdist, X1, _my_metric, 1.1, 2.2)
  611. assert_raises(TypeError, pdist, X1, _my_metric, 1.1)
  612. assert_raises(TypeError, pdist, X1, _my_metric, 1.1,
  613. kwarg=2.2, kwarg2=3.3)
  614. # these should work
  615. assert_allclose(pdist(X1, metric=_my_metric,
  616. arg=1.1, kwarg2=3.3), 5.4)
  617. def test_pdist_euclidean_random(self):
  618. eps = 1e-07
  619. X = eo['pdist-double-inp']
  620. Y_right = eo['pdist-euclidean']
  621. Y_test1 = wpdist_no_const(X, 'euclidean')
  622. _assert_within_tol(Y_test1, Y_right, eps)
  623. def test_pdist_euclidean_random_u(self):
  624. eps = 1e-07
  625. X = eo['pdist-double-inp']
  626. Y_right = eo['pdist-euclidean']
  627. Y_test1 = wpdist_no_const(X, u('euclidean'))
  628. _assert_within_tol(Y_test1, Y_right, eps)
  629. def test_pdist_euclidean_random_float32(self):
  630. eps = 1e-07
  631. X = np.float32(eo['pdist-double-inp'])
  632. Y_right = eo['pdist-euclidean']
  633. Y_test1 = wpdist_no_const(X, 'euclidean')
  634. _assert_within_tol(Y_test1, Y_right, eps)
  635. def test_pdist_euclidean_random_nonC(self):
  636. eps = 1e-07
  637. X = eo['pdist-double-inp']
  638. Y_right = eo['pdist-euclidean']
  639. Y_test2 = wpdist_no_const(X, 'test_euclidean')
  640. _assert_within_tol(Y_test2, Y_right, eps)
  641. @pytest.mark.slow
  642. def test_pdist_euclidean_iris_double(self):
  643. eps = 1e-07
  644. X = eo['iris']
  645. Y_right = eo['pdist-euclidean-iris']
  646. Y_test1 = wpdist_no_const(X, 'euclidean')
  647. _assert_within_tol(Y_test1, Y_right, eps)
  648. @pytest.mark.slow
  649. def test_pdist_euclidean_iris_float32(self):
  650. eps = 1e-06
  651. X = np.float32(eo['iris'])
  652. Y_right = eo['pdist-euclidean-iris']
  653. Y_test1 = wpdist_no_const(X, 'euclidean')
  654. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  655. @pytest.mark.slow
  656. def test_pdist_euclidean_iris_nonC(self):
  657. # Test pdist(X, 'test_euclidean') [the non-C implementation] on the
  658. # Iris data set.
  659. eps = 1e-07
  660. X = eo['iris']
  661. Y_right = eo['pdist-euclidean-iris']
  662. Y_test2 = wpdist_no_const(X, 'test_euclidean')
  663. _assert_within_tol(Y_test2, Y_right, eps)
  664. def test_pdist_seuclidean_random(self):
  665. eps = 1e-05
  666. X = eo['pdist-double-inp']
  667. Y_right = eo['pdist-seuclidean']
  668. Y_test1 = pdist(X, 'seuclidean')
  669. _assert_within_tol(Y_test1, Y_right, eps)
  670. def test_pdist_seuclidean_random_float32(self):
  671. eps = 1e-05
  672. X = np.float32(eo['pdist-double-inp'])
  673. Y_right = eo['pdist-seuclidean']
  674. Y_test1 = pdist(X, 'seuclidean')
  675. _assert_within_tol(Y_test1, Y_right, eps)
  676. def test_pdist_seuclidean_random_nonC(self):
  677. # Test pdist(X, 'test_sqeuclidean') [the non-C implementation]
  678. eps = 1e-05
  679. X = eo['pdist-double-inp']
  680. Y_right = eo['pdist-seuclidean']
  681. Y_test2 = pdist(X, 'test_seuclidean')
  682. _assert_within_tol(Y_test2, Y_right, eps)
  683. def test_pdist_seuclidean_iris(self):
  684. eps = 1e-05
  685. X = eo['iris']
  686. Y_right = eo['pdist-seuclidean-iris']
  687. Y_test1 = pdist(X, 'seuclidean')
  688. _assert_within_tol(Y_test1, Y_right, eps)
  689. def test_pdist_seuclidean_iris_float32(self):
  690. # Tests pdist(X, 'seuclidean') on the Iris data set (float32).
  691. eps = 1e-05
  692. X = np.float32(eo['iris'])
  693. Y_right = eo['pdist-seuclidean-iris']
  694. Y_test1 = pdist(X, 'seuclidean')
  695. _assert_within_tol(Y_test1, Y_right, eps)
  696. def test_pdist_seuclidean_iris_nonC(self):
  697. # Test pdist(X, 'test_seuclidean') [the non-C implementation] on the
  698. # Iris data set.
  699. eps = 1e-05
  700. X = eo['iris']
  701. Y_right = eo['pdist-seuclidean-iris']
  702. Y_test2 = pdist(X, 'test_seuclidean')
  703. _assert_within_tol(Y_test2, Y_right, eps)
  704. def test_pdist_cosine_random(self):
  705. eps = 1e-08
  706. X = eo['pdist-double-inp']
  707. Y_right = eo['pdist-cosine']
  708. Y_test1 = wpdist(X, 'cosine')
  709. _assert_within_tol(Y_test1, Y_right, eps)
  710. def test_pdist_cosine_random_float32(self):
  711. eps = 1e-08
  712. X = np.float32(eo['pdist-double-inp'])
  713. Y_right = eo['pdist-cosine']
  714. Y_test1 = wpdist(X, 'cosine')
  715. _assert_within_tol(Y_test1, Y_right, eps)
  716. def test_pdist_cosine_random_nonC(self):
  717. # Test pdist(X, 'test_cosine') [the non-C implementation]
  718. eps = 1e-08
  719. X = eo['pdist-double-inp']
  720. Y_right = eo['pdist-cosine']
  721. Y_test2 = wpdist(X, 'test_cosine')
  722. _assert_within_tol(Y_test2, Y_right, eps)
  723. @pytest.mark.slow
  724. def test_pdist_cosine_iris(self):
  725. eps = 1e-08
  726. X = eo['iris']
  727. Y_right = eo['pdist-cosine-iris']
  728. Y_test1 = wpdist(X, 'cosine')
  729. _assert_within_tol(Y_test1, Y_right, eps)
  730. @pytest.mark.slow
  731. def test_pdist_cosine_iris_float32(self):
  732. eps = 1e-07
  733. X = np.float32(eo['iris'])
  734. Y_right = eo['pdist-cosine-iris']
  735. Y_test1 = wpdist(X, 'cosine')
  736. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  737. @pytest.mark.slow
  738. def test_pdist_cosine_iris_nonC(self):
  739. eps = 1e-08
  740. X = eo['iris']
  741. Y_right = eo['pdist-cosine-iris']
  742. Y_test2 = wpdist(X, 'test_cosine')
  743. _assert_within_tol(Y_test2, Y_right, eps)
  744. def test_pdist_cosine_bounds(self):
  745. # Test adapted from @joernhees's example at gh-5208: case where
  746. # cosine distance used to be negative. XXX: very sensitive to the
  747. # specific norm computation.
  748. x = np.abs(np.random.RandomState(1337).rand(91))
  749. X = np.vstack([x, x])
  750. assert_(wpdist(X, 'cosine')[0] >= 0,
  751. msg='cosine distance should be non-negative')
  752. def test_pdist_cityblock_random(self):
  753. eps = 1e-06
  754. X = eo['pdist-double-inp']
  755. Y_right = eo['pdist-cityblock']
  756. Y_test1 = wpdist_no_const(X, 'cityblock')
  757. _assert_within_tol(Y_test1, Y_right, eps)
  758. def test_pdist_cityblock_random_float32(self):
  759. eps = 1e-06
  760. X = np.float32(eo['pdist-double-inp'])
  761. Y_right = eo['pdist-cityblock']
  762. Y_test1 = wpdist_no_const(X, 'cityblock')
  763. _assert_within_tol(Y_test1, Y_right, eps)
  764. def test_pdist_cityblock_random_nonC(self):
  765. eps = 1e-06
  766. X = eo['pdist-double-inp']
  767. Y_right = eo['pdist-cityblock']
  768. Y_test2 = wpdist_no_const(X, 'test_cityblock')
  769. _assert_within_tol(Y_test2, Y_right, eps)
  770. @pytest.mark.slow
  771. def test_pdist_cityblock_iris(self):
  772. eps = 1e-14
  773. X = eo['iris']
  774. Y_right = eo['pdist-cityblock-iris']
  775. Y_test1 = wpdist_no_const(X, 'cityblock')
  776. _assert_within_tol(Y_test1, Y_right, eps)
  777. @pytest.mark.slow
  778. def test_pdist_cityblock_iris_float32(self):
  779. eps = 1e-06
  780. X = np.float32(eo['iris'])
  781. Y_right = eo['pdist-cityblock-iris']
  782. Y_test1 = wpdist_no_const(X, 'cityblock')
  783. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  784. @pytest.mark.slow
  785. def test_pdist_cityblock_iris_nonC(self):
  786. # Test pdist(X, 'test_cityblock') [the non-C implementation] on the
  787. # Iris data set.
  788. eps = 1e-14
  789. X = eo['iris']
  790. Y_right = eo['pdist-cityblock-iris']
  791. Y_test2 = wpdist_no_const(X, 'test_cityblock')
  792. _assert_within_tol(Y_test2, Y_right, eps)
  793. def test_pdist_correlation_random(self):
  794. eps = 1e-07
  795. X = eo['pdist-double-inp']
  796. Y_right = eo['pdist-correlation']
  797. Y_test1 = wpdist(X, 'correlation')
  798. _assert_within_tol(Y_test1, Y_right, eps)
  799. def test_pdist_correlation_random_float32(self):
  800. eps = 1e-07
  801. X = np.float32(eo['pdist-double-inp'])
  802. Y_right = eo['pdist-correlation']
  803. Y_test1 = wpdist(X, 'correlation')
  804. _assert_within_tol(Y_test1, Y_right, eps)
  805. def test_pdist_correlation_random_nonC(self):
  806. eps = 1e-07
  807. X = eo['pdist-double-inp']
  808. Y_right = eo['pdist-correlation']
  809. Y_test2 = wpdist(X, 'test_correlation')
  810. _assert_within_tol(Y_test2, Y_right, eps)
  811. @pytest.mark.slow
  812. def test_pdist_correlation_iris(self):
  813. eps = 1e-08
  814. X = eo['iris']
  815. Y_right = eo['pdist-correlation-iris']
  816. Y_test1 = wpdist(X, 'correlation')
  817. _assert_within_tol(Y_test1, Y_right, eps)
  818. @pytest.mark.slow
  819. def test_pdist_correlation_iris_float32(self):
  820. eps = 1e-07
  821. X = eo['iris']
  822. Y_right = np.float32(eo['pdist-correlation-iris'])
  823. Y_test1 = wpdist(X, 'correlation')
  824. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  825. @pytest.mark.slow
  826. def test_pdist_correlation_iris_nonC(self):
  827. eps = 1e-08
  828. X = eo['iris']
  829. Y_right = eo['pdist-correlation-iris']
  830. Y_test2 = wpdist(X, 'test_correlation')
  831. _assert_within_tol(Y_test2, Y_right, eps)
  832. def test_pdist_minkowski_random(self):
  833. eps = 1e-05
  834. X = eo['pdist-double-inp']
  835. Y_right = eo['pdist-minkowski-3.2']
  836. Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2)
  837. _assert_within_tol(Y_test1, Y_right, eps)
  838. def test_pdist_minkowski_random_float32(self):
  839. eps = 1e-05
  840. X = np.float32(eo['pdist-double-inp'])
  841. Y_right = eo['pdist-minkowski-3.2']
  842. Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2)
  843. _assert_within_tol(Y_test1, Y_right, eps)
  844. def test_pdist_minkowski_random_nonC(self):
  845. eps = 1e-05
  846. X = eo['pdist-double-inp']
  847. Y_right = eo['pdist-minkowski-3.2']
  848. Y_test2 = wpdist_no_const(X, 'test_minkowski', p=3.2)
  849. _assert_within_tol(Y_test2, Y_right, eps)
  850. @pytest.mark.slow
  851. def test_pdist_minkowski_3_2_iris(self):
  852. eps = 1e-07
  853. X = eo['iris']
  854. Y_right = eo['pdist-minkowski-3.2-iris']
  855. Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2)
  856. _assert_within_tol(Y_test1, Y_right, eps)
  857. @pytest.mark.slow
  858. def test_pdist_minkowski_3_2_iris_float32(self):
  859. eps = 1e-06
  860. X = np.float32(eo['iris'])
  861. Y_right = eo['pdist-minkowski-3.2-iris']
  862. Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2)
  863. _assert_within_tol(Y_test1, Y_right, eps)
  864. @pytest.mark.slow
  865. def test_pdist_minkowski_3_2_iris_nonC(self):
  866. eps = 1e-07
  867. X = eo['iris']
  868. Y_right = eo['pdist-minkowski-3.2-iris']
  869. Y_test2 = wpdist_no_const(X, 'test_minkowski', p=3.2)
  870. _assert_within_tol(Y_test2, Y_right, eps)
  871. @pytest.mark.slow
  872. def test_pdist_minkowski_5_8_iris(self):
  873. eps = 1e-07
  874. X = eo['iris']
  875. Y_right = eo['pdist-minkowski-5.8-iris']
  876. Y_test1 = wpdist_no_const(X, 'minkowski', p=5.8)
  877. _assert_within_tol(Y_test1, Y_right, eps)
  878. @pytest.mark.slow
  879. def test_pdist_minkowski_5_8_iris_float32(self):
  880. eps = 1e-06
  881. X = np.float32(eo['iris'])
  882. Y_right = eo['pdist-minkowski-5.8-iris']
  883. Y_test1 = wpdist_no_const(X, 'minkowski', p=5.8)
  884. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  885. @pytest.mark.slow
  886. def test_pdist_minkowski_5_8_iris_nonC(self):
  887. eps = 1e-07
  888. X = eo['iris']
  889. Y_right = eo['pdist-minkowski-5.8-iris']
  890. Y_test2 = wpdist_no_const(X, 'test_minkowski', p=5.8)
  891. _assert_within_tol(Y_test2, Y_right, eps)
  892. def test_pdist_mahalanobis(self):
  893. # 1-dimensional observations
  894. x = np.array([2.0, 2.0, 3.0, 5.0]).reshape(-1, 1)
  895. dist = pdist(x, metric='mahalanobis')
  896. assert_allclose(dist, [0.0, np.sqrt(0.5), np.sqrt(4.5),
  897. np.sqrt(0.5), np.sqrt(4.5), np.sqrt(2.0)])
  898. # 2-dimensional observations
  899. x = np.array([[0, 0], [-1, 0], [0, 2], [1, 0], [0, -2]])
  900. dist = pdist(x, metric='mahalanobis')
  901. rt2 = np.sqrt(2)
  902. assert_allclose(dist, [rt2, rt2, rt2, rt2, 2, 2 * rt2, 2, 2, 2 * rt2, 2])
  903. # Too few observations
  904. assert_raises(ValueError,
  905. wpdist, [[0, 1], [2, 3]], metric='mahalanobis')
  906. def test_pdist_hamming_random(self):
  907. eps = 1e-07
  908. X = eo['pdist-boolean-inp']
  909. Y_right = eo['pdist-hamming']
  910. Y_test1 = wpdist(X, 'hamming')
  911. _assert_within_tol(Y_test1, Y_right, eps)
  912. def test_pdist_hamming_random_float32(self):
  913. eps = 1e-07
  914. X = np.float32(eo['pdist-boolean-inp'])
  915. Y_right = eo['pdist-hamming']
  916. Y_test1 = wpdist(X, 'hamming')
  917. _assert_within_tol(Y_test1, Y_right, eps)
  918. def test_pdist_hamming_random_nonC(self):
  919. eps = 1e-07
  920. X = eo['pdist-boolean-inp']
  921. Y_right = eo['pdist-hamming']
  922. Y_test2 = wpdist(X, 'test_hamming')
  923. _assert_within_tol(Y_test2, Y_right, eps)
  924. def test_pdist_dhamming_random(self):
  925. eps = 1e-07
  926. X = np.float64(eo['pdist-boolean-inp'])
  927. Y_right = eo['pdist-hamming']
  928. Y_test1 = wpdist(X, 'hamming')
  929. _assert_within_tol(Y_test1, Y_right, eps)
  930. def test_pdist_dhamming_random_float32(self):
  931. eps = 1e-07
  932. X = np.float32(eo['pdist-boolean-inp'])
  933. Y_right = eo['pdist-hamming']
  934. Y_test1 = wpdist(X, 'hamming')
  935. _assert_within_tol(Y_test1, Y_right, eps)
  936. def test_pdist_dhamming_random_nonC(self):
  937. eps = 1e-07
  938. X = np.float64(eo['pdist-boolean-inp'])
  939. Y_right = eo['pdist-hamming']
  940. Y_test2 = wpdist(X, 'test_hamming')
  941. _assert_within_tol(Y_test2, Y_right, eps)
  942. def test_pdist_jaccard_random(self):
  943. eps = 1e-08
  944. X = eo['pdist-boolean-inp']
  945. Y_right = eo['pdist-jaccard']
  946. Y_test1 = wpdist(X, 'jaccard')
  947. _assert_within_tol(Y_test1, Y_right, eps)
  948. def test_pdist_jaccard_random_float32(self):
  949. eps = 1e-08
  950. X = np.float32(eo['pdist-boolean-inp'])
  951. Y_right = eo['pdist-jaccard']
  952. Y_test1 = wpdist(X, 'jaccard')
  953. _assert_within_tol(Y_test1, Y_right, eps)
  954. def test_pdist_jaccard_random_nonC(self):
  955. eps = 1e-08
  956. X = eo['pdist-boolean-inp']
  957. Y_right = eo['pdist-jaccard']
  958. Y_test2 = wpdist(X, 'test_jaccard')
  959. _assert_within_tol(Y_test2, Y_right, eps)
  960. def test_pdist_djaccard_random(self):
  961. eps = 1e-08
  962. X = np.float64(eo['pdist-boolean-inp'])
  963. Y_right = eo['pdist-jaccard']
  964. Y_test1 = wpdist(X, 'jaccard')
  965. _assert_within_tol(Y_test1, Y_right, eps)
  966. def test_pdist_djaccard_random_float32(self):
  967. eps = 1e-08
  968. X = np.float32(eo['pdist-boolean-inp'])
  969. Y_right = eo['pdist-jaccard']
  970. Y_test1 = wpdist(X, 'jaccard')
  971. _assert_within_tol(Y_test1, Y_right, eps)
  972. def test_pdist_djaccard_allzeros(self):
  973. eps = 1e-08
  974. Y = pdist(np.zeros((5, 3)), 'jaccard')
  975. _assert_within_tol(np.zeros(10), Y, eps)
  976. def test_pdist_djaccard_random_nonC(self):
  977. eps = 1e-08
  978. X = np.float64(eo['pdist-boolean-inp'])
  979. Y_right = eo['pdist-jaccard']
  980. Y_test2 = wpdist(X, 'test_jaccard')
  981. _assert_within_tol(Y_test2, Y_right, eps)
  982. def test_pdist_jensenshannon_random(self):
  983. eps = 1e-08
  984. X = eo['pdist-double-inp']
  985. Y_right = eo['pdist-jensenshannon']
  986. Y_test1 = pdist(X, 'jensenshannon')
  987. _assert_within_tol(Y_test1, Y_right, eps)
  988. def test_pdist_jensenshannon_random_float32(self):
  989. eps = 1e-07
  990. X = np.float32(eo['pdist-double-inp'])
  991. Y_right = eo['pdist-jensenshannon']
  992. Y_test1 = pdist(X, 'jensenshannon')
  993. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  994. def test_pdist_jensenshannon_random_nonC(self):
  995. eps = 1e-08
  996. X = eo['pdist-double-inp']
  997. Y_right = eo['pdist-jensenshannon']
  998. Y_test2 = pdist(X, 'test_jensenshannon')
  999. _assert_within_tol(Y_test2, Y_right, eps)
  1000. def test_pdist_jensenshannon_iris(self):
  1001. eps = 1e-12
  1002. X = eo['iris']
  1003. Y_right = eo['pdist-jensenshannon-iris']
  1004. Y_test1 = pdist(X, 'jensenshannon')
  1005. _assert_within_tol(Y_test1, Y_right, eps)
  1006. def test_pdist_jensenshannon_iris_float32(self):
  1007. eps = 1e-06
  1008. X = np.float32(eo['iris'])
  1009. Y_right = eo['pdist-jensenshannon-iris']
  1010. Y_test1 = pdist(X, 'jensenshannon')
  1011. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  1012. def test_pdist_jensenshannon_iris_nonC(self):
  1013. eps = 5e-12
  1014. X = eo['iris']
  1015. Y_right = eo['pdist-jensenshannon-iris']
  1016. Y_test2 = pdist(X, 'test_jensenshannon')
  1017. _assert_within_tol(Y_test2, Y_right, eps)
  1018. def test_pdist_djaccard_allzeros_nonC(self):
  1019. eps = 1e-08
  1020. Y = pdist(np.zeros((5, 3)), 'test_jaccard')
  1021. _assert_within_tol(np.zeros(10), Y, eps)
  1022. def test_pdist_chebyshev_random(self):
  1023. eps = 1e-08
  1024. X = eo['pdist-double-inp']
  1025. Y_right = eo['pdist-chebyshev']
  1026. Y_test1 = pdist(X, 'chebyshev')
  1027. _assert_within_tol(Y_test1, Y_right, eps)
  1028. def test_pdist_chebyshev_random_float32(self):
  1029. eps = 1e-07
  1030. X = np.float32(eo['pdist-double-inp'])
  1031. Y_right = eo['pdist-chebyshev']
  1032. Y_test1 = pdist(X, 'chebyshev')
  1033. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  1034. def test_pdist_chebyshev_random_nonC(self):
  1035. eps = 1e-08
  1036. X = eo['pdist-double-inp']
  1037. Y_right = eo['pdist-chebyshev']
  1038. Y_test2 = pdist(X, 'test_chebyshev')
  1039. _assert_within_tol(Y_test2, Y_right, eps)
  1040. def test_pdist_chebyshev_iris(self):
  1041. eps = 1e-15
  1042. X = eo['iris']
  1043. Y_right = eo['pdist-chebyshev-iris']
  1044. Y_test1 = pdist(X, 'chebyshev')
  1045. _assert_within_tol(Y_test1, Y_right, eps)
  1046. def test_pdist_chebyshev_iris_float32(self):
  1047. eps = 1e-06
  1048. X = np.float32(eo['iris'])
  1049. Y_right = eo['pdist-chebyshev-iris']
  1050. Y_test1 = pdist(X, 'chebyshev')
  1051. _assert_within_tol(Y_test1, Y_right, eps, verbose > 2)
  1052. def test_pdist_chebyshev_iris_nonC(self):
  1053. eps = 1e-15
  1054. X = eo['iris']
  1055. Y_right = eo['pdist-chebyshev-iris']
  1056. Y_test2 = pdist(X, 'test_chebyshev')
  1057. _assert_within_tol(Y_test2, Y_right, eps)
  1058. def test_pdist_matching_mtica1(self):
  1059. # Test matching(*,*) with mtica example #1 (nums).
  1060. m = wmatching(np.array([1, 0, 1, 1, 0]),
  1061. np.array([1, 1, 0, 1, 1]))
  1062. m2 = wmatching(np.array([1, 0, 1, 1, 0], dtype=bool),
  1063. np.array([1, 1, 0, 1, 1], dtype=bool))
  1064. assert_allclose(m, 0.6, rtol=0, atol=1e-10)
  1065. assert_allclose(m2, 0.6, rtol=0, atol=1e-10)
  1066. def test_pdist_matching_mtica2(self):
  1067. # Test matching(*,*) with mtica example #2.
  1068. m = wmatching(np.array([1, 0, 1]),
  1069. np.array([1, 1, 0]))
  1070. m2 = wmatching(np.array([1, 0, 1], dtype=bool),
  1071. np.array([1, 1, 0], dtype=bool))
  1072. assert_allclose(m, 2 / 3, rtol=0, atol=1e-10)
  1073. assert_allclose(m2, 2 / 3, rtol=0, atol=1e-10)
  1074. def test_pdist_jaccard_mtica1(self):
  1075. m = wjaccard(np.array([1, 0, 1, 1, 0]),
  1076. np.array([1, 1, 0, 1, 1]))
  1077. m2 = wjaccard(np.array([1, 0, 1, 1, 0], dtype=bool),
  1078. np.array([1, 1, 0, 1, 1], dtype=bool))
  1079. assert_allclose(m, 0.6, rtol=0, atol=1e-10)
  1080. assert_allclose(m2, 0.6, rtol=0, atol=1e-10)
  1081. def test_pdist_jaccard_mtica2(self):
  1082. m = wjaccard(np.array([1, 0, 1]),
  1083. np.array([1, 1, 0]))
  1084. m2 = wjaccard(np.array([1, 0, 1], dtype=bool),
  1085. np.array([1, 1, 0], dtype=bool))
  1086. assert_allclose(m, 2 / 3, rtol=0, atol=1e-10)
  1087. assert_allclose(m2, 2 / 3, rtol=0, atol=1e-10)
  1088. def test_pdist_yule_mtica1(self):
  1089. m = wyule(np.array([1, 0, 1, 1, 0]),
  1090. np.array([1, 1, 0, 1, 1]))
  1091. m2 = wyule(np.array([1, 0, 1, 1, 0], dtype=bool),
  1092. np.array([1, 1, 0, 1, 1], dtype=bool))
  1093. if verbose > 2:
  1094. print(m)
  1095. assert_allclose(m, 2, rtol=0, atol=1e-10)
  1096. assert_allclose(m2, 2, rtol=0, atol=1e-10)
  1097. def test_pdist_yule_mtica2(self):
  1098. m = wyule(np.array([1, 0, 1]),
  1099. np.array([1, 1, 0]))
  1100. m2 = wyule(np.array([1, 0, 1], dtype=bool),
  1101. np.array([1, 1, 0], dtype=bool))
  1102. if verbose > 2:
  1103. print(m)
  1104. assert_allclose(m, 2, rtol=0, atol=1e-10)
  1105. assert_allclose(m2, 2, rtol=0, atol=1e-10)
  1106. def test_pdist_dice_mtica1(self):
  1107. m = wdice(np.array([1, 0, 1, 1, 0]),
  1108. np.array([1, 1, 0, 1, 1]))
  1109. m2 = wdice(np.array([1, 0, 1, 1, 0], dtype=bool),
  1110. np.array([1, 1, 0, 1, 1], dtype=bool))
  1111. if verbose > 2:
  1112. print(m)
  1113. assert_allclose(m, 3 / 7, rtol=0, atol=1e-10)
  1114. assert_allclose(m2, 3 / 7, rtol=0, atol=1e-10)
  1115. def test_pdist_dice_mtica2(self):
  1116. m = wdice(np.array([1, 0, 1]),
  1117. np.array([1, 1, 0]))
  1118. m2 = wdice(np.array([1, 0, 1], dtype=bool),
  1119. np.array([1, 1, 0], dtype=bool))
  1120. if verbose > 2:
  1121. print(m)
  1122. assert_allclose(m, 0.5, rtol=0, atol=1e-10)
  1123. assert_allclose(m2, 0.5, rtol=0, atol=1e-10)
  1124. def test_pdist_sokalsneath_mtica1(self):
  1125. m = sokalsneath(np.array([1, 0, 1, 1, 0]),
  1126. np.array([1, 1, 0, 1, 1]))
  1127. m2 = sokalsneath(np.array([1, 0, 1, 1, 0], dtype=bool),
  1128. np.array([1, 1, 0, 1, 1], dtype=bool))
  1129. if verbose > 2:
  1130. print(m)
  1131. assert_allclose(m, 3 / 4, rtol=0, atol=1e-10)
  1132. assert_allclose(m2, 3 / 4, rtol=0, atol=1e-10)
  1133. def test_pdist_sokalsneath_mtica2(self):
  1134. m = wsokalsneath(np.array([1, 0, 1]),
  1135. np.array([1, 1, 0]))
  1136. m2 = wsokalsneath(np.array([1, 0, 1], dtype=bool),
  1137. np.array([1, 1, 0], dtype=bool))
  1138. if verbose > 2:
  1139. print(m)
  1140. assert_allclose(m, 4 / 5, rtol=0, atol=1e-10)
  1141. assert_allclose(m2, 4 / 5, rtol=0, atol=1e-10)
  1142. def test_pdist_rogerstanimoto_mtica1(self):
  1143. m = wrogerstanimoto(np.array([1, 0, 1, 1, 0]),
  1144. np.array([1, 1, 0, 1, 1]))
  1145. m2 = wrogerstanimoto(np.array([1, 0, 1, 1, 0], dtype=bool),
  1146. np.array([1, 1, 0, 1, 1], dtype=bool))
  1147. if verbose > 2:
  1148. print(m)
  1149. assert_allclose(m, 3 / 4, rtol=0, atol=1e-10)
  1150. assert_allclose(m2, 3 / 4, rtol=0, atol=1e-10)
  1151. def test_pdist_rogerstanimoto_mtica2(self):
  1152. m = wrogerstanimoto(np.array([1, 0, 1]),
  1153. np.array([1, 1, 0]))
  1154. m2 = wrogerstanimoto(np.array([1, 0, 1], dtype=bool),
  1155. np.array([1, 1, 0], dtype=bool))
  1156. if verbose > 2:
  1157. print(m)
  1158. assert_allclose(m, 4 / 5, rtol=0, atol=1e-10)
  1159. assert_allclose(m2, 4 / 5, rtol=0, atol=1e-10)
  1160. def test_pdist_russellrao_mtica1(self):
  1161. m = wrussellrao(np.array([1, 0, 1, 1, 0]),
  1162. np.array([1, 1, 0, 1, 1]))
  1163. m2 = wrussellrao(np.array([1, 0, 1, 1, 0], dtype=bool),
  1164. np.array([1, 1, 0, 1, 1], dtype=bool))
  1165. if verbose > 2:
  1166. print(m)
  1167. assert_allclose(m, 3 / 5, rtol=0, atol=1e-10)
  1168. assert_allclose(m2, 3 / 5, rtol=0, atol=1e-10)
  1169. def test_pdist_russellrao_mtica2(self):
  1170. m = wrussellrao(np.array([1, 0, 1]),
  1171. np.array([1, 1, 0]))
  1172. m2 = wrussellrao(np.array([1, 0, 1], dtype=bool),
  1173. np.array([1, 1, 0], dtype=bool))
  1174. if verbose > 2:
  1175. print(m)
  1176. assert_allclose(m, 2 / 3, rtol=0, atol=1e-10)
  1177. assert_allclose(m2, 2 / 3, rtol=0, atol=1e-10)
  1178. @pytest.mark.slow
  1179. def test_pdist_canberra_match(self):
  1180. D = eo['iris']
  1181. if verbose > 2:
  1182. print(D.shape, D.dtype)
  1183. eps = 1e-10
  1184. y1 = wpdist_no_const(D, "canberra")
  1185. y2 = wpdist_no_const(D, "test_canberra")
  1186. _assert_within_tol(y1, y2, eps, verbose > 2)
  1187. def test_pdist_canberra_ticket_711(self):
  1188. # Test pdist(X, 'canberra') to see if Canberra gives the right result
  1189. # as reported on gh-1238.
  1190. eps = 1e-8
  1191. pdist_y = wpdist_no_const(([3.3], [3.4]), "canberra")
  1192. right_y = 0.01492537
  1193. _assert_within_tol(pdist_y, right_y, eps, verbose > 2)
  1194. def test_pdist_custom_notdouble(self):
  1195. # tests that when using a custom metric the data type is not altered
  1196. class myclass(object):
  1197. pass
  1198. def _my_metric(x, y):
  1199. if not isinstance(x[0], myclass) or not isinstance(y[0], myclass):
  1200. raise ValueError("Type has been changed")
  1201. return 1.123
  1202. data = np.array([[myclass()], [myclass()]], dtype=object)
  1203. pdist_y = pdist(data, metric=_my_metric)
  1204. right_y = 1.123
  1205. assert_equal(pdist_y, right_y, verbose=verbose > 2)
  1206. def _check_calling_conventions(self, X, metric, eps=1e-07, **kwargs):
  1207. # helper function for test_pdist_calling_conventions
  1208. try:
  1209. y1 = pdist(X, metric=metric, **kwargs)
  1210. y2 = pdist(X, metric=eval(metric), **kwargs)
  1211. y3 = pdist(X, metric="test_" + metric, **kwargs)
  1212. except Exception as e:
  1213. e_cls = e.__class__
  1214. if verbose > 2:
  1215. print(e_cls.__name__)
  1216. print(e)
  1217. assert_raises(e_cls, pdist, X, metric=metric, **kwargs)
  1218. assert_raises(e_cls, pdist, X, metric=eval(metric), **kwargs)
  1219. assert_raises(e_cls, pdist, X, metric="test_" + metric, **kwargs)
  1220. else:
  1221. _assert_within_tol(y1, y2, rtol=eps, verbose_=verbose > 2)
  1222. _assert_within_tol(y1, y3, rtol=eps, verbose_=verbose > 2)
  1223. def test_pdist_calling_conventions(self):
  1224. # Ensures that specifying the metric with a str or scipy function
  1225. # gives the same behaviour (i.e. same result or same exception).
  1226. # NOTE: The correctness should be checked within each metric tests.
  1227. # NOTE: Extra args should be checked with a dedicated test
  1228. eps = 1e-07
  1229. for eo_name in self.rnd_eo_names:
  1230. # subsampling input data to speed-up tests
  1231. # NOTE: num samples needs to be > than dimensions for mahalanobis
  1232. X = eo[eo_name][::5, ::2]
  1233. for metric in _METRICS_NAMES:
  1234. if metric == 'wminkowski':
  1235. continue
  1236. if verbose > 2:
  1237. print("testing: ", metric, " with: ", eo_name)
  1238. if metric in {'dice', 'yule', 'kulsinski', 'matching',
  1239. 'rogerstanimoto', 'russellrao', 'sokalmichener',
  1240. 'sokalsneath'} and 'bool' not in eo_name:
  1241. # python version permits non-bools e.g. for fuzzy logic
  1242. continue
  1243. self._check_calling_conventions(X, metric)
  1244. # Testing built-in metrics with extra args
  1245. if metric == "seuclidean":
  1246. V = np.var(X.astype(np.double), axis=0, ddof=1)
  1247. self._check_calling_conventions(X, metric, V=V)
  1248. elif metric == "mahalanobis":
  1249. V = np.atleast_2d(np.cov(X.astype(np.double).T))
  1250. VI = np.array(np.linalg.inv(V).T)
  1251. self._check_calling_conventions(X, metric, VI=VI)
  1252. def test_pdist_dtype_equivalence(self):
  1253. # Tests that the result is not affected by type up-casting
  1254. eps = 1e-07
  1255. tests = [(eo['random-bool-data'], self.valid_upcasts['bool']),
  1256. (eo['random-uint-data'], self.valid_upcasts['uint']),
  1257. (eo['random-int-data'], self.valid_upcasts['int']),
  1258. (eo['random-float32-data'], self.valid_upcasts['float32'])]
  1259. for metric in _METRICS_NAMES:
  1260. for test in tests:
  1261. X1 = test[0][::5, ::2]
  1262. try:
  1263. y1 = pdist(X1, metric=metric)
  1264. except Exception as e:
  1265. e_cls = e.__class__
  1266. if verbose > 2:
  1267. print(e_cls.__name__)
  1268. print(e)
  1269. for new_type in test[1]:
  1270. X2 = new_type(X1)
  1271. assert_raises(e_cls, pdist, X2, metric=metric)
  1272. else:
  1273. for new_type in test[1]:
  1274. y2 = pdist(new_type(X1), metric=metric)
  1275. _assert_within_tol(y1, y2, eps, verbose > 2)
  1276. def test_pdist_out(self):
  1277. # Test that out parameter works properly
  1278. eps = 1e-07
  1279. X = eo['random-float32-data'][::5, ::2]
  1280. out_size = int((X.shape[0] * (X.shape[0] - 1)) / 2)
  1281. for metric in _METRICS_NAMES:
  1282. kwargs = dict()
  1283. if metric in ['minkowski', 'wminkowski']:
  1284. kwargs['p'] = 1.23
  1285. if metric == 'wminkowski':
  1286. kwargs['w'] = 1.0 / X.std(axis=0)
  1287. out1 = np.empty(out_size, dtype=np.double)
  1288. Y_right = pdist(X, metric, **kwargs)
  1289. Y_test1 = pdist(X, metric, out=out1, **kwargs)
  1290. # test that output is numerically equivalent
  1291. _assert_within_tol(Y_test1, Y_right, eps)
  1292. # test that Y_test1 and out1 are the same object
  1293. assert_(Y_test1 is out1)
  1294. # test for incorrect shape
  1295. out2 = np.empty(out_size + 3, dtype=np.double)
  1296. assert_raises(ValueError, pdist, X, metric, out=out2, **kwargs)
  1297. # test for (C-)contiguous output
  1298. out3 = np.empty(2 * out_size, dtype=np.double)[::2]
  1299. assert_raises(ValueError, pdist, X, metric, out=out3, **kwargs)
  1300. # test for incorrect dtype
  1301. out5 = np.empty(out_size, dtype=np.int64)
  1302. assert_raises(ValueError, pdist, X, metric, out=out5, **kwargs)
  1303. def test_striding(self):
  1304. # test that striding is handled correct with calls to
  1305. # _copy_array_if_base_present
  1306. eps = 1e-07
  1307. X = eo['random-float32-data'][::5, ::2]
  1308. X_copy = X.copy()
  1309. # confirm contiguity
  1310. assert_(not X.flags.c_contiguous)
  1311. assert_(X_copy.flags.c_contiguous)
  1312. for metric in _METRICS_NAMES:
  1313. kwargs = dict()
  1314. if metric in ['minkowski', 'wminkowski']:
  1315. kwargs['p'] = 1.23
  1316. if metric == 'wminkowski':
  1317. kwargs['w'] = 1.0 / X.std(axis=0)
  1318. Y1 = pdist(X, metric, **kwargs)
  1319. Y2 = pdist(X_copy, metric, **kwargs)
  1320. # test that output is numerically equivalent
  1321. _assert_within_tol(Y1, Y2, eps, verbose > 2)
  1322. class TestSomeDistanceFunctions(object):
  1323. def setup_method(self):
  1324. # 1D arrays
  1325. x = np.array([1.0, 2.0, 3.0])
  1326. y = np.array([1.0, 1.0, 5.0])
  1327. # 3x1 arrays
  1328. x31 = x[:, np.newaxis]
  1329. y31 = y[:, np.newaxis]
  1330. # 1x3 arrays
  1331. x13 = x31.T
  1332. y13 = y31.T
  1333. self.cases = [(x, y), (x31, y31), (x13, y13)]
  1334. def test_minkowski(self):
  1335. with suppress_warnings() as w:
  1336. w.filter(message="`wminkowski` is deprecated")
  1337. for x, y in self.cases:
  1338. dist1 = wminkowski(x, y, p=1)
  1339. assert_almost_equal(dist1, 3.0)
  1340. dist1p5 = wminkowski(x, y, p=1.5)
  1341. assert_almost_equal(dist1p5, (1.0 + 2.0**1.5)**(2. / 3))
  1342. dist2 = wminkowski(x, y, p=2)
  1343. def test_old_wminkowski(self):
  1344. with suppress_warnings() as wrn:
  1345. wrn.filter(message="`wminkowski` is deprecated")
  1346. w = np.array([1.0, 2.0, 0.5])
  1347. for x, y in self.cases:
  1348. dist1 = old_wminkowski(x, y, p=1, w=w)
  1349. assert_almost_equal(dist1, 3.0)
  1350. dist1p5 = old_wminkowski(x, y, p=1.5, w=w)
  1351. assert_almost_equal(dist1p5, (2.0**1.5+1.0)**(2./3))
  1352. dist2 = old_wminkowski(x, y, p=2, w=w)
  1353. assert_almost_equal(dist2, np.sqrt(5))
  1354. # test weights Issue #7893
  1355. arr = np.arange(4)
  1356. w = np.full_like(arr, 4)
  1357. assert_almost_equal(old_wminkowski(arr, arr + 1, p=2, w=w), 8.0)
  1358. assert_almost_equal(wminkowski(arr, arr + 1, p=2, w=w), 4.0)
  1359. def test_euclidean(self):
  1360. for x, y in self.cases:
  1361. dist = weuclidean(x, y)
  1362. assert_almost_equal(dist, np.sqrt(5))
  1363. def test_sqeuclidean(self):
  1364. for x, y in self.cases:
  1365. dist = wsqeuclidean(x, y)
  1366. assert_almost_equal(dist, 5.0)
  1367. def test_cosine(self):
  1368. for x, y in self.cases:
  1369. dist = wcosine(x, y)
  1370. assert_almost_equal(dist, 1.0 - 18.0 / (np.sqrt(14) * np.sqrt(27)))
  1371. def test_correlation(self):
  1372. xm = np.array([-1.0, 0, 1.0])
  1373. ym = np.array([-4.0 / 3, -4.0 / 3, 5.0 - 7.0 / 3])
  1374. for x, y in self.cases:
  1375. dist = wcorrelation(x, y)
  1376. assert_almost_equal(dist, 1.0 - np.dot(xm, ym) / (norm(xm) * norm(ym)))
  1377. def test_mahalanobis(self):
  1378. x = np.array([1.0, 2.0, 3.0])
  1379. y = np.array([1.0, 1.0, 5.0])
  1380. vi = np.array([[2.0, 1.0, 0.0], [1.0, 2.0, 1.0], [0.0, 1.0, 2.0]])
  1381. for x, y in self.cases:
  1382. dist = mahalanobis(x, y, vi)
  1383. assert_almost_equal(dist, np.sqrt(6.0))
  1384. class TestSquareForm(object):
  1385. checked_dtypes = [np.float64, np.float32, np.int32, np.int8, bool]
  1386. def test_squareform_matrix(self):
  1387. for dtype in self.checked_dtypes:
  1388. self.check_squareform_matrix(dtype)
  1389. def test_squareform_vector(self):
  1390. for dtype in self.checked_dtypes:
  1391. self.check_squareform_vector(dtype)
  1392. def check_squareform_matrix(self, dtype):
  1393. A = np.zeros((0, 0), dtype=dtype)
  1394. rA = squareform(A)
  1395. assert_equal(rA.shape, (0,))
  1396. assert_equal(rA.dtype, dtype)
  1397. A = np.zeros((1, 1), dtype=dtype)
  1398. rA = squareform(A)
  1399. assert_equal(rA.shape, (0,))
  1400. assert_equal(rA.dtype, dtype)
  1401. A = np.array([[0, 4.2], [4.2, 0]], dtype=dtype)
  1402. rA = squareform(A)
  1403. assert_equal(rA.shape, (1,))
  1404. assert_equal(rA.dtype, dtype)
  1405. assert_array_equal(rA, np.array([4.2], dtype=dtype))
  1406. def check_squareform_vector(self, dtype):
  1407. v = np.zeros((0,), dtype=dtype)
  1408. rv = squareform(v)
  1409. assert_equal(rv.shape, (1, 1))
  1410. assert_equal(rv.dtype, dtype)
  1411. assert_array_equal(rv, [[0]])
  1412. v = np.array([8.3], dtype=dtype)
  1413. rv = squareform(v)
  1414. assert_equal(rv.shape, (2, 2))
  1415. assert_equal(rv.dtype, dtype)
  1416. assert_array_equal(rv, np.array([[0, 8.3], [8.3, 0]], dtype=dtype))
  1417. def test_squareform_multi_matrix(self):
  1418. for n in xrange(2, 5):
  1419. self.check_squareform_multi_matrix(n)
  1420. def check_squareform_multi_matrix(self, n):
  1421. X = np.random.rand(n, 4)
  1422. Y = wpdist_no_const(X)
  1423. assert_equal(len(Y.shape), 1)
  1424. A = squareform(Y)
  1425. Yr = squareform(A)
  1426. s = A.shape
  1427. k = 0
  1428. if verbose >= 3:
  1429. print(A.shape, Y.shape, Yr.shape)
  1430. assert_equal(len(s), 2)
  1431. assert_equal(len(Yr.shape), 1)
  1432. assert_equal(s[0], s[1])
  1433. for i in xrange(0, s[0]):
  1434. for j in xrange(i + 1, s[1]):
  1435. if i != j:
  1436. assert_equal(A[i, j], Y[k])
  1437. k += 1
  1438. else:
  1439. assert_equal(A[i, j], 0)
  1440. class TestNumObsY(object):
  1441. def test_num_obs_y_multi_matrix(self):
  1442. for n in xrange(2, 10):
  1443. X = np.random.rand(n, 4)
  1444. Y = wpdist_no_const(X)
  1445. assert_equal(num_obs_y(Y), n)
  1446. def test_num_obs_y_1(self):
  1447. # Tests num_obs_y(y) on a condensed distance matrix over 1
  1448. # observations. Expecting exception.
  1449. assert_raises(ValueError, self.check_y, 1)
  1450. def test_num_obs_y_2(self):
  1451. # Tests num_obs_y(y) on a condensed distance matrix over 2
  1452. # observations.
  1453. assert_(self.check_y(2))
  1454. def test_num_obs_y_3(self):
  1455. assert_(self.check_y(3))
  1456. def test_num_obs_y_4(self):
  1457. assert_(self.check_y(4))
  1458. def test_num_obs_y_5_10(self):
  1459. for i in xrange(5, 16):
  1460. self.minit(i)
  1461. def test_num_obs_y_2_100(self):
  1462. # Tests num_obs_y(y) on 100 improper condensed distance matrices.
  1463. # Expecting exception.
  1464. a = set([])
  1465. for n in xrange(2, 16):
  1466. a.add(n * (n - 1) / 2)
  1467. for i in xrange(5, 105):
  1468. if i not in a:
  1469. assert_raises(ValueError, self.bad_y, i)
  1470. def minit(self, n):
  1471. assert_(self.check_y(n))
  1472. def bad_y(self, n):
  1473. y = np.random.rand(n)
  1474. return num_obs_y(y)
  1475. def check_y(self, n):
  1476. return num_obs_y(self.make_y(n)) == n
  1477. def make_y(self, n):
  1478. return np.random.rand((n * (n - 1)) // 2)
  1479. class TestNumObsDM(object):
  1480. def test_num_obs_dm_multi_matrix(self):
  1481. for n in xrange(1, 10):
  1482. X = np.random.rand(n, 4)
  1483. Y = wpdist_no_const(X)
  1484. A = squareform(Y)
  1485. if verbose >= 3:
  1486. print(A.shape, Y.shape)
  1487. assert_equal(num_obs_dm(A), n)
  1488. def test_num_obs_dm_0(self):
  1489. # Tests num_obs_dm(D) on a 0x0 distance matrix. Expecting exception.
  1490. assert_(self.check_D(0))
  1491. def test_num_obs_dm_1(self):
  1492. # Tests num_obs_dm(D) on a 1x1 distance matrix.
  1493. assert_(self.check_D(1))
  1494. def test_num_obs_dm_2(self):
  1495. assert_(self.check_D(2))
  1496. def test_num_obs_dm_3(self):
  1497. assert_(self.check_D(2))
  1498. def test_num_obs_dm_4(self):
  1499. assert_(self.check_D(4))
  1500. def check_D(self, n):
  1501. return num_obs_dm(self.make_D(n)) == n
  1502. def make_D(self, n):
  1503. return np.random.rand(n, n)
  1504. def is_valid_dm_throw(D):
  1505. return is_valid_dm(D, throw=True)
  1506. class TestIsValidDM(object):
  1507. def test_is_valid_dm_improper_shape_1D_E(self):
  1508. D = np.zeros((5,), dtype=np.double)
  1509. assert_raises(ValueError, is_valid_dm_throw, (D))
  1510. def test_is_valid_dm_improper_shape_1D_F(self):
  1511. D = np.zeros((5,), dtype=np.double)
  1512. assert_equal(is_valid_dm(D), False)
  1513. def test_is_valid_dm_improper_shape_3D_E(self):
  1514. D = np.zeros((3, 3, 3), dtype=np.double)
  1515. assert_raises(ValueError, is_valid_dm_throw, (D))
  1516. def test_is_valid_dm_improper_shape_3D_F(self):
  1517. D = np.zeros((3, 3, 3), dtype=np.double)
  1518. assert_equal(is_valid_dm(D), False)
  1519. def test_is_valid_dm_nonzero_diagonal_E(self):
  1520. y = np.random.rand(10)
  1521. D = squareform(y)
  1522. for i in xrange(0, 5):
  1523. D[i, i] = 2.0
  1524. assert_raises(ValueError, is_valid_dm_throw, (D))
  1525. def test_is_valid_dm_nonzero_diagonal_F(self):
  1526. y = np.random.rand(10)
  1527. D = squareform(y)
  1528. for i in xrange(0, 5):
  1529. D[i, i] = 2.0
  1530. assert_equal(is_valid_dm(D), False)
  1531. def test_is_valid_dm_asymmetric_E(self):
  1532. y = np.random.rand(10)
  1533. D = squareform(y)
  1534. D[1, 3] = D[3, 1] + 1
  1535. assert_raises(ValueError, is_valid_dm_throw, (D))
  1536. def test_is_valid_dm_asymmetric_F(self):
  1537. y = np.random.rand(10)
  1538. D = squareform(y)
  1539. D[1, 3] = D[3, 1] + 1
  1540. assert_equal(is_valid_dm(D), False)
  1541. def test_is_valid_dm_correct_1_by_1(self):
  1542. D = np.zeros((1, 1), dtype=np.double)
  1543. assert_equal(is_valid_dm(D), True)
  1544. def test_is_valid_dm_correct_2_by_2(self):
  1545. y = np.random.rand(1)
  1546. D = squareform(y)
  1547. assert_equal(is_valid_dm(D), True)
  1548. def test_is_valid_dm_correct_3_by_3(self):
  1549. y = np.random.rand(3)
  1550. D = squareform(y)
  1551. assert_equal(is_valid_dm(D), True)
  1552. def test_is_valid_dm_correct_4_by_4(self):
  1553. y = np.random.rand(6)
  1554. D = squareform(y)
  1555. assert_equal(is_valid_dm(D), True)
  1556. def test_is_valid_dm_correct_5_by_5(self):
  1557. y = np.random.rand(10)
  1558. D = squareform(y)
  1559. assert_equal(is_valid_dm(D), True)
  1560. def is_valid_y_throw(y):
  1561. return is_valid_y(y, throw=True)
  1562. class TestIsValidY(object):
  1563. # If test case name ends on "_E" then an exception is expected for the
  1564. # given input, if it ends in "_F" then False is expected for the is_valid_y
  1565. # check. Otherwise the input is expected to be valid.
  1566. def test_is_valid_y_improper_shape_2D_E(self):
  1567. y = np.zeros((3, 3,), dtype=np.double)
  1568. assert_raises(ValueError, is_valid_y_throw, (y))
  1569. def test_is_valid_y_improper_shape_2D_F(self):
  1570. y = np.zeros((3, 3,), dtype=np.double)
  1571. assert_equal(is_valid_y(y), False)
  1572. def test_is_valid_y_improper_shape_3D_E(self):
  1573. y = np.zeros((3, 3, 3), dtype=np.double)
  1574. assert_raises(ValueError, is_valid_y_throw, (y))
  1575. def test_is_valid_y_improper_shape_3D_F(self):
  1576. y = np.zeros((3, 3, 3), dtype=np.double)
  1577. assert_equal(is_valid_y(y), False)
  1578. def test_is_valid_y_correct_2_by_2(self):
  1579. y = self.correct_n_by_n(2)
  1580. assert_equal(is_valid_y(y), True)
  1581. def test_is_valid_y_correct_3_by_3(self):
  1582. y = self.correct_n_by_n(3)
  1583. assert_equal(is_valid_y(y), True)
  1584. def test_is_valid_y_correct_4_by_4(self):
  1585. y = self.correct_n_by_n(4)
  1586. assert_equal(is_valid_y(y), True)
  1587. def test_is_valid_y_correct_5_by_5(self):
  1588. y = self.correct_n_by_n(5)
  1589. assert_equal(is_valid_y(y), True)
  1590. def test_is_valid_y_2_100(self):
  1591. a = set([])
  1592. for n in xrange(2, 16):
  1593. a.add(n * (n - 1) / 2)
  1594. for i in xrange(5, 105):
  1595. if i not in a:
  1596. assert_raises(ValueError, self.bad_y, i)
  1597. def bad_y(self, n):
  1598. y = np.random.rand(n)
  1599. return is_valid_y(y, throw=True)
  1600. def correct_n_by_n(self, n):
  1601. y = np.random.rand((n * (n - 1)) // 2)
  1602. return y
  1603. def test_bad_p():
  1604. # Raise ValueError if p < 1.
  1605. p = 0.5
  1606. with suppress_warnings() as w:
  1607. w.filter(message="`wminkowski` is deprecated")
  1608. assert_raises(ValueError, wminkowski, [1, 2], [3, 4], p)
  1609. assert_raises(ValueError, wminkowski, [1, 2], [3, 4], p, [1, 1])
  1610. def test_sokalsneath_all_false():
  1611. # Regression test for ticket #876
  1612. assert_raises(ValueError, sokalsneath, [False, False, False], [False, False, False])
  1613. def test_canberra():
  1614. # Regression test for ticket #1430.
  1615. assert_equal(wcanberra([1, 2, 3], [2, 4, 6]), 1)
  1616. assert_equal(wcanberra([1, 1, 0, 0], [1, 0, 1, 0]), 2)
  1617. def test_braycurtis():
  1618. # Regression test for ticket #1430.
  1619. assert_almost_equal(wbraycurtis([1, 2, 3], [2, 4, 6]), 1. / 3, decimal=15)
  1620. assert_almost_equal(wbraycurtis([1, 1, 0, 0], [1, 0, 1, 0]), 0.5, decimal=15)
  1621. def test_euclideans():
  1622. # Regression test for ticket #1328.
  1623. x1 = np.array([1, 1, 1])
  1624. x2 = np.array([0, 0, 0])
  1625. # Basic test of the calculation.
  1626. assert_almost_equal(wsqeuclidean(x1, x2), 3.0, decimal=14)
  1627. assert_almost_equal(weuclidean(x1, x2), np.sqrt(3), decimal=14)
  1628. # Check flattening for (1, N) or (N, 1) inputs
  1629. assert_almost_equal(weuclidean(x1[np.newaxis, :], x2[np.newaxis, :]),
  1630. np.sqrt(3), decimal=14)
  1631. assert_almost_equal(wsqeuclidean(x1[np.newaxis, :], x2[np.newaxis, :]),
  1632. 3.0, decimal=14)
  1633. assert_almost_equal(wsqeuclidean(x1[:, np.newaxis], x2[:, np.newaxis]),
  1634. 3.0, decimal=14)
  1635. # Distance metrics only defined for vectors (= 1-D)
  1636. x = np.arange(4).reshape(2, 2)
  1637. assert_raises(ValueError, weuclidean, x, x)
  1638. assert_raises(ValueError, wsqeuclidean, x, x)
  1639. # Another check, with random data.
  1640. rs = np.random.RandomState(1234567890)
  1641. x = rs.rand(10)
  1642. y = rs.rand(10)
  1643. d1 = weuclidean(x, y)
  1644. d2 = wsqeuclidean(x, y)
  1645. assert_almost_equal(d1**2, d2, decimal=14)
  1646. def test_hamming_unequal_length():
  1647. # Regression test for gh-4290.
  1648. x = [0, 0, 1]
  1649. y = [1, 0, 1, 0]
  1650. # Used to give an AttributeError from ndarray.mean called on bool
  1651. assert_raises(ValueError, whamming, x, y)
  1652. def test_hamming_string_array():
  1653. # https://github.com/scikit-learn/scikit-learn/issues/4014
  1654. a = np.array(['eggs', 'spam', 'spam', 'eggs', 'spam', 'spam', 'spam',
  1655. 'spam', 'spam', 'spam', 'spam', 'eggs', 'eggs', 'spam',
  1656. 'eggs', 'eggs', 'eggs', 'eggs', 'eggs', 'spam'],
  1657. dtype='|S4')
  1658. b = np.array(['eggs', 'spam', 'spam', 'eggs', 'eggs', 'spam', 'spam',
  1659. 'spam', 'spam', 'eggs', 'spam', 'eggs', 'spam', 'eggs',
  1660. 'spam', 'spam', 'eggs', 'spam', 'spam', 'eggs'],
  1661. dtype='|S4')
  1662. desired = 0.45
  1663. assert_allclose(whamming(a, b), desired)
  1664. def test_minkowski_w():
  1665. # Regression test for gh-8142.
  1666. arr_in = np.array([[83.33333333, 100., 83.33333333, 100., 36.,
  1667. 60., 90., 150., 24., 48.],
  1668. [83.33333333, 100., 83.33333333, 100., 36.,
  1669. 60., 90., 150., 24., 48.]])
  1670. p0 = pdist(arr_in, metric='minkowski', p=1, w=None)
  1671. c0 = cdist(arr_in, arr_in, metric='minkowski', p=1, w=None)
  1672. p1 = pdist(arr_in, metric='minkowski', p=1)
  1673. c1 = cdist(arr_in, arr_in, metric='minkowski', p=1)
  1674. assert_allclose(p0, p1, rtol=1e-15)
  1675. assert_allclose(c0, c1, rtol=1e-15)
  1676. def test_sqeuclidean_dtypes():
  1677. # Assert that sqeuclidean returns the right types of values.
  1678. # Integer types should be converted to floating for stability.
  1679. # Floating point types should be the same as the input.
  1680. x = [1, 2, 3]
  1681. y = [4, 5, 6]
  1682. for dtype in [np.int8, np.int16, np.int32, np.int64]:
  1683. d = wsqeuclidean(np.asarray(x, dtype=dtype), np.asarray(y, dtype=dtype))
  1684. assert_(np.issubdtype(d.dtype, np.floating))
  1685. for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
  1686. d1 = wsqeuclidean([0], np.asarray([-1], dtype=dtype))
  1687. d2 = wsqeuclidean(np.asarray([-1], dtype=dtype), [0])
  1688. assert_equal(d1, d2)
  1689. assert_equal(d1, np.float64(np.iinfo(dtype).max)**2)
  1690. dtypes = [np.float32, np.float64, np.complex64, np.complex128]
  1691. for dtype in ['float16', 'float128']:
  1692. # These aren't present in older numpy versions; float128 may also not
  1693. # be present on all platforms.
  1694. if hasattr(np, dtype):
  1695. dtypes.append(getattr(np, dtype))
  1696. for dtype in dtypes:
  1697. d = wsqeuclidean(np.asarray(x, dtype=dtype), np.asarray(y, dtype=dtype))
  1698. assert_equal(d.dtype, dtype)
  1699. def test_sokalmichener():
  1700. # Test that sokalmichener has the same result for bool and int inputs.
  1701. p = [True, True, False]
  1702. q = [True, False, True]
  1703. x = [int(b) for b in p]
  1704. y = [int(b) for b in q]
  1705. dist1 = sokalmichener(p, q)
  1706. dist2 = sokalmichener(x, y)
  1707. # These should be exactly the same.
  1708. assert_equal(dist1, dist2)
  1709. def test_modifies_input():
  1710. # test whether cdist or pdist modifies input arrays
  1711. X1 = np.asarray([[1., 2., 3.],
  1712. [1.2, 2.3, 3.4],
  1713. [2.2, 2.3, 4.4],
  1714. [22.2, 23.3, 44.4]])
  1715. X1_copy = X1.copy()
  1716. with suppress_warnings() as w:
  1717. w.filter(message="`wminkowski` is deprecated")
  1718. for metric in _METRICS_NAMES:
  1719. kwargs = {"w": 1.0 / X1.std(axis=0)} if metric == "wminkowski" else {}
  1720. cdist(X1, X1, metric, **kwargs)
  1721. pdist(X1, metric, **kwargs)
  1722. assert_array_equal(X1, X1_copy)
  1723. def test_Xdist_deprecated_args():
  1724. # testing both cdist and pdist deprecated warnings
  1725. X1 = np.asarray([[1., 2., 3.],
  1726. [1.2, 2.3, 3.4],
  1727. [2.2, 2.3, 4.4],
  1728. [22.2, 23.3, 44.4]])
  1729. weights = np.arange(3)
  1730. warn_msg_kwargs = "Got unexpected kwarg"
  1731. warn_msg_args = "[0-9]* metric parameters have been passed as positional"
  1732. for metric in _METRICS_NAMES:
  1733. kwargs = {"w": weights} if metric == "wminkowski" else dict()
  1734. with suppress_warnings() as w:
  1735. log = w.record(message=warn_msg_args)
  1736. w.filter(message=warn_msg_kwargs)
  1737. w.filter(message="`wminkowski` is deprecated")
  1738. cdist(X1, X1, metric, 2., **kwargs)
  1739. pdist(X1, metric, 2., **kwargs)
  1740. assert_(len(log) == 2)
  1741. for arg in ["p", "V", "VI"]:
  1742. kwargs = {arg:"foo"}
  1743. if metric == "wminkowski":
  1744. if "p" in kwargs or "w" in kwargs:
  1745. continue
  1746. kwargs["w"] = weights
  1747. if((arg == "V" and metric == "seuclidean") or
  1748. (arg == "VI" and metric == "mahalanobis") or
  1749. (arg == "p" and metric == "minkowski")):
  1750. continue
  1751. with suppress_warnings() as w:
  1752. log = w.record(message=warn_msg_kwargs)
  1753. w.filter(message="`wminkowski` is deprecated")
  1754. cdist(X1, X1, metric, **kwargs)
  1755. pdist(X1, metric, **kwargs)
  1756. assert_(len(log) == 2)
  1757. def test_Xdist_non_negative_weights():
  1758. X = eo['random-float32-data'][::5, ::2]
  1759. w = np.ones(X.shape[1])
  1760. w[::5] = -w[::5]
  1761. for metric in _METRICS_NAMES:
  1762. if metric in ['seuclidean', 'mahalanobis', 'jensenshannon']:
  1763. continue
  1764. for m in [metric, eval(metric), "test_" + metric]:
  1765. assert_raises(ValueError, pdist, X, m, w=w)
  1766. assert_raises(ValueError, cdist, X, X, m, w=w)
  1767. def test__validate_vector():
  1768. x = [1, 2, 3]
  1769. y = _validate_vector(x)
  1770. assert_array_equal(y, x)
  1771. y = _validate_vector(x, dtype=np.float64)
  1772. assert_array_equal(y, x)
  1773. assert_equal(y.dtype, np.float64)
  1774. x = [1]
  1775. y = _validate_vector(x)
  1776. assert_equal(y.ndim, 1)
  1777. assert_equal(y, x)
  1778. x = 1
  1779. y = _validate_vector(x)
  1780. assert_equal(y.ndim, 1)
  1781. assert_equal(y, [x])
  1782. x = np.arange(5).reshape(1, -1, 1)
  1783. y = _validate_vector(x)
  1784. assert_equal(y.ndim, 1)
  1785. assert_array_equal(y, x[0, :, 0])
  1786. x = [[1, 2], [3, 4]]
  1787. assert_raises(ValueError, _validate_vector, x)