_procrustes.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. """
  2. This module provides functions to perform full Procrustes analysis.
  3. This code was originally written by Justin Kucynski and ported over from
  4. scikit-bio by Yoshiki Vazquez-Baeza.
  5. """
  6. from __future__ import absolute_import, division, print_function
  7. import numpy as np
  8. from scipy.linalg import orthogonal_procrustes
  9. __all__ = ['procrustes']
  10. def procrustes(data1, data2):
  11. r"""Procrustes analysis, a similarity test for two data sets.
  12. Each input matrix is a set of points or vectors (the rows of the matrix).
  13. The dimension of the space is the number of columns of each matrix. Given
  14. two identically sized matrices, procrustes standardizes both such that:
  15. - :math:`tr(AA^{T}) = 1`.
  16. - Both sets of points are centered around the origin.
  17. Procrustes ([1]_, [2]_) then applies the optimal transform to the second
  18. matrix (including scaling/dilation, rotations, and reflections) to minimize
  19. :math:`M^{2}=\sum(data1-data2)^{2}`, or the sum of the squares of the
  20. pointwise differences between the two input datasets.
  21. This function was not designed to handle datasets with different numbers of
  22. datapoints (rows). If two data sets have different dimensionality
  23. (different number of columns), simply add columns of zeros to the smaller
  24. of the two.
  25. Parameters
  26. ----------
  27. data1 : array_like
  28. Matrix, n rows represent points in k (columns) space `data1` is the
  29. reference data, after it is standardised, the data from `data2` will be
  30. transformed to fit the pattern in `data1` (must have >1 unique points).
  31. data2 : array_like
  32. n rows of data in k space to be fit to `data1`. Must be the same
  33. shape ``(numrows, numcols)`` as data1 (must have >1 unique points).
  34. Returns
  35. -------
  36. mtx1 : array_like
  37. A standardized version of `data1`.
  38. mtx2 : array_like
  39. The orientation of `data2` that best fits `data1`. Centered, but not
  40. necessarily :math:`tr(AA^{T}) = 1`.
  41. disparity : float
  42. :math:`M^{2}` as defined above.
  43. Raises
  44. ------
  45. ValueError
  46. If the input arrays are not two-dimensional.
  47. If the shape of the input arrays is different.
  48. If the input arrays have zero columns or zero rows.
  49. See Also
  50. --------
  51. scipy.linalg.orthogonal_procrustes
  52. scipy.spatial.distance.directed_hausdorff : Another similarity test
  53. for two data sets
  54. Notes
  55. -----
  56. - The disparity should not depend on the order of the input matrices, but
  57. the output matrices will, as only the first output matrix is guaranteed
  58. to be scaled such that :math:`tr(AA^{T}) = 1`.
  59. - Duplicate data points are generally ok, duplicating a data point will
  60. increase its effect on the procrustes fit.
  61. - The disparity scales as the number of points per input matrix.
  62. References
  63. ----------
  64. .. [1] Krzanowski, W. J. (2000). "Principles of Multivariate analysis".
  65. .. [2] Gower, J. C. (1975). "Generalized procrustes analysis".
  66. Examples
  67. --------
  68. >>> from scipy.spatial import procrustes
  69. The matrix ``b`` is a rotated, shifted, scaled and mirrored version of
  70. ``a`` here:
  71. >>> a = np.array([[1, 3], [1, 2], [1, 1], [2, 1]], 'd')
  72. >>> b = np.array([[4, -2], [4, -4], [4, -6], [2, -6]], 'd')
  73. >>> mtx1, mtx2, disparity = procrustes(a, b)
  74. >>> round(disparity)
  75. 0.0
  76. """
  77. mtx1 = np.array(data1, dtype=np.double, copy=True)
  78. mtx2 = np.array(data2, dtype=np.double, copy=True)
  79. if mtx1.ndim != 2 or mtx2.ndim != 2:
  80. raise ValueError("Input matrices must be two-dimensional")
  81. if mtx1.shape != mtx2.shape:
  82. raise ValueError("Input matrices must be of same shape")
  83. if mtx1.size == 0:
  84. raise ValueError("Input matrices must be >0 rows and >0 cols")
  85. # translate all the data to the origin
  86. mtx1 -= np.mean(mtx1, 0)
  87. mtx2 -= np.mean(mtx2, 0)
  88. norm1 = np.linalg.norm(mtx1)
  89. norm2 = np.linalg.norm(mtx2)
  90. if norm1 == 0 or norm2 == 0:
  91. raise ValueError("Input matrices must contain >1 unique points")
  92. # change scaling of data (in rows) such that trace(mtx*mtx') = 1
  93. mtx1 /= norm1
  94. mtx2 /= norm2
  95. # transform mtx2 to minimize disparity
  96. R, s = orthogonal_procrustes(mtx1, mtx2)
  97. mtx2 = np.dot(mtx2, R.T) * s
  98. # measure the dissimilarity between the two datasets
  99. disparity = np.sum(np.square(mtx1 - mtx2))
  100. return mtx1, mtx2, disparity