_sketches.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. """ Sketching-based Matrix Computations """
  2. # Author: Jordi Montes <jomsdev@gmail.com>
  3. # August 28, 2017
  4. from __future__ import division, print_function, absolute_import
  5. import numpy as np
  6. from scipy._lib._util import check_random_state
  7. __all__ = ['clarkson_woodruff_transform']
  8. def cwt_matrix(n_rows, n_columns, seed=None):
  9. r""""
  10. Generate a matrix S for the Clarkson-Woodruff sketch.
  11. Given the desired size of matrix, the method returns a matrix S of size
  12. (n_rows, n_columns) where each column has all the entries set to 0 less one
  13. position which has been randomly set to +1 or -1 with equal probability.
  14. Parameters
  15. ----------
  16. n_rows: int
  17. Number of rows of S
  18. n_columns: int
  19. Number of columns of S
  20. seed : None or int or `numpy.random.RandomState` instance, optional
  21. This parameter defines the ``RandomState`` object to use for drawing
  22. random variates.
  23. If None (or ``np.random``), the global ``np.random`` state is used.
  24. If integer, it is used to seed the local ``RandomState`` instance.
  25. Default is None.
  26. Returns
  27. -------
  28. S : (n_rows, n_columns) array_like
  29. Notes
  30. -----
  31. Given a matrix A, with probability at least 9/10,
  32. .. math:: ||SA|| == (1 \pm \epsilon)||A||
  33. Where epsilon is related to the size of S
  34. """
  35. S = np.zeros((n_rows, n_columns))
  36. nz_positions = np.random.randint(0, n_rows, n_columns)
  37. rng = check_random_state(seed)
  38. values = rng.choice([1, -1], n_columns)
  39. for i in range(n_columns):
  40. S[nz_positions[i]][i] = values[i]
  41. return S
  42. def clarkson_woodruff_transform(input_matrix, sketch_size, seed=None):
  43. r""""
  44. Find low-rank matrix approximation via the Clarkson-Woodruff Transform.
  45. Given an input_matrix ``A`` of size ``(n, d)``, compute a matrix ``A'`` of
  46. size (sketch_size, d) which holds:
  47. .. math:: ||Ax|| = (1 \pm \epsilon)||A'x||
  48. with high probability.
  49. The error is related to the number of rows of the sketch and it is bounded
  50. .. math:: poly(r(\epsilon^{-1}))
  51. Parameters
  52. ----------
  53. input_matrix: array_like
  54. Input matrix, of shape ``(n, d)``.
  55. sketch_size: int
  56. Number of rows for the sketch.
  57. seed : None or int or `numpy.random.RandomState` instance, optional
  58. This parameter defines the ``RandomState`` object to use for drawing
  59. random variates.
  60. If None (or ``np.random``), the global ``np.random`` state is used.
  61. If integer, it is used to seed the local ``RandomState`` instance.
  62. Default is None.
  63. Returns
  64. -------
  65. A' : array_like
  66. Sketch of the input matrix ``A``, of size ``(sketch_size, d)``.
  67. Notes
  68. -----
  69. This is an implementation of the Clarkson-Woodruff Transform (CountSketch).
  70. ``A'`` can be computed in principle in ``O(nnz(A))`` (with ``nnz`` meaning
  71. the number of nonzero entries), however we don't take advantage of sparse
  72. matrices in this implementation.
  73. Examples
  74. --------
  75. Given a big dense matrix ``A``:
  76. >>> from scipy import linalg
  77. >>> n_rows, n_columns, sketch_n_rows = (2000, 100, 100)
  78. >>> threshold = 0.1
  79. >>> tmp = np.random.normal(0, 0.1, n_rows*n_columns)
  80. >>> A = np.reshape(tmp, (n_rows, n_columns))
  81. >>> sketch = linalg.clarkson_woodruff_transform(A, sketch_n_rows)
  82. >>> sketch.shape
  83. (100, 100)
  84. >>> normA = linalg.norm(A)
  85. >>> norm_sketch = linalg.norm(sketch)
  86. Now with high probability, the condition ``abs(normA-normSketch) <
  87. threshold`` holds.
  88. References
  89. ----------
  90. .. [1] Kenneth L. Clarkson and David P. Woodruff. Low rank approximation and
  91. regression in input sparsity time. In STOC, 2013.
  92. """
  93. S = cwt_matrix(sketch_size, input_matrix.shape[0], seed)
  94. return np.dot(S, input_matrix)