_trustregion_exact.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. """Nearly exact trust-region optimization subproblem."""
  2. from __future__ import division, print_function, absolute_import
  3. import numpy as np
  4. from scipy.linalg import (norm, get_lapack_funcs, solve_triangular,
  5. cho_solve)
  6. from ._trustregion import (_minimize_trust_region, BaseQuadraticSubproblem)
  7. __all__ = ['_minimize_trustregion_exact',
  8. 'estimate_smallest_singular_value',
  9. 'singular_leading_submatrix',
  10. 'IterativeSubproblem']
  11. def _minimize_trustregion_exact(fun, x0, args=(), jac=None, hess=None,
  12. **trust_region_options):
  13. """
  14. Minimization of scalar function of one or more variables using
  15. a nearly exact trust-region algorithm.
  16. Options
  17. -------
  18. initial_tr_radius : float
  19. Initial trust-region radius.
  20. max_tr_radius : float
  21. Maximum value of the trust-region radius. No steps that are longer
  22. than this value will be proposed.
  23. eta : float
  24. Trust region related acceptance stringency for proposed steps.
  25. gtol : float
  26. Gradient norm must be less than ``gtol`` before successful
  27. termination.
  28. """
  29. if jac is None:
  30. raise ValueError('Jacobian is required for trust region '
  31. 'exact minimization.')
  32. if hess is None:
  33. raise ValueError('Hessian matrix is required for trust region '
  34. 'exact minimization.')
  35. return _minimize_trust_region(fun, x0, args=args, jac=jac, hess=hess,
  36. subproblem=IterativeSubproblem,
  37. **trust_region_options)
  38. def estimate_smallest_singular_value(U):
  39. """Given upper triangular matrix ``U`` estimate the smallest singular
  40. value and the correspondent right singular vector in O(n**2) operations.
  41. Parameters
  42. ----------
  43. U : ndarray
  44. Square upper triangular matrix.
  45. Returns
  46. -------
  47. s_min : float
  48. Estimated smallest singular value of the provided matrix.
  49. z_min : ndarray
  50. Estimatied right singular vector.
  51. Notes
  52. -----
  53. The procedure is based on [1]_ and is done in two steps. First it finds
  54. a vector ``e`` with components selected from {+1, -1} such that the
  55. solution ``w`` from the system ``U.T w = e`` is as large as possible.
  56. Next it estimate ``U v = w``. The smallest singular value is close
  57. to ``norm(w)/norm(v)`` and the right singular vector is close
  58. to ``v/norm(v)``.
  59. The estimation will be better more ill-conditioned is the matrix.
  60. References
  61. ----------
  62. .. [1] Cline, A. K., Moler, C. B., Stewart, G. W., Wilkinson, J. H.
  63. An estimate for the condition number of a matrix. 1979.
  64. SIAM Journal on Numerical Analysis, 16(2), 368-375.
  65. """
  66. U = np.atleast_2d(U)
  67. m, n = U.shape
  68. if m != n:
  69. raise ValueError("A square triangular matrix should be provided.")
  70. # A vector `e` with components selected from {+1, -1}
  71. # is selected so that the solution `w` to the system
  72. # `U.T w = e` is as large as possible. Implementation
  73. # based on algorithm 3.5.1, p. 142, from reference [2]
  74. # adapted for lower triangular matrix.
  75. p = np.zeros(n)
  76. w = np.empty(n)
  77. # Implemented according to: Golub, G. H., Van Loan, C. F. (2013).
  78. # "Matrix computations". Forth Edition. JHU press. pp. 140-142.
  79. for k in range(n):
  80. wp = (1-p[k]) / U.T[k, k]
  81. wm = (-1-p[k]) / U.T[k, k]
  82. pp = p[k+1:] + U.T[k+1:, k]*wp
  83. pm = p[k+1:] + U.T[k+1:, k]*wm
  84. if abs(wp) + norm(pp, 1) >= abs(wm) + norm(pm, 1):
  85. w[k] = wp
  86. p[k+1:] = pp
  87. else:
  88. w[k] = wm
  89. p[k+1:] = pm
  90. # The system `U v = w` is solved using backward substitution.
  91. v = solve_triangular(U, w)
  92. v_norm = norm(v)
  93. w_norm = norm(w)
  94. # Smallest singular value
  95. s_min = w_norm / v_norm
  96. # Associated vector
  97. z_min = v / v_norm
  98. return s_min, z_min
  99. def gershgorin_bounds(H):
  100. """
  101. Given a square matrix ``H`` compute upper
  102. and lower bounds for its eigenvalues (Gregoshgorin Bounds).
  103. Defined ref. [1].
  104. References
  105. ----------
  106. .. [1] Conn, A. R., Gould, N. I., & Toint, P. L.
  107. Trust region methods. 2000. Siam. pp. 19.
  108. """
  109. H_diag = np.diag(H)
  110. H_diag_abs = np.abs(H_diag)
  111. H_row_sums = np.sum(np.abs(H), axis=1)
  112. lb = np.min(H_diag + H_diag_abs - H_row_sums)
  113. ub = np.max(H_diag - H_diag_abs + H_row_sums)
  114. return lb, ub
  115. def singular_leading_submatrix(A, U, k):
  116. """
  117. Compute term that makes the leading ``k`` by ``k``
  118. submatrix from ``A`` singular.
  119. Parameters
  120. ----------
  121. A : ndarray
  122. Symmetric matrix that is not positive definite.
  123. U : ndarray
  124. Upper triangular matrix resulting of an incomplete
  125. Cholesky decomposition of matrix ``A``.
  126. k : int
  127. Positive integer such that the leading k by k submatrix from
  128. `A` is the first non-positive definite leading submatrix.
  129. Returns
  130. -------
  131. delta : float
  132. Amount that should be added to the element (k, k) of the
  133. leading k by k submatrix of ``A`` to make it singular.
  134. v : ndarray
  135. A vector such that ``v.T B v = 0``. Where B is the matrix A after
  136. ``delta`` is added to its element (k, k).
  137. """
  138. # Compute delta
  139. delta = np.sum(U[:k-1, k-1]**2) - A[k-1, k-1]
  140. n = len(A)
  141. # Inicialize v
  142. v = np.zeros(n)
  143. v[k-1] = 1
  144. # Compute the remaining values of v by solving a triangular system.
  145. if k != 1:
  146. v[:k-1] = solve_triangular(U[:k-1, :k-1], -U[:k-1, k-1])
  147. return delta, v
  148. class IterativeSubproblem(BaseQuadraticSubproblem):
  149. """Quadratic subproblem solved by nearly exact iterative method.
  150. Notes
  151. -----
  152. This subproblem solver was based on [1]_, [2]_ and [3]_,
  153. which implement similar algorithms. The algorithm is basically
  154. that of [1]_ but ideas from [2]_ and [3]_ were also used.
  155. References
  156. ----------
  157. .. [1] A.R. Conn, N.I. Gould, and P.L. Toint, "Trust region methods",
  158. Siam, pp. 169-200, 2000.
  159. .. [2] J. Nocedal and S. Wright, "Numerical optimization",
  160. Springer Science & Business Media. pp. 83-91, 2006.
  161. .. [3] J.J. More and D.C. Sorensen, "Computing a trust region step",
  162. SIAM Journal on Scientific and Statistical Computing, vol. 4(3),
  163. pp. 553-572, 1983.
  164. """
  165. # UPDATE_COEFF appears in reference [1]_
  166. # in formula 7.3.14 (p. 190) named as "theta".
  167. # As recommended there it value is fixed in 0.01.
  168. UPDATE_COEFF = 0.01
  169. EPS = np.finfo(float).eps
  170. def __init__(self, x, fun, jac, hess, hessp=None,
  171. k_easy=0.1, k_hard=0.2):
  172. super(IterativeSubproblem, self).__init__(x, fun, jac, hess)
  173. # When the trust-region shrinks in two consecutive
  174. # calculations (``tr_radius < previous_tr_radius``)
  175. # the lower bound ``lambda_lb`` may be reused,
  176. # facilitating the convergence. To indicate no
  177. # previous value is known at first ``previous_tr_radius``
  178. # is set to -1 and ``lambda_lb`` to None.
  179. self.previous_tr_radius = -1
  180. self.lambda_lb = None
  181. self.niter = 0
  182. # ``k_easy`` and ``k_hard`` are parameters used
  183. # to determine the stop criteria to the iterative
  184. # subproblem solver. Take a look at pp. 194-197
  185. # from reference _[1] for a more detailed description.
  186. self.k_easy = k_easy
  187. self.k_hard = k_hard
  188. # Get Lapack function for cholesky decomposition.
  189. # The implemented Scipy wrapper does not return
  190. # the incomplete factorization needed by the method.
  191. self.cholesky, = get_lapack_funcs(('potrf',), (self.hess,))
  192. # Get info about Hessian
  193. self.dimension = len(self.hess)
  194. self.hess_gershgorin_lb,\
  195. self.hess_gershgorin_ub = gershgorin_bounds(self.hess)
  196. self.hess_inf = norm(self.hess, np.Inf)
  197. self.hess_fro = norm(self.hess, 'fro')
  198. # A constant such that for vectors smaler than that
  199. # backward substituition is not reliable. It was stabilished
  200. # based on Golub, G. H., Van Loan, C. F. (2013).
  201. # "Matrix computations". Forth Edition. JHU press., p.165.
  202. self.CLOSE_TO_ZERO = self.dimension * self.EPS * self.hess_inf
  203. def _initial_values(self, tr_radius):
  204. """Given a trust radius, return a good initial guess for
  205. the damping factor, the lower bound and the upper bound.
  206. The values were chosen accordingly to the guidelines on
  207. section 7.3.8 (p. 192) from [1]_.
  208. """
  209. # Upper bound for the damping factor
  210. lambda_ub = max(0, self.jac_mag/tr_radius + min(-self.hess_gershgorin_lb,
  211. self.hess_fro,
  212. self.hess_inf))
  213. # Lower bound for the damping factor
  214. lambda_lb = max(0, -min(self.hess.diagonal()),
  215. self.jac_mag/tr_radius - min(self.hess_gershgorin_ub,
  216. self.hess_fro,
  217. self.hess_inf))
  218. # Improve bounds with previous info
  219. if tr_radius < self.previous_tr_radius:
  220. lambda_lb = max(self.lambda_lb, lambda_lb)
  221. # Initial guess for the damping factor
  222. if lambda_lb == 0:
  223. lambda_initial = 0
  224. else:
  225. lambda_initial = max(np.sqrt(lambda_lb * lambda_ub),
  226. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  227. return lambda_initial, lambda_lb, lambda_ub
  228. def solve(self, tr_radius):
  229. """Solve quadratic subproblem"""
  230. lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
  231. n = self.dimension
  232. hits_boundary = True
  233. already_factorized = False
  234. self.niter = 0
  235. while True:
  236. # Compute Cholesky factorization
  237. if already_factorized:
  238. already_factorized = False
  239. else:
  240. H = self.hess+lambda_current*np.eye(n)
  241. U, info = self.cholesky(H, lower=False,
  242. overwrite_a=False,
  243. clean=True)
  244. self.niter += 1
  245. # Check if factorization succeeded
  246. if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
  247. # Successful factorization
  248. # Solve `U.T U p = s`
  249. p = cho_solve((U, False), -self.jac)
  250. p_norm = norm(p)
  251. # Check for interior convergence
  252. if p_norm <= tr_radius and lambda_current == 0:
  253. hits_boundary = False
  254. break
  255. # Solve `U.T w = p`
  256. w = solve_triangular(U, p, trans='T')
  257. w_norm = norm(w)
  258. # Compute Newton step accordingly to
  259. # formula (4.44) p.87 from ref [2]_.
  260. delta_lambda = (p_norm/w_norm)**2 * (p_norm-tr_radius)/tr_radius
  261. lambda_new = lambda_current + delta_lambda
  262. if p_norm < tr_radius: # Inside boundary
  263. s_min, z_min = estimate_smallest_singular_value(U)
  264. ta, tb = self.get_boundaries_intersections(p, z_min,
  265. tr_radius)
  266. # Choose `step_len` with the smallest magnitude.
  267. # The reason for this choice is explained at
  268. # ref [3]_, p. 6 (Immediately before the formula
  269. # for `tau`).
  270. step_len = min([ta, tb], key=abs)
  271. # Compute the quadratic term (p.T*H*p)
  272. quadratic_term = np.dot(p, np.dot(H, p))
  273. # Check stop criteria
  274. relative_error = (step_len**2 * s_min**2) / (quadratic_term + lambda_current*tr_radius**2)
  275. if relative_error <= self.k_hard:
  276. p += step_len * z_min
  277. break
  278. # Update uncertanty bounds
  279. lambda_ub = lambda_current
  280. lambda_lb = max(lambda_lb, lambda_current - s_min**2)
  281. # Compute Cholesky factorization
  282. H = self.hess + lambda_new*np.eye(n)
  283. c, info = self.cholesky(H, lower=False,
  284. overwrite_a=False,
  285. clean=True)
  286. # Check if the factorization have succeeded
  287. #
  288. if info == 0: # Successful factorization
  289. # Update damping factor
  290. lambda_current = lambda_new
  291. already_factorized = True
  292. else: # Unsuccessful factorization
  293. # Update uncertanty bounds
  294. lambda_lb = max(lambda_lb, lambda_new)
  295. # Update damping factor
  296. lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
  297. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  298. else: # Outside boundary
  299. # Check stop criteria
  300. relative_error = abs(p_norm - tr_radius) / tr_radius
  301. if relative_error <= self.k_easy:
  302. break
  303. # Update uncertanty bounds
  304. lambda_lb = lambda_current
  305. # Update damping factor
  306. lambda_current = lambda_new
  307. elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
  308. # jac_mag very close to zero
  309. # Check for interior convergence
  310. if lambda_current == 0:
  311. p = np.zeros(n)
  312. hits_boundary = False
  313. break
  314. s_min, z_min = estimate_smallest_singular_value(U)
  315. step_len = tr_radius
  316. # Check stop criteria
  317. if step_len**2 * s_min**2 <= self.k_hard * lambda_current * tr_radius**2:
  318. p = step_len * z_min
  319. break
  320. # Update uncertanty bounds
  321. lambda_ub = lambda_current
  322. lambda_lb = max(lambda_lb, lambda_current - s_min**2)
  323. # Update damping factor
  324. lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
  325. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  326. else: # Unsuccessful factorization
  327. # Compute auxiliary terms
  328. delta, v = singular_leading_submatrix(H, U, info)
  329. v_norm = norm(v)
  330. # Update uncertanty interval
  331. lambda_lb = max(lambda_lb, lambda_current + delta/v_norm**2)
  332. # Update damping factor
  333. lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
  334. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  335. self.lambda_lb = lambda_lb
  336. self.lambda_current = lambda_current
  337. self.previous_tr_radius = tr_radius
  338. return p, hits_boundary