  1. """Hessian update strategies for quasi-Newton optimization methods."""
  2. from __future__ import division, print_function, absolute_import
  3. import numpy as np
  4. from numpy.linalg import norm
  5. from scipy.linalg import get_blas_funcs
  6. from warnings import warn
  7. __all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1']


class HessianUpdateStrategy(object):
    """Interface for implementing Hessian update strategies.

    Many optimization methods make use of Hessian (or inverse Hessian)
    approximations, such as the quasi-Newton methods BFGS, SR1, L-BFGS.
    Some of these approximations, however, do not actually need to store
    the entire matrix or can compute the internal matrix product with a
    given vector in a very efficient manner. This class serves as an
    abstract interface between the optimization algorithm and the
    quasi-Newton update strategies, giving freedom of implementation
    to store and update the internal matrix as efficiently as possible.
    Different choices of initialization and update procedure will result
    in different quasi-Newton strategies.

    Four methods should be implemented in derived classes: ``initialize``,
    ``update``, ``dot`` and ``get_matrix``.

    Notes
    -----
    Any instance of a class that implements this interface
    can be accepted by the method ``minimize`` and used by
    the compatible solvers to approximate the Hessian (or
    inverse Hessian) used by the optimization algorithms.
    """

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        raise NotImplementedError("The method ``initialize(n, approx_type)``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update Hessian matrix or its inverse (depending on how 'approx_type'
        is defined) using information about the last evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between two points the gradient
            function has been evaluated at: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
                                  " is not implemented.")

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-d array representing a vector.

        Returns
        -------
        Hp : array
            1-d array representing the result of multiplying the
            approximation matrix by vector p.
        """
        raise NotImplementedError("The method ``dot(p)``"
                                  " is not implemented.")

    def get_matrix(self):
        """Return current internal matrix.

        Returns
        -------
        H : ndarray, shape (n, n)
            Dense matrix containing either the Hessian
            or its inverse (depending on how 'approx_type'
            is defined).
        """
        raise NotImplementedError("The method ``get_matrix()``"
                                  " is not implemented.")


class FullHessianUpdateStrategy(HessianUpdateStrategy):
    """Hessian update strategy with full dimensional internal representation.
    """
    _syr = get_blas_funcs('syr', dtype='d')  # Symmetric rank 1 update
    _syr2 = get_blas_funcs('syr2', dtype='d')  # Symmetric rank 2 update
    # Symmetric matrix-vector product
    _symv = get_blas_funcs('symv', dtype='d')

    def __init__(self, init_scale='auto'):
        self.init_scale = init_scale
        # Until initialize is called we can't really use the class,
        # so it makes sense to set everything to None.
        self.first_iteration = None
        self.approx_type = None
        self.B = None
        self.H = None

    def initialize(self, n, approx_type):
        """Initialize internal matrix.

        Allocate internal memory for storing and updating
        the Hessian or its inverse.

        Parameters
        ----------
        n : int
            Problem dimension.
        approx_type : {'hess', 'inv_hess'}
            Selects either the Hessian or the inverse Hessian.
            When set to 'hess' the Hessian will be stored and updated.
            When set to 'inv_hess' its inverse will be used instead.
        """
        self.first_iteration = True
        self.n = n
        self.approx_type = approx_type
        if approx_type not in ('hess', 'inv_hess'):
            raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
        # Create matrix
        if self.approx_type == 'hess':
            self.B = np.eye(n, dtype=float)
        else:
            self.H = np.eye(n, dtype=float)

    def _auto_scale(self, delta_x, delta_grad):
        # Heuristic to scale matrix at first iteration.
        # Described in Nocedal and Wright "Numerical Optimization"
        # p.143 formula (6.20).
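        # With s = delta_x and y = delta_grad, the heuristic below computes
        # sigma = (y^T y) / (y^T s) for the Hessian and its reciprocal,
        # (y^T s) / (y^T y), for the inverse Hessian.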
        s_norm2 = np.dot(delta_x, delta_x)
        y_norm2 = np.dot(delta_grad, delta_grad)
        ys = np.abs(np.dot(delta_grad, delta_x))
        if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
            return 1
        if self.approx_type == 'hess':
            return y_norm2 / ys
        else:
            return ys / y_norm2

    def _update_implementation(self, delta_x, delta_grad):
        raise NotImplementedError("The method ``_update_implementation``"
                                  " is not implemented.")

    def update(self, delta_x, delta_grad):
        """Update internal matrix.

        Update Hessian matrix or its inverse (depending on how 'approx_type'
        is defined) using information about the last evaluated points.

        Parameters
        ----------
        delta_x : ndarray
            The difference between two points the gradient
            function has been evaluated at: ``delta_x = x2 - x1``.
        delta_grad : ndarray
            The difference between the gradients:
            ``delta_grad = grad(x2) - grad(x1)``.
        """
        if np.all(delta_x == 0.0):
            return
        if np.all(delta_grad == 0.0):
            warn('delta_grad == 0.0. Check if the approximated '
                 'function is linear. If the function is linear '
                 'better results can be obtained by defining the '
                 'Hessian as zero instead of using quasi-Newton '
                 'approximations.', UserWarning)
            return
        if self.first_iteration:
            # Get user-specified scale
            if self.init_scale == "auto":
                scale = self._auto_scale(delta_x, delta_grad)
            else:
                scale = float(self.init_scale)
            # Scale initial matrix with ``scale * np.eye(n)``
            if self.approx_type == 'hess':
                self.B *= scale
            else:
                self.H *= scale
            self.first_iteration = False
        self._update_implementation(delta_x, delta_grad)

    def dot(self, p):
        """Compute the product of the internal matrix with the given vector.

        Parameters
        ----------
        p : array_like
            1-d array representing a vector.

        Returns
        -------
        Hp : array
            1-d array representing the result of multiplying the
            approximation matrix by vector p.
        """
        if self.approx_type == 'hess':
            return self._symv(1, self.B, p)
        else:
            return self._symv(1, self.H, p)

    def get_matrix(self):
        """Return the current internal matrix.

        Returns
        -------
        M : ndarray, shape (n, n)
            Dense matrix containing either the Hessian or its inverse
            (depending on how `approx_type` was defined).
        """
        if self.approx_type == 'hess':
            M = np.copy(self.B)
        else:
            M = np.copy(self.H)
        li = np.tril_indices_from(M, k=-1)
        M[li] = M.T[li]
        return M
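

# Hedged usage sketch of the storage convention above (the helper name
# ``_demo_symmetric_storage`` is hypothetical, not part of the public API).
# The BLAS routines ``syr``/``syr2`` touch only one triangle of the stored
# matrix, so ``get_matrix`` mirrors the upper triangle into the lower one
# before returning a dense symmetric array. Note that ``BFGS`` is defined
# further down in this module, so this helper must only be called after the
# module has been fully imported.
def _demo_symmetric_storage():
    hess = BFGS()
    hess.initialize(3, 'hess')
    hess.update(np.array([1.0, 0.0, 0.5]), np.array([2.0, 0.1, 1.0]))
    M = hess.get_matrix()
    assert np.allclose(M, M.T)  # dense and symmetric
    return M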


class BFGS(FullHessianUpdateStrategy):
    """Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.

    Parameters
    ----------
    exception_strategy : {'skip_update', 'damp_update'}, optional
        Define how to proceed when the curvature condition is violated.
        Set it to 'skip_update' to just skip the update. Or, alternatively,
        set it to 'damp_update' to interpolate between the actual BFGS
        result and the unmodified matrix. Both exception strategies
        are explained in [1]_, p.536-537.
    min_curvature : float, optional
        This number, scaled by a normalization factor, defines the
        minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
        unaffected by the exception strategy. By default it is equal to
        1e-8 when ``exception_strategy = 'skip_update'`` and equal
        to 0.2 when ``exception_strategy = 'damp_update'``.
    init_scale : {float, 'auto'}, optional
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.140.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
           Second Edition (2006).
    """

    def __init__(self, exception_strategy='skip_update', min_curvature=None,
                 init_scale='auto'):
        if exception_strategy == 'skip_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 1e-8
        elif exception_strategy == 'damp_update':
            if min_curvature is not None:
                self.min_curvature = min_curvature
            else:
                self.min_curvature = 0.2
        else:
            raise ValueError("`exception_strategy` must be 'skip_update' "
                             "or 'damp_update'.")
        super(BFGS, self).__init__(init_scale)
        self.exception_strategy = exception_strategy

    def _update_inverse_hessian(self, ys, Hy, yHy, s):
        """Update the inverse Hessian matrix.

        BFGS update using the formula:

            ``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
                     - 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``

        where ``s = delta_x`` and ``y = delta_grad``. This formula is
        equivalent to (6.17) in [1]_ written in a more efficient way
        for implementation.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
               Second Edition (2006).
        """
        self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
        self.H = self._syr((ys+yHy)/ys**2, s, a=self.H)

    def _update_hessian(self, ys, Bs, sBs, y):
        """Update the Hessian matrix.

        BFGS update using the formula:

            ``B <- B - (B*s)*(B*s).T/(s.T*(B*s)) + (y*y.T)/(s.T*y)``

        where ``s`` is short for ``delta_x`` and ``y`` is short
        for ``delta_grad``. Formula (6.19) in [1]_.

        References
        ----------
        .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
               Second Edition (2006).
        """
        self.B = self._syr(1.0 / ys, y, a=self.B)
        self.B = self._syr(-1.0 / sBs, Bs, a=self.B)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
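        # With this choice, the common quantities below correspond to
        # s^T y, B s and s^T B s in the direct case, and to y^T s, H y and
        # y^T H y in the inverse case.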
        # Do some common operations
        wz = np.dot(w, z)
        Mw = self.dot(w)
        wMw = Mw.dot(w)
        # Guarantee that wMw > 0 by reinitializing matrix.
        # While this is always true in exact arithmetic,
        # an indefinite matrix may appear due to roundoff errors.
        if wMw <= 0.0:
            scale = self._auto_scale(delta_x, delta_grad)
            # Reinitialize matrix
            if self.approx_type == 'hess':
                self.B = scale * np.eye(self.n, dtype=float)
            else:
                self.H = scale * np.eye(self.n, dtype=float)
            # Do common operations for new matrix
            Mw = self.dot(w)
            wMw = Mw.dot(w)
        # Check if curvature condition is violated
        if wz <= self.min_curvature * wMw:
            # If the option 'skip_update' is set
            # we just skip the update when the condition
            # is violated.
            if self.exception_strategy == 'skip_update':
                return
            # If the option 'damp_update' is set we
            # interpolate between the actual BFGS
            # result and the unmodified matrix.
            elif self.exception_strategy == 'damp_update':
                update_factor = (1-self.min_curvature) / (1 - wz/wMw)
                z = update_factor*z + (1-update_factor)*Mw
                wz = np.dot(w, z)
        # Update matrix
        if self.approx_type == 'hess':
            self._update_hessian(wz, Mw, wMw, z)
        else:
            self._update_inverse_hessian(wz, Mw, wMw, z)
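

# Hedged usage sketch (``_demo_bfgs_quadratic`` is a hypothetical helper
# name used only for illustration). For a quadratic ``f(x) = 0.5 x^T A x``
# the gradient differences satisfy ``delta_grad = A @ delta_x`` exactly, so
# repeated updates along sufficiently rich directions should drive the
# approximation towards ``A``.
def _demo_bfgs_quadratic():
    rng = np.random.RandomState(0)
    A = np.array([[4.0, 1.0],
                  [1.0, 3.0]])  # true Hessian, symmetric positive definite
    hess = BFGS()
    hess.initialize(2, 'hess')
    for _ in range(20):
        delta_x = rng.randn(2)
        hess.update(delta_x, A.dot(delta_x))
    # Residual should be small after enough linearly independent steps.
    return np.linalg.norm(hess.get_matrix() - A)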


class SR1(FullHessianUpdateStrategy):
    """Symmetric-rank-1 Hessian update strategy.

    Parameters
    ----------
    min_denominator : float, optional
        This number, scaled by a normalization factor,
        defines the minimum denominator magnitude allowed
        in the update. When the condition is violated we skip
        the update. By default uses ``1e-8``.
    init_scale : {float, 'auto'}, optional
        Matrix scale at first iteration. At the first
        iteration the Hessian matrix or its inverse will be initialized
        with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
        Set it to 'auto' in order to use an automatic heuristic for choosing
        the initial scale. The heuristic is described in [1]_, p.143.
        By default uses 'auto'.

    Notes
    -----
    The update is based on the description in [1]_, p.144-146.

    References
    ----------
    .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
           Second Edition (2006).
    """

    def __init__(self, min_denominator=1e-8, init_scale='auto'):
        self.min_denominator = min_denominator
        super(SR1, self).__init__(init_scale)

    def _update_implementation(self, delta_x, delta_grad):
        # Auxiliary variables w and z
        if self.approx_type == 'hess':
            w = delta_x
            z = delta_grad
        else:
            w = delta_grad
            z = delta_x
        # Do some common operations
        Mw = self.dot(w)
        z_minus_Mw = z - Mw
        denominator = np.dot(w, z_minus_Mw)
        # If the denominator is too small
        # we just skip the update.
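        # This is the safeguard (6.26) in Nocedal and Wright (2006), with
        # ``r = min_denominator``.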
        if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
            return
        # Update matrix
        if self.approx_type == 'hess':
            self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
        else:
            self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)
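

# Hedged usage sketch for SR1 (``_demo_sr1_quadratic`` is a hypothetical
# helper name used only for illustration). With exact curvature pairs
# ``delta_grad = A @ delta_x``, SR1 reproduces a symmetric ``A`` after ``n``
# updates along linearly independent directions, provided no update is
# skipped (Theorem 6.1 in Nocedal and Wright, 2006).
def _demo_sr1_quadratic():
    A = np.array([[2.0, 0.5, 0.0],
                  [0.5, 3.0, 1.0],
                  [0.0, 1.0, 4.0]])  # true Hessian, symmetric
    hess = SR1()
    hess.initialize(3, 'hess')
    for step in np.eye(3):  # three linearly independent steps
        hess.update(step, A.dot(step))
    # Up to roundoff, the approximation now equals A exactly.
    return np.allclose(hess.get_matrix(), A)


if __name__ == '__main__':
    # Run the illustrative demos defined above.
    print(_demo_symmetric_storage())
    print(_demo_bfgs_quadratic())
    print(_demo_sr1_quadratic())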