  1. """
  2. Functions
  3. ---------
  4. .. autosummary::
  5. :toctree: generated/
  6. line_search_armijo
  7. line_search_wolfe1
  8. line_search_wolfe2
  9. scalar_search_wolfe1
  10. scalar_search_wolfe2
  11. """
  12. from __future__ import division, print_function, absolute_import
  13. from warnings import warn
  14. from scipy.optimize import minpack2
  15. import numpy as np
  16. from scipy._lib.six import xrange
  17. __all__ = ['LineSearchWarning', 'line_search_wolfe1', 'line_search_wolfe2',
  18. 'scalar_search_wolfe1', 'scalar_search_wolfe2',
  19. 'line_search_armijo']
  20. class LineSearchWarning(RuntimeWarning):
  21. pass
  22. #------------------------------------------------------------------------------
  23. # Minpack's Wolfe line and scalar searches
  24. #------------------------------------------------------------------------------
  25. def line_search_wolfe1(f, fprime, xk, pk, gfk=None,
  26. old_fval=None, old_old_fval=None,
  27. args=(), c1=1e-4, c2=0.9, amax=50, amin=1e-8,
  28. xtol=1e-14):
  29. """
  30. As `scalar_search_wolfe1` but do a line search to direction `pk`
  31. Parameters
  32. ----------
  33. f : callable
  34. Function `f(x)`
  35. fprime : callable
  36. Gradient of `f`
  37. xk : array_like
  38. Current point
  39. pk : array_like
  40. Search direction
  41. gfk : array_like, optional
  42. Gradient of `f` at point `xk`
  43. old_fval : float, optional
  44. Value of `f` at point `xk`
  45. old_old_fval : float, optional
  46. Value of `f` at point preceding `xk`
  47. The rest of the parameters are the same as for `scalar_search_wolfe1`.
  48. Returns
  49. -------
  50. stp, f_count, g_count, fval, old_fval
  51. As in `line_search_wolfe1`
  52. gval : array
  53. Gradient of `f` at the final point
  54. """
  55. if gfk is None:
  56. gfk = fprime(xk)
  57. if isinstance(fprime, tuple):
  58. eps = fprime[1]
  59. fprime = fprime[0]
  60. newargs = (f, eps) + args
  61. gradient = False
  62. else:
  63. newargs = args
  64. gradient = True
  65. gval = [gfk]
  66. gc = [0]
  67. fc = [0]
  68. def phi(s):
  69. fc[0] += 1
  70. return f(xk + s*pk, *args)
  71. def derphi(s):
  72. gval[0] = fprime(xk + s*pk, *newargs)
  73. if gradient:
  74. gc[0] += 1
  75. else:
  76. fc[0] += len(xk) + 1
  77. return np.dot(gval[0], pk)
  78. derphi0 = np.dot(gfk, pk)
  79. stp, fval, old_fval = scalar_search_wolfe1(
  80. phi, derphi, old_fval, old_old_fval, derphi0,
  81. c1=c1, c2=c2, amax=amax, amin=amin, xtol=xtol)
  82. return stp, fc[0], gc[0], fval, old_fval, gval[0]
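

# Illustrative usage (a minimal sketch, not part of the original module):
# a Wolfe line search on a simple 2-D quadratic along the steepest-descent
# direction. The `_demo_*` helpers in this file are hypothetical additions.
def _demo_line_search_wolfe1():
    f = lambda x: np.dot(x, x)
    fprime = lambda x: 2.0 * x
    xk = np.array([1.0, 1.0])
    pk = -fprime(xk)  # descent direction, so derphi(0) < 0
    stp, fc, gc, fval, old_fval, gval = line_search_wolfe1(f, fprime, xk, pk)
    # For this quadratic, the exact minimizer along pk is stp = 0.5.
    return stp, fval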


def scalar_search_wolfe1(phi, derphi, phi0=None, old_phi0=None, derphi0=None,
                         c1=1e-4, c2=0.9,
                         amax=50, amin=1e-8, xtol=1e-14):
    """
    Scalar function search for alpha that satisfies strong Wolfe conditions

    The search direction is assumed to be a descent direction, i.e.
    ``derphi(0) < 0``, and only alpha > 0 is considered.

    Parameters
    ----------
    phi : callable phi(alpha)
        Function at point `alpha`
    derphi : callable dphi(alpha)
        Derivative `d phi(alpha)/d alpha`. Returns a scalar.
    phi0 : float, optional
        Value of `phi` at 0
    old_phi0 : float, optional
        Value of `phi` at the previous point
    derphi0 : float, optional
        Value of `derphi` at 0
    c1, c2 : float, optional
        Wolfe parameters
    amax, amin : float, optional
        Maximum and minimum step size
    xtol : float, optional
        Relative tolerance for an acceptable step.

    Returns
    -------
    alpha : float
        Step size, or None if no suitable step was found
    phi : float
        Value of `phi` at the new point `alpha`
    phi0 : float
        Value of `phi` at `alpha=0`

    Notes
    -----
    Uses routine DCSRCH from MINPACK.

    """
    if phi0 is None:
        phi0 = phi(0.)
    if derphi0 is None:
        derphi0 = derphi(0.)

    if old_phi0 is not None and derphi0 != 0:
        alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
        if alpha1 < 0:
            alpha1 = 1.0
    else:
        alpha1 = 1.0

    phi1 = phi0
    derphi1 = derphi0
    isave = np.zeros((2,), np.intc)
    dsave = np.zeros((13,), float)
    task = b'START'

    maxiter = 100
    for i in xrange(maxiter):
        stp, phi1, derphi1, task = minpack2.dcsrch(alpha1, phi1, derphi1,
                                                   c1, c2, xtol, task,
                                                   amin, amax, isave, dsave)
        if task[:2] == b'FG':
            alpha1 = stp
            phi1 = phi(stp)
            derphi1 = derphi(stp)
        else:
            break
    else:
        # maxiter reached, the line search did not converge
        stp = None

    if task[:5] == b'ERROR' or task[:4] == b'WARN':
        stp = None  # failed

    return stp, phi1, phi0
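

# Illustrative usage (a minimal sketch, not part of the original module):
# a scalar Wolfe search on phi(a) = (a - 1)^2, whose minimizer is a = 1.
def _demo_scalar_search_wolfe1():
    phi = lambda a: (a - 1.0)**2
    derphi = lambda a: 2.0 * (a - 1.0)  # derphi(0) = -2 < 0, a descent slope
    stp, phi1, phi0 = scalar_search_wolfe1(phi, derphi)
    return stp, phi1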


line_search = line_search_wolfe1


#------------------------------------------------------------------------------
# Pure-Python Wolfe line and scalar searches
#------------------------------------------------------------------------------

def line_search_wolfe2(f, myfprime, xk, pk, gfk=None, old_fval=None,
                       old_old_fval=None, args=(), c1=1e-4, c2=0.9, amax=None,
                       extra_condition=None, maxiter=10):
    """Find alpha that satisfies strong Wolfe conditions.

    Parameters
    ----------
    f : callable f(x,*args)
        Objective function.
    myfprime : callable f'(x,*args)
        Objective function gradient.
    xk : ndarray
        Starting point.
    pk : ndarray
        Search direction.
    gfk : ndarray, optional
        Gradient value for x=xk (xk being the current parameter
        estimate). Will be recomputed if omitted.
    old_fval : float, optional
        Function value for x=xk. Will be recomputed if omitted.
    old_old_fval : float, optional
        Function value for the point preceding x=xk
    args : tuple, optional
        Additional arguments passed to objective function.
    c1 : float, optional
        Parameter for Armijo condition rule.
    c2 : float, optional
        Parameter for curvature condition rule.
    amax : float, optional
        Maximum step size
    extra_condition : callable, optional
        A callable of the form ``extra_condition(alpha, x, f, g)``
        returning a boolean. Arguments are the proposed step ``alpha``
        and the corresponding ``x``, ``f`` and ``g`` values. The line
        search accepts the value of ``alpha`` only if this callable
        returns ``True``. If the callable returns ``False`` for the
        step length, the algorithm will continue with new iterates.
        The callable is only called for iterates satisfying the
        strong Wolfe conditions.
    maxiter : int, optional
        Maximum number of iterations to perform

    Returns
    -------
    alpha : float or None
        Alpha for which ``x_new = x0 + alpha * pk``,
        or None if the line search algorithm did not converge.
    fc : int
        Number of function evaluations made.
    gc : int
        Number of gradient evaluations made.
    new_fval : float or None
        New function value ``f(x_new)=f(x0+alpha*pk)``,
        or None if the line search algorithm did not converge.
    old_fval : float
        Old function value ``f(x0)``.
    new_slope : float or None
        The local slope along the search direction at the
        new value ``<myfprime(x_new), pk>``,
        or None if the line search algorithm did not converge.

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions. See Wright and Nocedal, 'Numerical Optimization',
    1999, pp. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """
    fc = [0]
    gc = [0]
    gval = [None]
    gval_alpha = [None]

    def phi(alpha):
        fc[0] += 1
        return f(xk + alpha * pk, *args)

    if isinstance(myfprime, tuple):
        def derphi(alpha):
            fc[0] += len(xk) + 1
            eps = myfprime[1]
            fprime = myfprime[0]
            newargs = (f, eps) + args
            gval[0] = fprime(xk + alpha * pk, *newargs)  # store for later use
            gval_alpha[0] = alpha
            return np.dot(gval[0], pk)
    else:
        fprime = myfprime

        def derphi(alpha):
            gc[0] += 1
            gval[0] = fprime(xk + alpha * pk, *args)  # store for later use
            gval_alpha[0] = alpha
            return np.dot(gval[0], pk)

    if gfk is None:
        gfk = fprime(xk, *args)
    derphi0 = np.dot(gfk, pk)

    if extra_condition is not None:
        # Add the current gradient as an argument, to avoid needless
        # re-evaluation
        def extra_condition2(alpha, phi):
            if gval_alpha[0] != alpha:
                derphi(alpha)
            x = xk + alpha * pk
            return extra_condition(alpha, x, phi, gval[0])
    else:
        extra_condition2 = None

    alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
            phi, derphi, old_fval, old_old_fval, derphi0, c1, c2, amax,
            extra_condition2, maxiter=maxiter)

    if derphi_star is None:
        warn('The line search algorithm did not converge', LineSearchWarning)
    else:
        # derphi_star is a scalar (the slope along pk at alpha_star); the
        # gradient most recently used to compute it is the gradient at the
        # new point, so return that instead -- no need to recompute it in
        # the outer loop.
        derphi_star = gval[0]

    return alpha_star, fc[0], gc[0], phi_star, old_fval, derphi_star
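

# Illustrative usage (a minimal sketch, not part of the original module):
# the pure-Python Wolfe search on a 2-D quadratic, with an extra acceptance
# condition that bounds the gradient norm at the candidate point.
def _demo_line_search_wolfe2():
    f = lambda x: np.dot(x, x)
    fprime = lambda x: 2.0 * x
    xk = np.array([1.0, 1.0])
    pk = -fprime(xk)
    extra = lambda alpha, x, fval, g: np.linalg.norm(g) < 10.0
    alpha, fc, gc, fval, old_fval, slope = line_search_wolfe2(
        f, fprime, xk, pk, extra_condition=extra)
    # The exact minimizer along pk is alpha = 0.5, found here by the
    # quadratic interpolation step inside _zoom.
    return alpha, fval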


def scalar_search_wolfe2(phi, derphi=None, phi0=None,
                         old_phi0=None, derphi0=None,
                         c1=1e-4, c2=0.9, amax=None,
                         extra_condition=None, maxiter=10):
    """Find alpha that satisfies strong Wolfe conditions.

    The search direction is assumed to be a descent direction, i.e.
    ``derphi(0) < 0``, and only alpha > 0 is considered.

    Parameters
    ----------
    phi : callable phi(alpha)
        Objective scalar function.
    derphi : callable phi'(alpha), optional
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at 0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at 0
    c1 : float, optional
        Parameter for Armijo condition rule.
    c2 : float, optional
        Parameter for curvature condition rule.
    amax : float, optional
        Maximum step size
    extra_condition : callable, optional
        A callable of the form ``extra_condition(alpha, phi_value)``
        returning a boolean. The line search accepts the value
        of ``alpha`` only if this callable returns ``True``.
        If the callable returns ``False`` for the step length,
        the algorithm will continue with new iterates.
        The callable is only called for iterates satisfying
        the strong Wolfe conditions.
    maxiter : int, optional
        Maximum number of iterations to perform

    Returns
    -------
    alpha_star : float or None
        Best alpha, or None if the line search algorithm did not converge.
    phi_star : float
        phi at alpha_star
    phi0 : float
        phi at 0
    derphi_star : float or None
        derphi at alpha_star, or None if the line search algorithm
        did not converge.

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions. See Wright and Nocedal, 'Numerical Optimization',
    1999, pp. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """
    if phi0 is None:
        phi0 = phi(0.)

    if derphi0 is None and derphi is not None:
        derphi0 = derphi(0.)

    alpha0 = 0
    if old_phi0 is not None and derphi0 != 0:
        alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
    else:
        alpha1 = 1.0

    if alpha1 < 0:
        alpha1 = 1.0

    phi_a1 = phi(alpha1)
    #derphi_a1 = derphi(alpha1) evaluated below

    phi_a0 = phi0
    derphi_a0 = derphi0

    if extra_condition is None:
        extra_condition = lambda alpha, phi: True

    for i in xrange(maxiter):
        if alpha1 == 0 or (amax is not None and alpha0 == amax):
            # alpha1 == 0: This shouldn't happen. Perhaps the increment has
            # slipped below machine precision?
            alpha_star = None
            phi_star = phi0
            phi0 = old_phi0
            derphi_star = None

            if alpha1 == 0:
                msg = 'Rounding errors prevent the line search from converging'
            else:
                msg = "The line search algorithm could not find a solution " + \
                      "less than or equal to amax: %s" % amax

            warn(msg, LineSearchWarning)
            break

        if (phi_a1 > phi0 + c1 * alpha1 * derphi0) or \
           ((phi_a1 >= phi_a0) and (i > 1)):
            alpha_star, phi_star, derphi_star = \
                        _zoom(alpha0, alpha1, phi_a0,
                              phi_a1, derphi_a0, phi, derphi,
                              phi0, derphi0, c1, c2, extra_condition)
            break

        derphi_a1 = derphi(alpha1)
        if (abs(derphi_a1) <= -c2*derphi0):
            if extra_condition(alpha1, phi_a1):
                alpha_star = alpha1
                phi_star = phi_a1
                derphi_star = derphi_a1
                break

        if (derphi_a1 >= 0):
            alpha_star, phi_star, derphi_star = \
                        _zoom(alpha1, alpha0, phi_a1,
                              phi_a0, derphi_a1, phi, derphi,
                              phi0, derphi0, c1, c2, extra_condition)
            break

        alpha2 = 2 * alpha1  # increase by factor of two on each iteration
        if amax is not None:
            alpha2 = min(alpha2, amax)
        alpha0 = alpha1
        alpha1 = alpha2
        phi_a0 = phi_a1
        phi_a1 = phi(alpha1)
        derphi_a0 = derphi_a1
    else:
        # stopping test maxiter reached
        alpha_star = alpha1
        phi_star = phi_a1
        derphi_star = None
        warn('The line search algorithm did not converge', LineSearchWarning)

    return alpha_star, phi_star, phi0, derphi_star
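

# Illustrative usage (a minimal sketch, not part of the original module):
# the scalar Wolfe search on phi(a) = -a * exp(-a), which decreases from
# phi(0) = 0 and has its minimizer at a = 1, where derphi vanishes.
def _demo_scalar_search_wolfe2():
    phi = lambda a: -a * np.exp(-a)
    derphi = lambda a: (a - 1.0) * np.exp(-a)  # derphi(0) = -1 < 0
    alpha, phi_star, phi0, derphi_star = scalar_search_wolfe2(phi, derphi)
    return alpha, phi_star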


def _cubicmin(a, fa, fpa, b, fb, c, fc):
    """
    Finds the minimizer for a cubic polynomial that goes through the
    points (a,fa), (b,fb), and (c,fc) with derivative at a of fpa.

    If no minimizer can be found return None

    """
    # f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D
    with np.errstate(divide='raise', over='raise', invalid='raise'):
        try:
            C = fpa
            db = b - a
            dc = c - a
            denom = (db * dc) ** 2 * (db - dc)
            d1 = np.empty((2, 2))
            d1[0, 0] = dc ** 2
            d1[0, 1] = -db ** 2
            d1[1, 0] = -dc ** 3
            d1[1, 1] = db ** 3
            [A, B] = np.dot(d1, np.asarray([fb - fa - C * db,
                                            fc - fa - C * dc]).flatten())
            A /= denom
            B /= denom
            radical = B * B - 3 * A * C
            xmin = a + (-B + np.sqrt(radical)) / (3 * A)
        except ArithmeticError:
            return None
    if not np.isfinite(xmin):
        return None
    return xmin


def _quadmin(a, fa, fpa, b, fb):
    """
    Finds the minimizer for a quadratic polynomial that goes through
    the points (a,fa), (b,fb) with derivative at a of fpa.

    """
    # f(x) = B*(x-a)^2 + C*(x-a) + D
    with np.errstate(divide='raise', over='raise', invalid='raise'):
        try:
            D = fa
            C = fpa
            db = b - a * 1.0
            B = (fb - D - C * db) / (db * db)
            xmin = a - C / (2.0 * B)
        except ArithmeticError:
            return None
    if not np.isfinite(xmin):
        return None
    return xmin
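

# Illustrative check (a minimal sketch, not part of the original module):
# each interpolant recovers the known minimizer of a polynomial it can fit
# exactly.
def _demo_interpolants():
    # Quadratic f(x) = (x - 2)^2 has its minimum at x = 2:
    # samples f(0) = 4, f'(0) = -4, f(1) = 1.
    xq = _quadmin(0.0, 4.0, -4.0, 1.0, 1.0)
    # Cubic f(x) = x^3 - 3x has a local minimum at x = 1:
    # samples f(0) = 0, f'(0) = -3, f(2) = 2, f(3) = 18.
    xc = _cubicmin(0.0, 0.0, -3.0, 2.0, 2.0, 3.0, 18.0)
    return xq, xc  # (2.0, 1.0)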


def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2, extra_condition):
    """
    Part of the optimization algorithm in `scalar_search_wolfe2`.
    """

    maxiter = 10
    i = 0
    delta1 = 0.2  # cubic interpolant check
    delta2 = 0.1  # quadratic interpolant check
    phi_rec = phi0
    a_rec = 0
    while True:
        # Interpolate to find a trial step length between a_lo and a_hi.
        # Use cubic interpolation first; if the result is within delta *
        # dalpha of the end points, or outside of the interval bounded by
        # a_lo and a_hi, fall back to quadratic interpolation; if the
        # result is still too close, use bisection.

        dalpha = a_hi - a_lo
        if dalpha < 0:
            a, b = a_hi, a_lo
        else:
            a, b = a_lo, a_hi

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        if (i > 0):
            cchk = delta1 * dalpha
            a_j = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
                            a_rec, phi_rec)
        if (i == 0) or (a_j is None) or (a_j > b - cchk) or (a_j < a + cchk):
            qchk = delta2 * dalpha
            a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
            if (a_j is None) or (a_j > b - qchk) or (a_j < a + qchk):
                a_j = a_lo + 0.5*dalpha

        # Check new value of a_j
        phi_aj = phi(a_j)
        if (phi_aj > phi0 + c1*a_j*derphi0) or (phi_aj >= phi_lo):
            phi_rec = phi_hi
            a_rec = a_hi
            a_hi = a_j
            phi_hi = phi_aj
        else:
            derphi_aj = derphi(a_j)
            if abs(derphi_aj) <= -c2*derphi0 and extra_condition(a_j, phi_aj):
                a_star = a_j
                val_star = phi_aj
                valprime_star = derphi_aj
                break
            if derphi_aj*(a_hi - a_lo) >= 0:
                phi_rec = phi_hi
                a_rec = a_hi
                a_hi = a_lo
                phi_hi = phi_lo
            else:
                phi_rec = phi_lo
                a_rec = a_lo
            a_lo = a_j
            phi_lo = phi_aj
            derphi_lo = derphi_aj
        i += 1
        if (i > maxiter):
            # Failed to find a conforming step size
            a_star = None
            val_star = None
            valprime_star = None
            break
    return a_star, val_star, valprime_star


#------------------------------------------------------------------------------
# Armijo line and scalar searches
#------------------------------------------------------------------------------

def line_search_armijo(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
    """Minimize over alpha, the function ``f(xk+alpha pk)``.

    Parameters
    ----------
    f : callable
        Function to be minimized.
    xk : array_like
        Current point.
    pk : array_like
        Search direction.
    gfk : array_like
        Gradient of `f` at point `xk`.
    old_fval : float
        Value of `f` at point `xk`.
    args : tuple, optional
        Optional arguments.
    c1 : float, optional
        Value to control stopping criterion.
    alpha0 : scalar, optional
        Value of `alpha` at start of the optimization.

    Returns
    -------
    alpha : float or None
        Step size, or None if no suitable step was found.
    f_count : int
        Number of function evaluations.
    f_val_at_alpha : float
        Value of `f` at the new point.

    Notes
    -----
    Uses the interpolation algorithm (Armijo backtracking) as suggested by
    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57.

    """
    xk = np.atleast_1d(xk)
    fc = [0]

    def phi(alpha1):
        fc[0] += 1
        return f(xk + alpha1*pk, *args)

    if old_fval is None:
        phi0 = phi(0.)
    else:
        phi0 = old_fval  # f(xk) was already computed by the caller

    derphi0 = np.dot(gfk, pk)
    alpha, phi1 = scalar_search_armijo(phi, phi0, derphi0, c1=c1,
                                       alpha0=alpha0)
    return alpha, fc[0], phi1
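

# Illustrative usage (a minimal sketch, not part of the original module):
# Armijo backtracking on the same 2-D quadratic as the Wolfe demos above.
# The quadratic interpolation step lands on the exact minimizer alpha = 0.5.
def _demo_line_search_armijo():
    f = lambda x: np.dot(x, x)
    xk = np.array([1.0, 1.0])
    gfk = 2.0 * xk
    pk = -gfk
    alpha, fc, fval = line_search_armijo(f, xk, pk, gfk, f(xk))
    return alpha, fval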


def line_search_BFGS(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
    """
    Compatibility wrapper for `line_search_armijo`
    """
    r = line_search_armijo(f, xk, pk, gfk, old_fval, args=args, c1=c1,
                           alpha0=alpha0)
    return r[0], r[1], 0, r[2]


def scalar_search_armijo(phi, phi0, derphi0, c1=1e-4, alpha0=1, amin=0):
    """Minimize over alpha, the function ``phi(alpha)``.

    Uses the interpolation algorithm (Armijo backtracking) as suggested by
    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57.

    The search direction is assumed to be a descent direction
    (``derphi0 < 0``), and only alpha > 0 is considered.

    Returns
    -------
    alpha : float or None
        Step size, or None if no suitable step was found.
    phi1 : float
        Value of `phi` at the last point evaluated.

    """
    phi_a0 = phi(alpha0)
    if phi_a0 <= phi0 + c1*alpha0*derphi0:
        return alpha0, phi_a0

    # Otherwise compute the minimizer of a quadratic interpolant:

    alpha1 = -(derphi0) * alpha0**2 / 2.0 / (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    if (phi_a1 <= phi0 + c1*alpha1*derphi0):
        return alpha1, phi_a1

    # Otherwise loop with cubic interpolation until we find an alpha which
    # satisfies the first Wolfe condition (since we are backtracking, we will
    # assume that the value of alpha is not too small and satisfies the second
    # condition).

    while alpha1 > amin:       # keep backtracking while the step is above amin
        factor = alpha0**2 * alpha1**2 * (alpha1-alpha0)
        a = alpha0**2 * (phi_a1 - phi0 - derphi0*alpha1) - \
            alpha1**2 * (phi_a0 - phi0 - derphi0*alpha0)
        a = a / factor
        b = -alpha0**3 * (phi_a1 - phi0 - derphi0*alpha1) + \
            alpha1**3 * (phi_a0 - phi0 - derphi0*alpha0)
        b = b / factor

        alpha2 = (-b + np.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0*a)
        phi_a2 = phi(alpha2)

        if (phi_a2 <= phi0 + c1*alpha2*derphi0):
            return alpha2, phi_a2

        if (alpha1 - alpha2) > alpha1 / 2.0 or (1 - alpha2/alpha1) < 0.96:
            alpha2 = alpha1 / 2.0

        alpha0 = alpha1
        alpha1 = alpha2
        phi_a0 = phi_a1
        phi_a1 = phi_a2

    # Failed to find a suitable step length
    return None, phi_a1
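

# Illustrative usage (a minimal sketch, not part of the original module):
# scalar Armijo backtracking on phi(a) = (a - 0.25)^2. The full step
# alpha0 = 1 overshoots, so the quadratic interpolant backtracks straight
# to the minimizer a = 0.25.
def _demo_scalar_search_armijo():
    phi = lambda a: (a - 0.25)**2
    alpha, phi1 = scalar_search_armijo(phi, phi0=phi(0.0), derphi0=-0.5)
    return alpha, phi1  # (0.25, 0.0)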


#------------------------------------------------------------------------------
# Non-monotone line search for DF-SANE
#------------------------------------------------------------------------------

def _nonmonotone_line_search_cruz(f, x_k, d, prev_fs, eta,
                                  gamma=1e-4, tau_min=0.1, tau_max=0.5):
    """
    Nonmonotone backtracking line search as described in [1]_

    Parameters
    ----------
    f : callable
        Function returning a tuple ``(f, F)`` where ``f`` is the value
        of a merit function and ``F`` the residual.
    x_k : ndarray
        Initial position
    d : ndarray
        Search direction
    prev_fs : sequence of float
        List of previous merit function values. Should have
        ``len(prev_fs) <= M`` where ``M`` is the nonmonotonicity
        window parameter.
    eta : float
        Allowed merit function increase, see [1]_
    gamma, tau_min, tau_max : float, optional
        Search parameters, see [1]_

    Returns
    -------
    alpha : float
        Step length
    xp : ndarray
        Next position
    fp : float
        Merit function value at next position
    Fp : ndarray
        Residual at next position

    References
    ----------
    .. [1] "Spectral residual method without gradient information for solving
           large-scale nonlinear systems of equations." W. La Cruz,
           J.M. Martinez, M. Raydan. Math. Comp. **75**, 1429 (2006).

    """
    f_k = prev_fs[-1]
    f_bar = max(prev_fs)

    alpha_p = 1
    alpha_m = 1
    alpha = 1

    while True:
        xp = x_k + alpha_p * d
        fp, Fp = f(xp)

        if fp <= f_bar + eta - gamma * alpha_p**2 * f_k:
            alpha = alpha_p
            break

        alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)

        xp = x_k - alpha_m * d
        fp, Fp = f(xp)

        if fp <= f_bar + eta - gamma * alpha_m**2 * f_k:
            alpha = -alpha_m
            break

        alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)

        alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
        alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)

    return alpha, xp, fp, Fp
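

# Illustrative usage (a minimal sketch, not part of the original module):
# the nonmonotone search applied to the residual F(x) = x with a
# squared-norm merit function, stepping along d = -F(x_k).
def _demo_nonmonotone_cruz():
    def f(x):
        F = x  # residual of the trivial system F(x) = x = 0
        return np.dot(F, F), F

    x_k = np.array([1.0, -2.0])
    d = -x_k
    prev_fs = [f(x_k)[0]]  # merit-history window of length 1
    alpha, xp, fp, Fp = _nonmonotone_line_search_cruz(f, x_k, d, prev_fs,
                                                      eta=1e-3)
    return alpha, fp  # the full step alpha = 1 is accepted, fp = 0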


def _nonmonotone_line_search_cheng(f, x_k, d, f_k, C, Q, eta,
                                   gamma=1e-4, tau_min=0.1, tau_max=0.5,
                                   nu=0.85):
    """
    Nonmonotone line search from [1]_

    Parameters
    ----------
    f : callable
        Function returning a tuple ``(f, F)`` where ``f`` is the value
        of a merit function and ``F`` the residual.
    x_k : ndarray
        Initial position
    d : ndarray
        Search direction
    f_k : float
        Initial merit function value
    C, Q : float
        Control parameters. On the first iteration, give values
        Q=1.0, C=f_k
    eta : float
        Allowed merit function increase, see [1]_
    nu, gamma, tau_min, tau_max : float, optional
        Search parameters, see [1]_

    Returns
    -------
    alpha : float
        Step length
    xp : ndarray
        Next position
    fp : float
        Merit function value at next position
    Fp : ndarray
        Residual at next position
    C : float
        New value for the control parameter C
    Q : float
        New value for the control parameter Q

    References
    ----------
    .. [1] W. Cheng & D.-H. Li, ''A derivative-free nonmonotone line
           search and its application to the spectral residual
           method'', IMA J. Numer. Anal. 29, 814 (2009).

    """
    alpha_p = 1
    alpha_m = 1
    alpha = 1

    while True:
        xp = x_k + alpha_p * d
        fp, Fp = f(xp)

        if fp <= C + eta - gamma * alpha_p**2 * f_k:
            alpha = alpha_p
            break

        alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)

        xp = x_k - alpha_m * d
        fp, Fp = f(xp)

        if fp <= C + eta - gamma * alpha_m**2 * f_k:
            alpha = -alpha_m
            break

        alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)

        alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
        alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)

    # Update C and Q
    Q_next = nu * Q + 1
    C = (nu * Q * (C + eta) + fp) / Q_next
    Q = Q_next

    return alpha, xp, fp, Fp, C, Q
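

# Illustrative usage (a minimal sketch, not part of the original module):
# one step of the Cheng & Li search on the same toy residual, starting with
# the recommended first-iteration control values Q = 1.0 and C = f_k.
def _demo_nonmonotone_cheng():
    def f(x):
        F = x  # residual of the trivial system F(x) = x = 0
        return np.dot(F, F), F

    x_k = np.array([1.0, -2.0])
    d = -x_k
    f_k = f(x_k)[0]
    alpha, xp, fp, Fp, C, Q = _nonmonotone_line_search_cheng(
        f, x_k, d, f_k, C=f_k, Q=1.0, eta=1e-3)
    # C and Q come back updated for use on the next iteration.
    return alpha, fp, C, Q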