line_search_strong_wolfe.py
# NIFTy
# Copyright (C) 2017  Theo Steininger
#
# Author: Theo Steininger
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import numpy as np

from .line_search import LineSearch


class LineSearchStrongWolfe(LineSearch):
    """Class for finding a step size that satisfies the strong Wolfe conditions.
    
    Algorithm contains two stages. It begins whit a trial step length and it 
    keeps increasing the it until it finds an acceptable step length or an
29
30
31
    interval. If it does not satisfy the Wolfe conditions it performs the Zoom 
    algorithm (second stage). By interpolating it decreases the size of the 
    interval until an acceptable step length is found.  
    
    Parameters
    ----------
    c1 : float
        Parameter for Armijo condition rule. (Default: 1e-4)
    c2 : float
        Parameter for curvature condition rule. (Default: 0.9)
    max_step_size : float
        Maximum step allowed to be made in the descent direction.
        (Default: 50)
    max_iterations : integer
        Maximum number of iterations performed by the line search algorithm.
        (Default: 10)
    max_zoom_iterations : integer
        Maximum number of iterations performed by the zoom algorithm.
        (Default: 10)
        
    Attributes
    ----------
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    max_step_size : float
        Maximum step allowed to be made in the descent direction.
    max_iterations : integer
        Maximum number of iterations performed by the line search algorithm.
    max_zoom_iterations : integer
        Maximum number of iterations performed by the zoom algorithm.
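
    Notes
    -----
    A step length alpha is accepted as soon as it fulfills the two strong
    Wolfe conditions for phi(alpha) = f(x_k + alpha*pk):

        phi(alpha)    <= phi(0) + c1*alpha*phi'(0)    (sufficient decrease)
        |phi'(alpha)| <= c2*|phi'(0)|                 (curvature)

    This is the textbook formulation (Nocedal & Wright, "Numerical
    Optimization"). In the code the curvature test reads
    abs(phiprime) <= -c2*phiprime_0, which is equivalent because phiprime_0
    is negative along a descent direction.

    Examples
    --------
    A minimal construction sketch; how the instance is handed over to a
    minimizer is not shown here and depends on the surrounding NIFTy version:

    >>> line_searcher = LineSearchStrongWolfe(c1=1e-4, c2=0.9,
    ...                                       max_step_size=50)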
        
    """

    def __init__(self, c1=1e-4, c2=0.9,
                 max_step_size=50, max_iterations=10,
                 max_zoom_iterations=10):
        super(LineSearchStrongWolfe, self).__init__()

        self.c1 = np.float(c1)
        self.c2 = np.float(c2)
        self.max_step_size = max_step_size
        self.max_iterations = int(max_iterations)
        self.max_zoom_iterations = int(max_zoom_iterations)

    def perform_line_search(self, energy, pk, f_k_minus_1=None):
        """Performs the first stage of the algorithm.
        
80
81
82
        It starts with a trial step size and it keeps increasing it until it 
        satisfy the strong Wolf conditions. It also performs the descent and 
        returns the optimal step length and the new enrgy.
        
        Parameters
        ----------
        energy : Energy object
            Energy object from which we will calculate the energy and the
            gradient at a specific point.
        pk : Field
            Unit vector pointing in the search direction.
        f_k_minus_1 : float
            Value of the function (which is being minimized) at the k-1
            iteration of the line search procedure. (Default: None)
        
        Returns
        -------
        alpha_star : float
            The optimal step length in the descent direction.
        phi_star : float
            Value of the energy after the performed descent.
        energy_star : Energy object
            The new Energy object on the new position.
            
        """        
        
        self._set_line_energy(energy, pk, f_k_minus_1=f_k_minus_1)
        c1 = self.c1
        c2 = self.c2
        max_step_size = self.max_step_size
        max_iterations = self.max_iterations

        # initialize the zero phis
        old_phi_0 = self.f_k_minus_1
        energy_0 = self.line_energy.at(0)
        phi_0 = energy_0.value
        phiprime_0 = energy_0.gradient

        if phiprime_0 == 0:
            self.logger.warn("Flat gradient in search direction.")
            return 0., phi_0, energy

        # set alphas
        alpha0 = 0.
        if self.prefered_initial_step_size is not None:
            alpha1 = self.prefered_initial_step_size
        elif old_phi_0 is not None and phiprime_0 != 0:
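            # Heuristic initial step (cf. Nocedal & Wright): scale alpha1 so
            # that the predicted first-order change alpha1*phiprime_0 is
            # about twice the decrease achieved in the previous iteration,
            # 2*(phi_0 - old_phi_0); the factor 1.01 enlarges the guess
            # slightly and the result is capped at 1.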
            alpha1 = min(1.0, 1.01*2*(phi_0 - old_phi_0)/phiprime_0)
            if alpha1 < 0:
                alpha1 = 1.0
        else:
            alpha1 = 1.0

        # give the alpha0 phis the right init value
        phi_alpha0 = phi_0
        phiprime_alpha0 = phiprime_0

        # start the minimization loop
        for i in xrange(max_iterations):
            energy_alpha1 = self.line_energy.at(alpha1)
            phi_alpha1 = energy_alpha1.value
            if alpha1 == 0:
                self.logger.warn("Increment size became 0.")
                alpha_star = 0.
                phi_star = phi_0
                energy_star = energy_0
                break

            if (phi_alpha1 > phi_0 + c1*alpha1*phiprime_0) or \
               ((phi_alpha1 >= phi_alpha0) and (i > 1)):
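                # The sufficient-decrease (Armijo) condition failed, or phi
                # stopped decreasing: a step satisfying the strong Wolfe
                # conditions is bracketed by [alpha0, alpha1], so hand the
                # interval over to the zoom stage.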
                (alpha_star, phi_star, energy_star) = self._zoom(
                                                    alpha0, alpha1,
                                                    phi_0, phiprime_0,
                                                    phi_alpha0,
                                                    phiprime_alpha0,
                                                    phi_alpha1,
                                                    c1, c2)
                break

            phiprime_alpha1 = energy_alpha1.gradient
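            # Strong curvature condition |phi'(alpha1)| <= c2*|phi'(0)|
            # (phiprime_0 is negative along a descent direction). Together
            # with the decrease check above, alpha1 satisfies the strong
            # Wolfe conditions and is accepted.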
            if abs(phiprime_alpha1) <= -c2*phiprime_0:
                alpha_star = alpha1
                phi_star = phi_alpha1
                energy_star = energy_alpha1
                break

            if phiprime_alpha1 >= 0:
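                # The slope turned non-negative, so the minimum has been
                # overshot: zoom on the interval with the endpoints swapped,
                # since alpha1 now carries the lower function value.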
                (alpha_star, phi_star, energy_star) = self._zoom(
                                                    alpha1, alpha0,
                                                    phi_0, phiprime_0,
                                                    phi_alpha1,
                                                    phiprime_alpha1,
                                                    phi_alpha0,
                                                    c1, c2)
                break

            # update alphas
            alpha0, alpha1 = alpha1, min(2*alpha1, max_step_size)
            phi_alpha0 = phi_alpha1
            phiprime_alpha0 = phiprime_alpha1
            # phi_alpha1 = self._phi(alpha1)

        else:
            # max_iterations was reached
            alpha_star = alpha1
            phi_star = phi_alpha1
            energy_star = energy_alpha1
            self.logger.error("The line search algorithm did not converge.")

        # extract the full energy from the line_energy
        energy_star = energy_star.energy

        return alpha_star, phi_star, energy_star

    def _zoom(self, alpha_lo, alpha_hi, phi_0, phiprime_0,
              phi_lo, phiprime_lo, phi_hi, c1, c2):
        """Performs the second stage of the line search algorithm.
        
        If the first stage was not successful, the zoom algorithm tries to
        find a suitable step length by cubic interpolation, quadratic
        interpolation, or bisection.
        
        Parameters
        ----------
        alpha_lo : float
            The lower boundary for the step length interval.
        alpha_hi : float
            The upper boundary for the step length interval.
        phi_0 : float
            Value of the energy at the starting point of the line search 
            algorithm.
        phiprime_0 : Field
            Gradient at the starting point of the line search algorithm.
        phi_lo : float
            Value of the energy if we perform a step of length alpha_lo in
            the descent direction.
        phiprime_lo : Field
            Gradient at the new position if we perform a step of length
            alpha_lo in the descent direction.
        phi_hi : float
            Value of the energy if we perform a step of length alpha_hi in
            the descent direction.
        c1 : float
            Parameter for Armijo condition rule.
        c2 : float
            Parameter for curvature condition rule.
        
        Returns
        -------
        alpha_star : float
            The optimal step length in the descent direction.
        phi_star : float
            Value of the energy after the performed descent.
        energy_star : Energy object
            The new Energy object on the new position.
        
        """
        max_iterations = self.max_zoom_iterations
        # define the cubic and quadratic interpolant checks
        cubic_delta = 0.2  # cubic
        quad_delta = 0.1  # quadratic
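        # Safeguard for the interpolation steps below: an interpolated
        # candidate is only kept if it lies well inside the current bracket;
        # cubic_delta and quad_delta set that margin as a fraction of the
        # bracket width, otherwise the loop falls back to the next, simpler
        # method (quadratic interpolation, then plain bisection).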

        # initialize the most recent versions (j-1) of phi and alpha
        alpha_recent = 0
        phi_recent = phi_0

        for i in xrange(max_iterations):
            delta_alpha = alpha_hi - alpha_lo
            if delta_alpha < 0:
                a, b = alpha_hi, alpha_lo
            else:
                a, b = alpha_lo, alpha_hi

            # Try cubic interpolation
            if i > 0:
                cubic_check = cubic_delta * delta_alpha
                alpha_j = self._cubicmin(alpha_lo, phi_lo, phiprime_lo,
                                         alpha_hi, phi_hi,
                                         alpha_recent, phi_recent)
            # If cubic was not successful or not available, try quadratic
            if (i == 0) or (alpha_j is None) or (alpha_j > b - cubic_check) or\
               (alpha_j < a + cubic_check):
                quad_check = quad_delta * delta_alpha
                alpha_j = self._quadmin(alpha_lo, phi_lo, phiprime_lo,
                                        alpha_hi, phi_hi)
                # If quadratic was not successful, try bisection
                if (alpha_j is None) or (alpha_j > b - quad_check) or \
                   (alpha_j < a + quad_check):
                    alpha_j = alpha_lo + 0.5*delta_alpha

            # Check if the current value of alpha_j is already sufficient
            energy_alphaj = self.line_energy.at(alpha_j)
            phi_alphaj = energy_alphaj.value

            # If the first Wolfe condition is not met replace alpha_hi
            # by alpha_j
            if (phi_alphaj > phi_0 + c1*alpha_j*phiprime_0) or\
               (phi_alphaj >= phi_lo):
                alpha_recent, phi_recent = alpha_hi, phi_hi
                alpha_hi, phi_hi = alpha_j, phi_alphaj
            else:
                phiprime_alphaj = energy_alphaj.gradient
                # If the second Wolfe condition is met, return the result
                if abs(phiprime_alphaj) <= -c2*phiprime_0:
                    alpha_star = alpha_j
                    phi_star = phi_alphaj
                    energy_star = energy_alphaj
                    break
                # If not, check the sign of the slope
                if phiprime_alphaj*delta_alpha >= 0:
                    alpha_recent, phi_recent = alpha_hi, phi_hi
                    alpha_hi, phi_hi = alpha_lo, phi_lo
                else:
                    alpha_recent, phi_recent = alpha_lo, phi_lo
                # Replace alpha_lo by alpha_j
                (alpha_lo, phi_lo, phiprime_lo) = (alpha_j, phi_alphaj,
                                                   phiprime_alphaj)

        else:
            alpha_star, phi_star, energy_star = \
                alpha_j, phi_alphaj, energy_alphaj
            self.logger.error("The line search algorithm (zoom) did not "
                              "converge.")

        return alpha_star, phi_star, energy_star

    def _cubicmin(self, a, fa, fpa, b, fb, c, fc):
        """Estimating the minimum with cubic interpolation.
        
309
        Finds the minimizer for a cubic polynomial that goes through the
310
311
        points ( a,f(a) ), ( b,f(b) ), and ( c,f(c) ) with derivative at point a of fpa.
        f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D
312
        If no minimizer can be found return None
        
        Parameters
        ----------
        a : float
            Selected point.
        fa : float
            Value of polynomial at point a.
        fpa : Field
            Derivative at point a.
        b : float
            Selected point.
        fb : float
            Value of polynomial at point b.
        c : float
            Selected point.
        fc : float
            Value of polynomial at point c.
        
        Returns
        -------
        xmin : float
            Position of the approximated minimum.
        
        """

        with np.errstate(divide='raise', over='raise', invalid='raise'):
            try:
                C = fpa
                db = b - a
                dc = c - a
                denom = (db * dc) ** 2 * (db - dc)
                d1 = np.empty((2, 2))
                d1[0, 0] = dc ** 2
                d1[0, 1] = -db ** 2
                d1[1, 0] = -dc ** 3
                d1[1, 1] = db ** 3
                [A, B] = np.dot(d1, np.asarray([fb - fa - C * db,
                                                fc - fa - C * dc]).flatten())
                A /= denom
                B /= denom
                radical = B * B - 3 * A * C
                xmin = a + (-B + np.sqrt(radical)) / (3 * A)
            except ArithmeticError:
                return None
        if not np.isfinite(xmin):
            return None
        return xmin

    def _quadmin(self, a, fa, fpa, b, fb):
        """Estimating the minimum with quadratic interpolation.
        
364
        Finds the minimizer for a quadratic polynomial that goes through
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
        the points ( a,f(a) ), ( b,f(b) ) with derivative at point a of fpa.
        f(x) = B*(x-a)^2 + C*(x-a) + D
        
        Parameters
        ----------
        a : float
            Selected point.
        fa : float
            Value of polynomial at point a.
        fpa : Field
            Derivative at point a.
        b : float
            Selected point.
        fb : float
            Value of polynomial at point b.
        
        Returns
        -------
        xmin : float
            Position of the approximated minimum.       
        """
        # f(x) = B*(x-a)^2 + C*(x-a) + D
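        # D = fa and C = fpa follow from the value and slope at a; B is
        # fixed by f(b) = fb, i.e. B = (fb - D - C*db) / db**2 with
        # db = b - a. The vertex of the parabola sits at
        #     xmin = a - C / (2*B).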
        with np.errstate(divide='raise', over='raise', invalid='raise'):
            try:
                D = fa
                C = fpa
                db = b - a * 1.0
                B = (fb - D - C * db) / (db * db)
                xmin = a - C / (2.0 * B)
            except ArithmeticError:
                return None
        if not np.isfinite(xmin):
            return None
        return xmin