descent_minimizers.py 14.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
Theo Steininger's avatar
Theo Steininger committed
13
#
Martin Reinecke's avatar
Martin Reinecke committed
14
# Copyright(C) 2013-2018 Max-Planck-Society
Theo Steininger's avatar
Theo Steininger committed
15
16
17
#
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik
# and financially supported by the Studienstiftung des deutschen Volkes.
Theo Steininger's avatar
Theo Steininger committed
18

19
from __future__ import absolute_import, division, print_function
20

21
import numpy as np
Philipp Arras's avatar
Philipp Arras committed
22

23
from ..compat import *
24
from ..logger import logger
Martin Reinecke's avatar
Martin Reinecke committed
25
from .line_search_strong_wolfe import LineSearchStrongWolfe
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from .minimizer import Minimizer


class DescentMinimizer(Minimizer):
    """ A base class used by gradient methods to find a local minimum.

    Descent minimization methods are used to find a local minimum of a scalar
    function by following a descent direction. This class implements the
    minimization procedure once a descent direction is known. The descent
    direction has to be implemented separately.

    Parameters
    ----------
    controller : IterationController
        Object that decides when to terminate the minimization.
    line_searcher : callable *optional*
        Function which infers the step size in the descent direction
        (default : LineSearchStrongWolfe()).
    """

    def __init__(self, controller, line_searcher=LineSearchStrongWolfe()):
        self._controller = controller
        self.line_searcher = line_searcher

    def __call__(self, energy):
        """ Performs the minimization of the provided Energy functional.

        Parameters
        ----------
        energy : Energy
           Energy object which provides value, gradient and metric at a
           specific position in parameter space.

        Returns
        -------
        Energy
            Latest `energy` of the minimization.
        int
            Can be controller.CONVERGED or controller.ERROR

        Notes
        -----
        The minimization is stopped if
            * the controller returns controller.CONVERGED or controller.ERROR,
            * a perfectly flat point is reached,
            * according to the line-search the minimum is found,
        """
        f_k_minus_1 = None
        controller = self._controller
        status = controller.start(energy)
        if status != controller.CONTINUE:
            return energy, status

        while True:
            # check if position is at a flat point
            if energy.gradient_norm == 0:
                return energy, controller.CONVERGED

            # compute a step length that reduces energy.value sufficiently
            new_energy, success = self.line_searcher.perform_line_search(
                energy=energy, pk=self.get_descent_direction(energy),
                f_k_minus_1=f_k_minus_1)
            if not success:
                self.reset()

            f_k_minus_1 = energy.value

            if new_energy.value > energy.value:
                logger.error("Error: Energy has increased")
                return energy, controller.ERROR

            if new_energy.value == energy.value:
                logger.warning(
                    "Warning: Energy has not changed. Assuming convergence...")
                return new_energy, controller.CONVERGED

            energy = new_energy
            status = self._controller.check(energy)
            if status != controller.CONTINUE:
                return energy, status

    def reset(self):
        pass

    def get_descent_direction(self, energy):
        """ Calculates the next descent direction.

        Parameters
        ----------
        energy : Energy
            An instance of the Energy class which shall be minimized. The
            position of `energy` is used as the starting point of minimization.

        Returns
        -------
        Field
           The descent direction.
        """
        raise NotImplementedError


class SteepestDescent(DescentMinimizer):
    """ Implementation of the steepest descent minimization scheme.

    Also known as 'gradient descent'. This algorithm simply follows the
    functional's gradient for minimization.
    """

    def get_descent_direction(self, energy):
        return -energy.gradient


138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
class NewtonCG(DescentMinimizer):
    """ Calculates the descent direction according to a Newton-CG scheme.

    Algorithm derived from SciPy sources.
    """

    def __init__(self, controller, line_searcher=None):
        if line_searcher is None:
            line_searcher = LineSearchStrongWolfe(
                preferred_initial_step_size=1.)
        super(NewtonCG, self).__init__(controller=controller,
                                       line_searcher=line_searcher)

    def get_descent_direction(self, energy):
        float64eps = np.finfo(np.float64).eps
        grad = energy.gradient
        maggrad = abs(grad).sum()
Martin Reinecke's avatar
Martin Reinecke committed
155
        termcond = np.min([0.5, np.sqrt(maggrad)]) * maggrad
156
157
158
159
160
        xsupi = energy.position*0
        ri = grad
        psupi = -ri
        dri0 = ri.vdot(ri)

Martin Reinecke's avatar
Martin Reinecke committed
161
        i = 0
162
163
164
        while True:
            if abs(ri).sum() <= termcond:
                return xsupi
Martin Reinecke's avatar
Martin Reinecke committed
165
            Ap = energy.apply_metric(psupi)
166
167
            # check curvature
            curv = psupi.vdot(Ap)
Martin Reinecke's avatar
Martin Reinecke committed
168
            if 0 <= curv <= 3*float64eps:
169
170
                return xsupi
            elif curv < 0:
Martin Reinecke's avatar
Martin Reinecke committed
171
                return xsupi if i > 0 else (dri0/curv) * grad
172
173
174
175
176
177
178
179
180
181
182
183
184
            alphai = dri0/curv
            xsupi = xsupi + alphai*psupi
            ri = ri + alphai*Ap
            dri1 = ri.vdot(ri)
            psupi = (dri1/dri0)*psupi - ri
            i += 1
            dri0 = dri1  # update numpy.dot(ri,ri) for next time.

        # curvature keeps increasing, bail out
        raise ValueError("Warning: CG iterations didn't converge. "
                         "The Hessian is not positive definite.")


185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
class L_BFGS(DescentMinimizer):
    def __init__(self, controller, line_searcher=LineSearchStrongWolfe(),
                 max_history_length=5):
        super(L_BFGS, self).__init__(controller=controller,
                                     line_searcher=line_searcher)
        self.max_history_length = max_history_length

    def __call__(self, energy):
        self.reset()
        return super(L_BFGS, self).__call__(energy)

    def reset(self):
        self._k = 0
        self._s = [None]*self.max_history_length
        self._y = [None]*self.max_history_length

    def get_descent_direction(self, energy):
        x = energy.position
        s = self._s
        y = self._y
        k = self._k
        maxhist = self.max_history_length
        gradient = energy.gradient

        nhist = min(k, maxhist)
        alpha = [None]*maxhist
        p = -gradient
        if k > 0:
            idx = (k-1) % maxhist
            s[idx] = x-self._lastx
            y[idx] = gradient-self._lastgrad
        if nhist > 0:
            for i in range(k-1, k-nhist-1, -1):
                idx = i % maxhist
                alpha[idx] = s[idx].vdot(p)/s[idx].vdot(y[idx])
                p = p - alpha[idx]*y[idx]
            idx = (k-1) % maxhist
            fact = s[idx].vdot(y[idx]) / y[idx].vdot(y[idx])
            if fact <= 0.:
                logger.error("L-BFGS curvature not positive definite!")
            p = p*fact
            for i in range(k-nhist, k):
                idx = i % maxhist
                beta = y[idx].vdot(p) / s[idx].vdot(y[idx])
                p = p + (alpha[idx]-beta)*s[idx]
        self._lastx = x
        self._lastgrad = gradient
        self._k += 1
        return p
Theo Steininger's avatar
Theo Steininger committed
234
235


236
class VL_BFGS(DescentMinimizer):
Martin Reinecke's avatar
Martin Reinecke committed
237
238
239
240
241
242
243
244
245
246
247
248
249
250
    """Implementation of the Vector-free L-BFGS minimization scheme.

    Find the descent direction by using the inverse Hessian.
    Instead of storing the whole matrix, it stores only the last few
    updates, which are used to do operations requiring the inverse
    Hessian product. The updates are represented in a new basis to optimize
    the algorithm.

    References
    ----------
    W. Chen, Z. Wang, J. Zhou, "Large-scale L-BFGS using MapReduce", 2014,
    Microsoft
    """

Martin Reinecke's avatar
Martin Reinecke committed
251
252
    def __init__(self, controller, line_searcher=LineSearchStrongWolfe(),
                 max_history_length=5):
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
253
254
        super(VL_BFGS, self).__init__(controller=controller,
                                      line_searcher=line_searcher)
255
256
        self.max_history_length = max_history_length

257
    def __call__(self, energy):
258
        self._information_store = None
259
        return super(VL_BFGS, self).__call__(energy)
260

Martin Reinecke's avatar
stage 1    
Martin Reinecke committed
261
262
263
    def reset(self):
        self._information_store = None

264
    def get_descent_direction(self, energy):
265
266
        x = energy.position
        gradient = energy.gradient
267
268
269
270
        # initialize the information store if it doesn't already exist
        try:
            self._information_store.add_new_point(x, gradient)
        except AttributeError:
Martin Reinecke's avatar
Martin Reinecke committed
271
272
            self._information_store = _InformationStore(
                self.max_history_length, x0=x, gradient=gradient)
273
274
275
276

        b = self._information_store.b
        delta = self._information_store.delta

277
        descent_direction = delta[0] * b[0]
Martin Reinecke's avatar
Martin Reinecke committed
278
        for i in range(1, len(delta)):
279
            descent_direction = descent_direction + delta[i]*b[i]
280

281
        return descent_direction
Theo Steininger's avatar
Theo Steininger committed
282
283


Martin Reinecke's avatar
Martin Reinecke committed
284
class _InformationStore(object):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
285
    """Class for storing a list of past updates.
286

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
287
288
    Parameters
    ----------
Martin Reinecke's avatar
Martin Reinecke committed
289
    max_history_length : int
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
290
291
292
293
294
        Maximum number of stored past updates.
    x0 : Field
        Initial position in variable space.
    gradient : Field
        Gradient at position x0.
295

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
296
297
    Attributes
    ----------
Martin Reinecke's avatar
Martin Reinecke committed
298
    max_history_length : int
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
299
300
        Maximum number of stored past updates.
    s : List
Martin Reinecke's avatar
Martin Reinecke committed
301
        Circular buffer of past position differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
302
    y : List
Martin Reinecke's avatar
Martin Reinecke committed
303
        Circular buffer of past gradient differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
304
    last_x : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
305
        Latest position in variable space.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
306
    last_gradient : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
307
        Gradient at latest position.
Martin Reinecke's avatar
Martin Reinecke committed
308
    k : int
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
309
        Number of updates that have taken place
Martin Reinecke's avatar
Martin Reinecke committed
310
    ss : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
311
        2D circular buffer of scalar products between different elements of s.
Martin Reinecke's avatar
Martin Reinecke committed
312
    sy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
313
        2D circular buffer of scalar products between elements of s and y.
Martin Reinecke's avatar
Martin Reinecke committed
314
    yy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
315
        2D circular buffer of scalar products between different elements of y.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
316
    """
Philipp Arras's avatar
Philipp Arras committed
317

318
319
    def __init__(self, max_history_length, x0, gradient):
        self.max_history_length = max_history_length
320
321
        self.s = [None]*max_history_length
        self.y = [None]*max_history_length
322
323
        self.last_x = x0
        self.last_gradient = gradient
Theo Steininger's avatar
Theo Steininger committed
324
        self.k = 0
325

Martin Reinecke's avatar
Martin Reinecke committed
326
        mmax = max_history_length
Martin Reinecke's avatar
Martin Reinecke committed
327
328
329
        self.ss = np.empty((mmax, mmax), dtype=np.float64)
        self.sy = np.empty((mmax, mmax), dtype=np.float64)
        self.yy = np.empty((mmax, mmax), dtype=np.float64)
330
331
332

    @property
    def history_length(self):
Martin Reinecke's avatar
Martin Reinecke committed
333
        """Returns the number of currently stored updates."""
334
335
336
337
        return min(self.k, self.max_history_length)

    @property
    def b(self):
338
339
        """Combines s, y and gradient to form the new base vectors b.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
340
341
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
342
        List
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
343
344
            List of new basis vectors.
        """
345
346
        result = []
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
347
        mmax = self.max_history_length
348

Martin Reinecke's avatar
Martin Reinecke committed
349
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
350
            result.append(self.s[(self.k-m+i) % mmax])
351

Martin Reinecke's avatar
Martin Reinecke committed
352
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
353
            result.append(self.y[(self.k-m+i) % mmax])
354
355
356
357
358
359
360

        result.append(self.last_gradient)

        return result

    @property
    def b_dot_b(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
361
        """Generates the (2m+1) * (2m+1) scalar matrix.
362

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
363
        The i,j-th element of the matrix is a scalar product between the i-th
364
365
        and j-th base vector.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
366
367
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
368
        numpy.ndarray
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
369
370
            Scalar matrix.
        """
371
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
372
        mmax = self.max_history_length
373
374
375
        k = self.k
        result = np.empty((2*m+1, 2*m+1), dtype=np.float)

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
376
        # update the stores
Martin Reinecke's avatar
Martin Reinecke committed
377
        k1 = (k-1) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
378
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
379
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
380
381
382
            self.ss[kmi, k1] = self.ss[k1, kmi] = self.s[kmi].vdot(self.s[k1])
            self.yy[kmi, k1] = self.yy[k1, kmi] = self.y[kmi].vdot(self.y[k1])
            self.sy[kmi, k1] = self.s[kmi].vdot(self.y[k1])
Martin Reinecke's avatar
Martin Reinecke committed
383
        for j in range(m-1):
Martin Reinecke's avatar
Martin Reinecke committed
384
385
            kmj = (k-m+j) % mmax
            self.sy[k1, kmj] = self.s[k1].vdot(self.y[kmj])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
386

Martin Reinecke's avatar
Martin Reinecke committed
387
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
388
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
389
            for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
390
                kmj = (k-m+j) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
391
392
393
                result[i, j] = self.ss[kmi, kmj]
                result[i, m+j] = result[m+j, i] = self.sy[kmi, kmj]
                result[m+i, m+j] = self.yy[kmi, kmj]
394

395
            sgrad_i = self.s[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
396
            result[2*m, i] = result[i, 2*m] = sgrad_i
397

Martin Reinecke's avatar
fix    
Martin Reinecke committed
398
            ygrad_i = self.y[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
399
            result[2*m, m+i] = result[m+i, 2*m] = ygrad_i
400

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
401
        result[2*m, 2*m] = self.last_gradient.norm()
402
        return result
Theo Steininger's avatar
Theo Steininger committed
403
404

    @property
405
    def delta(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
406
        """Calculates the new scalar coefficients (deltas).
407

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
408
409
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
410
        List
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
411
412
            List of the new scalar coefficients (deltas).
        """
413
414
415
416
417
418
419
420
        m = self.history_length
        b_dot_b = self.b_dot_b

        delta = np.zeros(2*m+1, dtype=np.float)
        delta[2*m] = -1

        alpha = np.empty(m, dtype=np.float)

Martin Reinecke's avatar
Martin Reinecke committed
421
422
        for j in range(m-1, -1, -1):
            delta_b_b = sum([delta[l] * b_dot_b[l, j] for l in range(2*m+1)])
423
424
425
            alpha[j] = delta_b_b/b_dot_b[j, m+j]
            delta[m+j] -= alpha[j]

Martin Reinecke's avatar
Martin Reinecke committed
426
        for i in range(2*m+1):
427
428
            delta[i] *= b_dot_b[m-1, 2*m-1]/b_dot_b[2*m-1, 2*m-1]

Martin Reinecke's avatar
Martin Reinecke committed
429
        for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
430
            delta_b_b = sum([delta[l]*b_dot_b[m+j, l] for l in range(2*m+1)])
431
432
433
434
435
            beta = delta_b_b/b_dot_b[j, m+j]
            delta[j] += (alpha[j] - beta)

        return delta

Theo Steininger's avatar
Theo Steininger committed
436
    def add_new_point(self, x, gradient):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
437
438
        """Updates the s list and y list.

Martin Reinecke's avatar
Martin Reinecke committed
439
440
        Calculates the new position and gradient differences and enters them
        into the respective list.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
441
        """
Martin Reinecke's avatar
Martin Reinecke committed
442
443
444
        mmax = self.max_history_length
        self.s[self.k % mmax] = x - self.last_x
        self.y[self.k % mmax] = gradient - self.last_gradient
Theo Steininger's avatar
Theo Steininger committed
445

446
447
        self.last_x = x
        self.last_gradient = gradient
Theo Steininger's avatar
Theo Steininger committed
448

Martin Reinecke's avatar
fixes    
Martin Reinecke committed
449
        self.k += 1