descent_minimizers.py 14.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
Theo Steininger's avatar
Theo Steininger committed
13
#
14
# Copyright(C) 2013-2019 Max-Planck-Society
Theo Steininger's avatar
Theo Steininger committed
15
#
16
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik.
17

18
import numpy as np
Philipp Arras's avatar
Philipp Arras committed
19

20
from ..compat import *
21
from ..logger import logger
Martin Reinecke's avatar
Martin Reinecke committed
22
from .line_search import LineSearch
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from .minimizer import Minimizer


class DescentMinimizer(Minimizer):
    """ A base class used by gradient methods to find a local minimum.

    Descent minimization methods are used to find a local minimum of a scalar
    function by following a descent direction. This class implements the
    minimization procedure once a descent direction is known. The descent
    direction has to be implemented separately.

    Parameters
    ----------
    controller : IterationController
        Object that decides when to terminate the minimization.
    line_searcher : callable *optional*
        Function which infers the step size in the descent direction
Martin Reinecke's avatar
Martin Reinecke committed
40
        (default : LineSearch()).
41
42
    """

Martin Reinecke's avatar
Martin Reinecke committed
43
    def __init__(self, controller, line_searcher=LineSearch()):
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
        self._controller = controller
        self.line_searcher = line_searcher

    def __call__(self, energy):
        """ Performs the minimization of the provided Energy functional.

        Parameters
        ----------
        energy : Energy
           Energy object which provides value, gradient and metric at a
           specific position in parameter space.

        Returns
        -------
        Energy
            Latest `energy` of the minimization.
        int
            Can be controller.CONVERGED or controller.ERROR

        Notes
        -----
        The minimization is stopped if
            * the controller returns controller.CONVERGED or controller.ERROR,
            * a perfectly flat point is reached,
            * according to the line-search the minimum is found,
        """
        f_k_minus_1 = None
        controller = self._controller
        status = controller.start(energy)
        if status != controller.CONTINUE:
            return energy, status

        while True:
            # check if position is at a flat point
            if energy.gradient_norm == 0:
                return energy, controller.CONVERGED

            # compute a step length that reduces energy.value sufficiently
            new_energy, success = self.line_searcher.perform_line_search(
                energy=energy, pk=self.get_descent_direction(energy),
                f_k_minus_1=f_k_minus_1)
            if not success:
                self.reset()

            f_k_minus_1 = energy.value

            if new_energy.value > energy.value:
                logger.error("Error: Energy has increased")
                return energy, controller.ERROR

            if new_energy.value == energy.value:
                logger.warning(
                    "Warning: Energy has not changed. Assuming convergence...")
                return new_energy, controller.CONVERGED

            energy = new_energy
            status = self._controller.check(energy)
            if status != controller.CONTINUE:
                return energy, status

    def reset(self):
        pass

    def get_descent_direction(self, energy):
        """ Calculates the next descent direction.

        Parameters
        ----------
        energy : Energy
            An instance of the Energy class which shall be minimized. The
            position of `energy` is used as the starting point of minimization.

        Returns
        -------
        Field
           The descent direction.
        """
        raise NotImplementedError


class SteepestDescent(DescentMinimizer):
    """ Implementation of the steepest descent minimization scheme.

    Also known as 'gradient descent'. This algorithm simply follows the
    functional's gradient for minimization.
    """

    def get_descent_direction(self, energy):
        return -energy.gradient


Martin Reinecke's avatar
Martin Reinecke committed
135
136
137
138
139
140
141
142
143
class RelaxedNewton(DescentMinimizer):
    """ Calculates the descent direction according to a Newton scheme.

    The descent direction is determined by weighting the gradient at the
    current parameter position with the inverse local metric.
    """

    def __init__(self, controller, line_searcher=None):
        if line_searcher is None:
Martin Reinecke's avatar
Martin Reinecke committed
144
            line_searcher = LineSearch(preferred_initial_step_size=1.)
Martin Reinecke's avatar
Martin Reinecke committed
145
146
147
148
149
150
151
        super(RelaxedNewton, self).__init__(controller=controller,
                                            line_searcher=line_searcher)

    def get_descent_direction(self, energy):
        return -energy.metric.inverse_times(energy.gradient)


152
153
154
155
156
157
158
159
class NewtonCG(DescentMinimizer):
    """ Calculates the descent direction according to a Newton-CG scheme.

    Algorithm derived from SciPy sources.
    """

    def __init__(self, controller, line_searcher=None):
        if line_searcher is None:
Martin Reinecke's avatar
Martin Reinecke committed
160
            line_searcher = LineSearch(preferred_initial_step_size=1.)
161
162
163
164
165
166
167
        super(NewtonCG, self).__init__(controller=controller,
                                       line_searcher=line_searcher)

    def get_descent_direction(self, energy):
        float64eps = np.finfo(np.float64).eps
        grad = energy.gradient
        maggrad = abs(grad).sum()
Martin Reinecke's avatar
Martin Reinecke committed
168
        termcond = np.min([0.5, np.sqrt(maggrad)]) * maggrad
169
170
171
172
173
        xsupi = energy.position*0
        ri = grad
        psupi = -ri
        dri0 = ri.vdot(ri)

Martin Reinecke's avatar
Martin Reinecke committed
174
        i = 0
175
176
177
        while True:
            if abs(ri).sum() <= termcond:
                return xsupi
Martin Reinecke's avatar
Martin Reinecke committed
178
            Ap = energy.apply_metric(psupi)
179
180
            # check curvature
            curv = psupi.vdot(Ap)
Martin Reinecke's avatar
Martin Reinecke committed
181
            if 0 <= curv <= 3*float64eps:
182
183
                return xsupi
            elif curv < 0:
Martin Reinecke's avatar
Martin Reinecke committed
184
                return xsupi if i > 0 else (dri0/curv) * grad
185
186
187
188
189
190
191
192
193
194
195
196
197
            alphai = dri0/curv
            xsupi = xsupi + alphai*psupi
            ri = ri + alphai*Ap
            dri1 = ri.vdot(ri)
            psupi = (dri1/dri0)*psupi - ri
            i += 1
            dri0 = dri1  # update numpy.dot(ri,ri) for next time.

        # curvature keeps increasing, bail out
        raise ValueError("Warning: CG iterations didn't converge. "
                         "The Hessian is not positive definite.")


198
class L_BFGS(DescentMinimizer):
Martin Reinecke's avatar
Martin Reinecke committed
199
    def __init__(self, controller, line_searcher=LineSearch(),
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
                 max_history_length=5):
        super(L_BFGS, self).__init__(controller=controller,
                                     line_searcher=line_searcher)
        self.max_history_length = max_history_length

    def __call__(self, energy):
        self.reset()
        return super(L_BFGS, self).__call__(energy)

    def reset(self):
        self._k = 0
        self._s = [None]*self.max_history_length
        self._y = [None]*self.max_history_length

    def get_descent_direction(self, energy):
        x = energy.position
        s = self._s
        y = self._y
        k = self._k
        maxhist = self.max_history_length
        gradient = energy.gradient

        nhist = min(k, maxhist)
        alpha = [None]*maxhist
        p = -gradient
        if k > 0:
            idx = (k-1) % maxhist
            s[idx] = x-self._lastx
            y[idx] = gradient-self._lastgrad
        if nhist > 0:
            for i in range(k-1, k-nhist-1, -1):
                idx = i % maxhist
                alpha[idx] = s[idx].vdot(p)/s[idx].vdot(y[idx])
                p = p - alpha[idx]*y[idx]
            idx = (k-1) % maxhist
            fact = s[idx].vdot(y[idx]) / y[idx].vdot(y[idx])
            if fact <= 0.:
                logger.error("L-BFGS curvature not positive definite!")
            p = p*fact
            for i in range(k-nhist, k):
                idx = i % maxhist
                beta = y[idx].vdot(p) / s[idx].vdot(y[idx])
                p = p + (alpha[idx]-beta)*s[idx]
        self._lastx = x
        self._lastgrad = gradient
        self._k += 1
        return p
Theo Steininger's avatar
Theo Steininger committed
247
248


249
class VL_BFGS(DescentMinimizer):
Martin Reinecke's avatar
Martin Reinecke committed
250
251
252
253
254
255
256
257
258
259
260
261
262
263
    """Implementation of the Vector-free L-BFGS minimization scheme.

    Find the descent direction by using the inverse Hessian.
    Instead of storing the whole matrix, it stores only the last few
    updates, which are used to do operations requiring the inverse
    Hessian product. The updates are represented in a new basis to optimize
    the algorithm.

    References
    ----------
    W. Chen, Z. Wang, J. Zhou, "Large-scale L-BFGS using MapReduce", 2014,
    Microsoft
    """

Martin Reinecke's avatar
Martin Reinecke committed
264
    def __init__(self, controller, line_searcher=LineSearch(),
Martin Reinecke's avatar
Martin Reinecke committed
265
                 max_history_length=5):
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
266
267
        super(VL_BFGS, self).__init__(controller=controller,
                                      line_searcher=line_searcher)
268
269
        self.max_history_length = max_history_length

270
    def __call__(self, energy):
271
        self._information_store = None
272
        return super(VL_BFGS, self).__call__(energy)
273

Martin Reinecke's avatar
stage 1    
Martin Reinecke committed
274
275
276
    def reset(self):
        self._information_store = None

277
    def get_descent_direction(self, energy):
278
279
        x = energy.position
        gradient = energy.gradient
280
281
282
283
        # initialize the information store if it doesn't already exist
        try:
            self._information_store.add_new_point(x, gradient)
        except AttributeError:
Martin Reinecke's avatar
Martin Reinecke committed
284
285
            self._information_store = _InformationStore(
                self.max_history_length, x0=x, gradient=gradient)
286
287
288
289

        b = self._information_store.b
        delta = self._information_store.delta

290
        descent_direction = delta[0] * b[0]
Martin Reinecke's avatar
Martin Reinecke committed
291
        for i in range(1, len(delta)):
292
            descent_direction = descent_direction + delta[i]*b[i]
293

294
        return descent_direction
Theo Steininger's avatar
Theo Steininger committed
295
296


Martin Reinecke's avatar
Martin Reinecke committed
297
class _InformationStore(object):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
298
    """Class for storing a list of past updates.
299

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
300
301
    Parameters
    ----------
Martin Reinecke's avatar
Martin Reinecke committed
302
    max_history_length : int
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
303
304
305
306
307
        Maximum number of stored past updates.
    x0 : Field
        Initial position in variable space.
    gradient : Field
        Gradient at position x0.
308

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
309
310
    Attributes
    ----------
Martin Reinecke's avatar
Martin Reinecke committed
311
    max_history_length : int
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
312
313
        Maximum number of stored past updates.
    s : List
Martin Reinecke's avatar
Martin Reinecke committed
314
        Circular buffer of past position differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
315
    y : List
Martin Reinecke's avatar
Martin Reinecke committed
316
        Circular buffer of past gradient differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
317
    last_x : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
318
        Latest position in variable space.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
319
    last_gradient : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
320
        Gradient at latest position.
Martin Reinecke's avatar
Martin Reinecke committed
321
    k : int
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
322
        Number of updates that have taken place
Martin Reinecke's avatar
Martin Reinecke committed
323
    ss : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
324
        2D circular buffer of scalar products between different elements of s.
Martin Reinecke's avatar
Martin Reinecke committed
325
    sy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
326
        2D circular buffer of scalar products between elements of s and y.
Martin Reinecke's avatar
Martin Reinecke committed
327
    yy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
328
        2D circular buffer of scalar products between different elements of y.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
329
    """
Philipp Arras's avatar
Philipp Arras committed
330

331
332
    def __init__(self, max_history_length, x0, gradient):
        self.max_history_length = max_history_length
333
334
        self.s = [None]*max_history_length
        self.y = [None]*max_history_length
335
336
        self.last_x = x0
        self.last_gradient = gradient
Theo Steininger's avatar
Theo Steininger committed
337
        self.k = 0
338

Martin Reinecke's avatar
Martin Reinecke committed
339
        mmax = max_history_length
Martin Reinecke's avatar
Martin Reinecke committed
340
341
342
        self.ss = np.empty((mmax, mmax), dtype=np.float64)
        self.sy = np.empty((mmax, mmax), dtype=np.float64)
        self.yy = np.empty((mmax, mmax), dtype=np.float64)
343
344
345

    @property
    def history_length(self):
Martin Reinecke's avatar
Martin Reinecke committed
346
        """Returns the number of currently stored updates."""
347
348
349
350
        return min(self.k, self.max_history_length)

    @property
    def b(self):
351
352
        """Combines s, y and gradient to form the new base vectors b.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
353
354
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
355
        List
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
356
357
            List of new basis vectors.
        """
358
359
        result = []
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
360
        mmax = self.max_history_length
361

Martin Reinecke's avatar
Martin Reinecke committed
362
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
363
            result.append(self.s[(self.k-m+i) % mmax])
364

Martin Reinecke's avatar
Martin Reinecke committed
365
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
366
            result.append(self.y[(self.k-m+i) % mmax])
367
368
369
370
371
372
373

        result.append(self.last_gradient)

        return result

    @property
    def b_dot_b(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
374
        """Generates the (2m+1) * (2m+1) scalar matrix.
375

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
376
        The i,j-th element of the matrix is a scalar product between the i-th
377
378
        and j-th base vector.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
379
380
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
381
        numpy.ndarray
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
382
383
            Scalar matrix.
        """
384
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
385
        mmax = self.max_history_length
386
387
388
        k = self.k
        result = np.empty((2*m+1, 2*m+1), dtype=np.float)

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
389
        # update the stores
Martin Reinecke's avatar
Martin Reinecke committed
390
        k1 = (k-1) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
391
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
392
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
393
394
395
            self.ss[kmi, k1] = self.ss[k1, kmi] = self.s[kmi].vdot(self.s[k1])
            self.yy[kmi, k1] = self.yy[k1, kmi] = self.y[kmi].vdot(self.y[k1])
            self.sy[kmi, k1] = self.s[kmi].vdot(self.y[k1])
Martin Reinecke's avatar
Martin Reinecke committed
396
        for j in range(m-1):
Martin Reinecke's avatar
Martin Reinecke committed
397
398
            kmj = (k-m+j) % mmax
            self.sy[k1, kmj] = self.s[k1].vdot(self.y[kmj])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
399

Martin Reinecke's avatar
Martin Reinecke committed
400
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
401
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
402
            for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
403
                kmj = (k-m+j) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
404
405
406
                result[i, j] = self.ss[kmi, kmj]
                result[i, m+j] = result[m+j, i] = self.sy[kmi, kmj]
                result[m+i, m+j] = self.yy[kmi, kmj]
407

408
            sgrad_i = self.s[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
409
            result[2*m, i] = result[i, 2*m] = sgrad_i
410

Martin Reinecke's avatar
fix    
Martin Reinecke committed
411
            ygrad_i = self.y[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
412
            result[2*m, m+i] = result[m+i, 2*m] = ygrad_i
413

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
414
        result[2*m, 2*m] = self.last_gradient.norm()
415
        return result
Theo Steininger's avatar
Theo Steininger committed
416
417

    @property
418
    def delta(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
419
        """Calculates the new scalar coefficients (deltas).
420

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
421
422
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
423
        List
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
424
425
            List of the new scalar coefficients (deltas).
        """
426
427
428
429
430
431
432
433
        m = self.history_length
        b_dot_b = self.b_dot_b

        delta = np.zeros(2*m+1, dtype=np.float)
        delta[2*m] = -1

        alpha = np.empty(m, dtype=np.float)

Martin Reinecke's avatar
Martin Reinecke committed
434
435
        for j in range(m-1, -1, -1):
            delta_b_b = sum([delta[l] * b_dot_b[l, j] for l in range(2*m+1)])
436
437
438
            alpha[j] = delta_b_b/b_dot_b[j, m+j]
            delta[m+j] -= alpha[j]

Martin Reinecke's avatar
Martin Reinecke committed
439
        for i in range(2*m+1):
440
441
            delta[i] *= b_dot_b[m-1, 2*m-1]/b_dot_b[2*m-1, 2*m-1]

Martin Reinecke's avatar
Martin Reinecke committed
442
        for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
443
            delta_b_b = sum([delta[l]*b_dot_b[m+j, l] for l in range(2*m+1)])
444
445
446
447
448
            beta = delta_b_b/b_dot_b[j, m+j]
            delta[j] += (alpha[j] - beta)

        return delta

Theo Steininger's avatar
Theo Steininger committed
449
    def add_new_point(self, x, gradient):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
450
451
        """Updates the s list and y list.

Martin Reinecke's avatar
Martin Reinecke committed
452
453
        Calculates the new position and gradient differences and enters them
        into the respective list.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
454
        """
Martin Reinecke's avatar
Martin Reinecke committed
455
456
457
        mmax = self.max_history_length
        self.s[self.k % mmax] = x - self.last_x
        self.y[self.k % mmax] = gradient - self.last_gradient
Theo Steininger's avatar
Theo Steininger committed
458

459
460
        self.last_x = x
        self.last_gradient = gradient
Theo Steininger's avatar
Theo Steininger committed
461

Martin Reinecke's avatar
fixes    
Martin Reinecke committed
462
        self.k += 1