vl_bfgs.py 8.37 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
Theo Steininger's avatar
Theo Steininger committed
13
14
15
16
17
#
# Copyright(C) 2013-2017 Max-Planck-Society
#
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik
# and financially supported by the Studienstiftung des deutschen Volkes.
theos's avatar
theos committed
18

19
20
import numpy as np

21
from .descent_minimizer import DescentMinimizer
22
from .line_searching import LineSearchStrongWolfe
theos's avatar
theos committed
23
24


25
class VL_BFGS(DescentMinimizer):
26
27
    def __init__(self, line_searcher=LineSearchStrongWolfe(), callback=None,
                 convergence_tolerance=1E-4, convergence_level=3,
28
                 iteration_limit=None, max_history_length=5):
29
30
31
32
33
34
35
36
37
38

        super(VL_BFGS, self).__init__(
                                line_searcher=line_searcher,
                                callback=callback,
                                convergence_tolerance=convergence_tolerance,
                                convergence_level=convergence_level,
                                iteration_limit=iteration_limit)

        self.max_history_length = max_history_length

39
    def __call__(self, energy):
40
        self._information_store = None
41
        return super(VL_BFGS, self).__call__(energy)
42

43
    def get_descent_direction(self, energy):
44
45
46
47
48
49
50
51
        """Implementation of the Vector-free L-BFGS minimization scheme.

        Find the descent direction by using the inverse Hessian.
        Instead of storing the whole matrix, it stores only the last few
        updates, which are used to do operations requiring the inverse
        Hessian product. The updates are represented in a new basis to optimize
        the algorithm.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
52
53
        Parameters
        ----------
54
55
56
57
        energy : Energy
            An instance of the Energy class which shall be minized. The
            position of `energy` is used as the starting point of minization.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
58
59
        Returns
        -------
60
        descent_direction : Field
61
62
63
64
65
66
67
            Returns the descent direction.

        References
        ----------
        W. Chen, Z. Wang, J. Zhou, "Large-scale L-BFGS using MapReduce", 2014,
        Microsoft

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
68
        """
69

70
71
        x = energy.position
        gradient = energy.gradient
72
73
74
75
76
77
78
79
80
81
82
        # initialize the information store if it doesn't already exist
        try:
            self._information_store.add_new_point(x, gradient)
        except AttributeError:
            self._information_store = InformationStore(self.max_history_length,
                                                       x0=x,
                                                       gradient=gradient)

        b = self._information_store.b
        delta = self._information_store.delta

83
        descent_direction = delta[0] * b[0]
84
        for i in xrange(1, len(delta)):
85
            descent_direction += delta[i] * b[i]
86

87
        return descent_direction
theos's avatar
theos committed
88
89
90


class InformationStore(object):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
91
    """Class for storing a list of past updates.
92

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
93
94
95
96
97
98
99
100
    Parameters
    ----------
    max_history_length : integer
        Maximum number of stored past updates.
    x0 : Field
        Initial position in variable space.
    gradient : Field
        Gradient at position x0.
101

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
102
103
104
105
106
    Attributes
    ----------
    max_history_length : integer
        Maximum number of stored past updates.
    s : List
Martin Reinecke's avatar
Martin Reinecke committed
107
        Circular buffer of past position differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
108
    y : List
Martin Reinecke's avatar
Martin Reinecke committed
109
        Circular buffer of past gradient differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
110
    last_x : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
111
        Latest position in variable space.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
112
    last_gradient : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
113
        Gradient at latest position.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
114
    k : integer
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
115
        Number of updates that have taken place
Martin Reinecke's avatar
Martin Reinecke committed
116
    ss : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
117
        2D circular buffer of scalar products between different elements of s.
Martin Reinecke's avatar
Martin Reinecke committed
118
    sy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
119
        2D circular buffer of scalar products between elements of s and y.
Martin Reinecke's avatar
Martin Reinecke committed
120
    yy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
121
        2D circular buffer of scalar products between different elements of y.
122

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
123
    """
124
125
    def __init__(self, max_history_length, x0, gradient):
        self.max_history_length = max_history_length
126
127
        self.s = [None]*max_history_length
        self.y = [None]*max_history_length
128
129
        self.last_x = x0.copy()
        self.last_gradient = gradient.copy()
theos's avatar
theos committed
130
        self.k = 0
131

Martin Reinecke's avatar
Martin Reinecke committed
132
        mmax = max_history_length
Martin Reinecke's avatar
Martin Reinecke committed
133
134
135
        self.ss = np.empty((mmax, mmax), dtype=np.float64)
        self.sy = np.empty((mmax, mmax), dtype=np.float64)
        self.yy = np.empty((mmax, mmax), dtype=np.float64)
136
137
138

    @property
    def history_length(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
139
        """Returns the number of currently stored updates.
140

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
141
        """
142
143
144
145
        return min(self.k, self.max_history_length)

    @property
    def b(self):
146
147
        """Combines s, y and gradient to form the new base vectors b.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
148
149
150
151
        Returns
        -------
        result : List
            List of new basis vectors.
152

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
153
        """
154
155
        result = []
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
156
        mmax = self.max_history_length
157
158
159
160
        k = self.k

        s = self.s
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
161
            result.append(s[(k-m+i) % mmax])
162
163
164

        y = self.y
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
165
            result.append(y[(k-m+i) % mmax])
166
167
168
169
170
171
172

        result.append(self.last_gradient)

        return result

    @property
    def b_dot_b(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
173
        """Generates the (2m+1) * (2m+1) scalar matrix.
174

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
175
        The i,j-th element of the matrix is a scalar product between the i-th
176
177
        and j-th base vector.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
178
179
        Returns
        -------
180
        result : numpy.ndarray
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
181
            Scalar matrix.
182

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
183
        """
184
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
185
        mmax = self.max_history_length
186
187
188
        k = self.k
        result = np.empty((2*m+1, 2*m+1), dtype=np.float)

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
189
        # update the stores
Martin Reinecke's avatar
Martin Reinecke committed
190
        k1 = (k-1) % mmax
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
191
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
192
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
193
194
195
            self.ss[kmi, k1] = self.ss[k1, kmi] = self.s[kmi].vdot(self.s[k1])
            self.yy[kmi, k1] = self.yy[k1, kmi] = self.y[kmi].vdot(self.y[k1])
            self.sy[kmi, k1] = self.s[kmi].vdot(self.y[k1])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
196
        for j in xrange(m-1):
Martin Reinecke's avatar
Martin Reinecke committed
197
198
            kmj = (k-m+j) % mmax
            self.sy[k1, kmj] = self.s[k1].vdot(self.y[kmj])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
199

200
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
201
            kmi = (k-m+i) % mmax
202
            for j in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
203
                kmj = (k-m+j) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
204
205
206
                result[i, j] = self.ss[kmi, kmj]
                result[i, m+j] = result[m+j, i] = self.sy[kmi, kmj]
                result[m+i, m+j] = self.yy[kmi, kmj]
207

208
            sgrad_i = self.s[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
209
            result[2*m, i] = result[i, 2*m] = sgrad_i
210

Martin Reinecke's avatar
fix    
Martin Reinecke committed
211
            ygrad_i = self.y[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
212
            result[2*m, m+i] = result[m+i, 2*m] = ygrad_i
213

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
214
        result[2*m, 2*m] = self.last_gradient.norm()
215
216

        return result
theos's avatar
theos committed
217
218

    @property
219
    def delta(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
220
        """Calculates the new scalar coefficients (deltas).
221

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
222
223
224
225
        Returns
        -------
        delta : List
            List of the new scalar coefficients (deltas).
226

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
227
        """
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
        m = self.history_length
        b_dot_b = self.b_dot_b

        delta = np.zeros(2*m+1, dtype=np.float)
        delta[2*m] = -1

        alpha = np.empty(m, dtype=np.float)

        for j in xrange(m-1, -1, -1):
            delta_b_b = sum([delta[l] * b_dot_b[l, j] for l in xrange(2*m+1)])
            alpha[j] = delta_b_b/b_dot_b[j, m+j]
            delta[m+j] -= alpha[j]

        for i in xrange(2*m+1):
            delta[i] *= b_dot_b[m-1, 2*m-1]/b_dot_b[2*m-1, 2*m-1]

Martin Reinecke's avatar
fix    
Martin Reinecke committed
244
        for j in xrange(m):
245
246
247
248
249
250
            delta_b_b = sum([delta[l]*b_dot_b[m+j, l] for l in xrange(2*m+1)])
            beta = delta_b_b/b_dot_b[j, m+j]
            delta[j] += (alpha[j] - beta)

        return delta

theos's avatar
theos committed
251
    def add_new_point(self, x, gradient):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
252
253
        """Updates the s list and y list.

Martin Reinecke's avatar
Martin Reinecke committed
254
255
        Calculates the new position and gradient differences and enters them
        into the respective list.
256

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
257
        """
Martin Reinecke's avatar
Martin Reinecke committed
258
259
260
        mmax = self.max_history_length
        self.s[self.k % mmax] = x - self.last_x
        self.y[self.k % mmax] = gradient - self.last_gradient
theos's avatar
theos committed
261

262
263
        self.last_x = x.copy()
        self.last_gradient = gradient.copy()
theos's avatar
theos committed
264

Martin Reinecke's avatar
fixes    
Martin Reinecke committed
265
        self.k += 1