vl_bfgs.py 8.08 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
Theo Steininger's avatar
Theo Steininger committed
13
14
15
16
17
#
# Copyright(C) 2013-2017 Max-Planck-Society
#
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik
# and financially supported by the Studienstiftung des deutschen Volkes.
theos's avatar
theos committed
18

19
20
import numpy as np

21
from .descent_minimizer import DescentMinimizer
22
from .line_searching import LineSearchStrongWolfe
theos's avatar
theos committed
23
24


25
class VL_BFGS(DescentMinimizer):
Martin Reinecke's avatar
Martin Reinecke committed
26
27
    def __init__(self, controller, line_searcher=LineSearchStrongWolfe(),
                 max_history_length=5):
28
29

        super(VL_BFGS, self).__init__(
Martin Reinecke's avatar
Martin Reinecke committed
30
31
                                controller=controller,
                                line_searcher=line_searcher)
32
33
34

        self.max_history_length = max_history_length

35
    def __call__(self, energy):
36
        self._information_store = None
37
        return super(VL_BFGS, self).__call__(energy)
38

39
    def get_descent_direction(self, energy):
40
41
42
43
44
45
46
47
        """Implementation of the Vector-free L-BFGS minimization scheme.

        Find the descent direction by using the inverse Hessian.
        Instead of storing the whole matrix, it stores only the last few
        updates, which are used to do operations requiring the inverse
        Hessian product. The updates are represented in a new basis to optimize
        the algorithm.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
48
49
        Parameters
        ----------
50
51
52
53
        energy : Energy
            An instance of the Energy class which shall be minized. The
            position of `energy` is used as the starting point of minization.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
54
55
        Returns
        -------
56
        descent_direction : Field
57
58
59
60
61
62
63
            Returns the descent direction.

        References
        ----------
        W. Chen, Z. Wang, J. Zhou, "Large-scale L-BFGS using MapReduce", 2014,
        Microsoft

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
64
        """
65

66
67
        x = energy.position
        gradient = energy.gradient
68
69
70
71
72
73
74
75
76
77
78
        # initialize the information store if it doesn't already exist
        try:
            self._information_store.add_new_point(x, gradient)
        except AttributeError:
            self._information_store = InformationStore(self.max_history_length,
                                                       x0=x,
                                                       gradient=gradient)

        b = self._information_store.b
        delta = self._information_store.delta

79
        descent_direction = delta[0] * b[0]
80
        for i in xrange(1, len(delta)):
81
            descent_direction += delta[i] * b[i]
82

83
        return descent_direction
theos's avatar
theos committed
84
85
86


class InformationStore(object):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
87
    """Class for storing a list of past updates.
88

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
89
90
91
92
93
94
95
96
    Parameters
    ----------
    max_history_length : integer
        Maximum number of stored past updates.
    x0 : Field
        Initial position in variable space.
    gradient : Field
        Gradient at position x0.
97

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
98
99
100
101
102
    Attributes
    ----------
    max_history_length : integer
        Maximum number of stored past updates.
    s : List
Martin Reinecke's avatar
Martin Reinecke committed
103
        Circular buffer of past position differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
104
    y : List
Martin Reinecke's avatar
Martin Reinecke committed
105
        Circular buffer of past gradient differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
106
    last_x : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
107
        Latest position in variable space.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
108
    last_gradient : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
109
        Gradient at latest position.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
110
    k : integer
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
111
        Number of updates that have taken place
Martin Reinecke's avatar
Martin Reinecke committed
112
    ss : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
113
        2D circular buffer of scalar products between different elements of s.
Martin Reinecke's avatar
Martin Reinecke committed
114
    sy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
115
        2D circular buffer of scalar products between elements of s and y.
Martin Reinecke's avatar
Martin Reinecke committed
116
    yy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
117
        2D circular buffer of scalar products between different elements of y.
118

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
119
    """
120
121
    def __init__(self, max_history_length, x0, gradient):
        self.max_history_length = max_history_length
122
123
        self.s = [None]*max_history_length
        self.y = [None]*max_history_length
124
125
        self.last_x = x0.copy()
        self.last_gradient = gradient.copy()
theos's avatar
theos committed
126
        self.k = 0
127

Martin Reinecke's avatar
Martin Reinecke committed
128
        mmax = max_history_length
Martin Reinecke's avatar
Martin Reinecke committed
129
130
131
        self.ss = np.empty((mmax, mmax), dtype=np.float64)
        self.sy = np.empty((mmax, mmax), dtype=np.float64)
        self.yy = np.empty((mmax, mmax), dtype=np.float64)
132
133
134

    @property
    def history_length(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
135
        """Returns the number of currently stored updates.
136

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
137
        """
138
139
140
141
        return min(self.k, self.max_history_length)

    @property
    def b(self):
142
143
        """Combines s, y and gradient to form the new base vectors b.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
144
145
146
147
        Returns
        -------
        result : List
            List of new basis vectors.
148

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
149
        """
150
151
        result = []
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
152
        mmax = self.max_history_length
153
154
155
156
        k = self.k

        s = self.s
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
157
            result.append(s[(k-m+i) % mmax])
158
159
160

        y = self.y
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
161
            result.append(y[(k-m+i) % mmax])
162
163
164
165
166
167
168

        result.append(self.last_gradient)

        return result

    @property
    def b_dot_b(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
169
        """Generates the (2m+1) * (2m+1) scalar matrix.
170

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
171
        The i,j-th element of the matrix is a scalar product between the i-th
172
173
        and j-th base vector.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
174
175
        Returns
        -------
176
        result : numpy.ndarray
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
177
            Scalar matrix.
178

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
179
        """
180
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
181
        mmax = self.max_history_length
182
183
184
        k = self.k
        result = np.empty((2*m+1, 2*m+1), dtype=np.float)

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
185
        # update the stores
Martin Reinecke's avatar
Martin Reinecke committed
186
        k1 = (k-1) % mmax
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
187
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
188
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
189
190
191
            self.ss[kmi, k1] = self.ss[k1, kmi] = self.s[kmi].vdot(self.s[k1])
            self.yy[kmi, k1] = self.yy[k1, kmi] = self.y[kmi].vdot(self.y[k1])
            self.sy[kmi, k1] = self.s[kmi].vdot(self.y[k1])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
192
        for j in xrange(m-1):
Martin Reinecke's avatar
Martin Reinecke committed
193
194
            kmj = (k-m+j) % mmax
            self.sy[k1, kmj] = self.s[k1].vdot(self.y[kmj])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
195

196
        for i in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
197
            kmi = (k-m+i) % mmax
198
            for j in xrange(m):
Martin Reinecke's avatar
Martin Reinecke committed
199
                kmj = (k-m+j) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
200
201
202
                result[i, j] = self.ss[kmi, kmj]
                result[i, m+j] = result[m+j, i] = self.sy[kmi, kmj]
                result[m+i, m+j] = self.yy[kmi, kmj]
203

204
            sgrad_i = self.s[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
205
            result[2*m, i] = result[i, 2*m] = sgrad_i
206

Martin Reinecke's avatar
fix    
Martin Reinecke committed
207
            ygrad_i = self.y[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
208
            result[2*m, m+i] = result[m+i, 2*m] = ygrad_i
209

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
210
        result[2*m, 2*m] = self.last_gradient.norm()
211
212

        return result
theos's avatar
theos committed
213
214

    @property
215
    def delta(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
216
        """Calculates the new scalar coefficients (deltas).
217

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
218
219
220
221
        Returns
        -------
        delta : List
            List of the new scalar coefficients (deltas).
222

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
223
        """
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
        m = self.history_length
        b_dot_b = self.b_dot_b

        delta = np.zeros(2*m+1, dtype=np.float)
        delta[2*m] = -1

        alpha = np.empty(m, dtype=np.float)

        for j in xrange(m-1, -1, -1):
            delta_b_b = sum([delta[l] * b_dot_b[l, j] for l in xrange(2*m+1)])
            alpha[j] = delta_b_b/b_dot_b[j, m+j]
            delta[m+j] -= alpha[j]

        for i in xrange(2*m+1):
            delta[i] *= b_dot_b[m-1, 2*m-1]/b_dot_b[2*m-1, 2*m-1]

Martin Reinecke's avatar
fix    
Martin Reinecke committed
240
        for j in xrange(m):
241
242
243
244
245
246
            delta_b_b = sum([delta[l]*b_dot_b[m+j, l] for l in xrange(2*m+1)])
            beta = delta_b_b/b_dot_b[j, m+j]
            delta[j] += (alpha[j] - beta)

        return delta

theos's avatar
theos committed
247
    def add_new_point(self, x, gradient):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
248
249
        """Updates the s list and y list.

Martin Reinecke's avatar
Martin Reinecke committed
250
251
        Calculates the new position and gradient differences and enters them
        into the respective list.
252

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
253
        """
Martin Reinecke's avatar
Martin Reinecke committed
254
255
256
        mmax = self.max_history_length
        self.s[self.k % mmax] = x - self.last_x
        self.y[self.k % mmax] = gradient - self.last_gradient
theos's avatar
theos committed
257

258
259
        self.last_x = x.copy()
        self.last_gradient = gradient.copy()
theos's avatar
theos committed
260

Martin Reinecke's avatar
fixes    
Martin Reinecke committed
261
        self.k += 1