vl_bfgs.py 7.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
Theo Steininger's avatar
Theo Steininger committed
13
#
Martin Reinecke's avatar
Martin Reinecke committed
14
# Copyright(C) 2013-2018 Max-Planck-Society
Theo Steininger's avatar
Theo Steininger committed
15
16
17
#
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik
# and financially supported by the Studienstiftung des deutschen Volkes.
theos's avatar
theos committed
18

Martin Reinecke's avatar
Martin Reinecke committed
19
20
21
from __future__ import division
from builtins import range
from builtins import object
22
import numpy as np
23
from .descent_minimizer import DescentMinimizer
Martin Reinecke's avatar
Martin Reinecke committed
24
from .line_search_strong_wolfe import LineSearchStrongWolfe
theos's avatar
theos committed
25
26


27
class VL_BFGS(DescentMinimizer):
Martin Reinecke's avatar
Martin Reinecke committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
    """Implementation of the Vector-free L-BFGS minimization scheme.

    Find the descent direction by using the inverse Hessian.
    Instead of storing the whole matrix, it stores only the last few
    updates, which are used to do operations requiring the inverse
    Hessian product. The updates are represented in a new basis to optimize
    the algorithm.

    References
    ----------
    W. Chen, Z. Wang, J. Zhou, "Large-scale L-BFGS using MapReduce", 2014,
    Microsoft
    """

Martin Reinecke's avatar
Martin Reinecke committed
42
43
    def __init__(self, controller, line_searcher=LineSearchStrongWolfe(),
                 max_history_length=5):
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
44
45
        super(VL_BFGS, self).__init__(controller=controller,
                                      line_searcher=line_searcher)
46
47
        self.max_history_length = max_history_length

48
    def __call__(self, energy):
49
        self._information_store = None
50
        return super(VL_BFGS, self).__call__(energy)
51

52
    def get_descent_direction(self, energy):
53
54
        x = energy.position
        gradient = energy.gradient
55
56
57
58
        # initialize the information store if it doesn't already exist
        try:
            self._information_store.add_new_point(x, gradient)
        except AttributeError:
Martin Reinecke's avatar
Martin Reinecke committed
59
60
            self._information_store = _InformationStore(
                self.max_history_length, x0=x, gradient=gradient)
61
62
63
64

        b = self._information_store.b
        delta = self._information_store.delta

65
        descent_direction = delta[0] * b[0]
Martin Reinecke's avatar
Martin Reinecke committed
66
        for i in range(1, len(delta)):
67
            descent_direction += delta[i] * b[i]
68

69
        return descent_direction
theos's avatar
theos committed
70
71


Martin Reinecke's avatar
Martin Reinecke committed
72
class _InformationStore(object):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
73
    """Class for storing a list of past updates.
74

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
75
76
    Parameters
    ----------
Martin Reinecke's avatar
Martin Reinecke committed
77
    max_history_length : int
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
78
79
80
81
82
        Maximum number of stored past updates.
    x0 : Field
        Initial position in variable space.
    gradient : Field
        Gradient at position x0.
83

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
84
85
    Attributes
    ----------
Martin Reinecke's avatar
Martin Reinecke committed
86
    max_history_length : int
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
87
88
        Maximum number of stored past updates.
    s : List
Martin Reinecke's avatar
Martin Reinecke committed
89
        Circular buffer of past position differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
90
    y : List
Martin Reinecke's avatar
Martin Reinecke committed
91
        Circular buffer of past gradient differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
92
    last_x : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
93
        Latest position in variable space.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
94
    last_gradient : Field
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
95
        Gradient at latest position.
Martin Reinecke's avatar
Martin Reinecke committed
96
    k : int
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
97
        Number of updates that have taken place
Martin Reinecke's avatar
Martin Reinecke committed
98
    ss : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
99
        2D circular buffer of scalar products between different elements of s.
Martin Reinecke's avatar
Martin Reinecke committed
100
    sy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
101
        2D circular buffer of scalar products between elements of s and y.
Martin Reinecke's avatar
Martin Reinecke committed
102
    yy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
103
        2D circular buffer of scalar products between different elements of y.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
104
    """
105
106
    def __init__(self, max_history_length, x0, gradient):
        self.max_history_length = max_history_length
107
108
        self.s = [None]*max_history_length
        self.y = [None]*max_history_length
109
110
        self.last_x = x0.copy()
        self.last_gradient = gradient.copy()
theos's avatar
theos committed
111
        self.k = 0
112

Martin Reinecke's avatar
Martin Reinecke committed
113
        mmax = max_history_length
Martin Reinecke's avatar
Martin Reinecke committed
114
115
116
        self.ss = np.empty((mmax, mmax), dtype=np.float64)
        self.sy = np.empty((mmax, mmax), dtype=np.float64)
        self.yy = np.empty((mmax, mmax), dtype=np.float64)
117
118
119

    @property
    def history_length(self):
Martin Reinecke's avatar
Martin Reinecke committed
120
        """Returns the number of currently stored updates."""
121
122
123
124
        return min(self.k, self.max_history_length)

    @property
    def b(self):
125
126
        """Combines s, y and gradient to form the new base vectors b.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
127
128
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
129
        List
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
130
131
            List of new basis vectors.
        """
132
133
        result = []
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
134
        mmax = self.max_history_length
135

Martin Reinecke's avatar
Martin Reinecke committed
136
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
137
            result.append(self.s[(self.k-m+i) % mmax])
138

Martin Reinecke's avatar
Martin Reinecke committed
139
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
140
            result.append(self.y[(self.k-m+i) % mmax])
141
142
143
144
145
146
147

        result.append(self.last_gradient)

        return result

    @property
    def b_dot_b(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
148
        """Generates the (2m+1) * (2m+1) scalar matrix.
149

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
150
        The i,j-th element of the matrix is a scalar product between the i-th
151
152
        and j-th base vector.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
153
154
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
155
        numpy.ndarray
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
156
157
            Scalar matrix.
        """
158
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
159
        mmax = self.max_history_length
160
161
162
        k = self.k
        result = np.empty((2*m+1, 2*m+1), dtype=np.float)

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
163
        # update the stores
Martin Reinecke's avatar
Martin Reinecke committed
164
        k1 = (k-1) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
165
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
166
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
167
168
169
            self.ss[kmi, k1] = self.ss[k1, kmi] = self.s[kmi].vdot(self.s[k1])
            self.yy[kmi, k1] = self.yy[k1, kmi] = self.y[kmi].vdot(self.y[k1])
            self.sy[kmi, k1] = self.s[kmi].vdot(self.y[k1])
Martin Reinecke's avatar
Martin Reinecke committed
170
        for j in range(m-1):
Martin Reinecke's avatar
Martin Reinecke committed
171
172
            kmj = (k-m+j) % mmax
            self.sy[k1, kmj] = self.s[k1].vdot(self.y[kmj])
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
173

Martin Reinecke's avatar
Martin Reinecke committed
174
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
175
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
176
            for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
177
                kmj = (k-m+j) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
178
179
180
                result[i, j] = self.ss[kmi, kmj]
                result[i, m+j] = result[m+j, i] = self.sy[kmi, kmj]
                result[m+i, m+j] = self.yy[kmi, kmj]
181

182
            sgrad_i = self.s[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
183
            result[2*m, i] = result[i, 2*m] = sgrad_i
184

Martin Reinecke's avatar
fix    
Martin Reinecke committed
185
            ygrad_i = self.y[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
186
            result[2*m, m+i] = result[m+i, 2*m] = ygrad_i
187

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
188
        result[2*m, 2*m] = self.last_gradient.norm()
189
        return result
theos's avatar
theos committed
190
191

    @property
192
    def delta(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
193
        """Calculates the new scalar coefficients (deltas).
194

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
195
196
        Returns
        -------
Martin Reinecke's avatar
Martin Reinecke committed
197
        List
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
198
199
            List of the new scalar coefficients (deltas).
        """
200
201
202
203
204
205
206
207
        m = self.history_length
        b_dot_b = self.b_dot_b

        delta = np.zeros(2*m+1, dtype=np.float)
        delta[2*m] = -1

        alpha = np.empty(m, dtype=np.float)

Martin Reinecke's avatar
Martin Reinecke committed
208
209
        for j in range(m-1, -1, -1):
            delta_b_b = sum([delta[l] * b_dot_b[l, j] for l in range(2*m+1)])
210
211
212
            alpha[j] = delta_b_b/b_dot_b[j, m+j]
            delta[m+j] -= alpha[j]

Martin Reinecke's avatar
Martin Reinecke committed
213
        for i in range(2*m+1):
214
215
            delta[i] *= b_dot_b[m-1, 2*m-1]/b_dot_b[2*m-1, 2*m-1]

Martin Reinecke's avatar
Martin Reinecke committed
216
        for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
217
            delta_b_b = sum([delta[l]*b_dot_b[m+j, l] for l in range(2*m+1)])
218
219
220
221
222
            beta = delta_b_b/b_dot_b[j, m+j]
            delta[j] += (alpha[j] - beta)

        return delta

theos's avatar
theos committed
223
    def add_new_point(self, x, gradient):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
224
225
        """Updates the s list and y list.

Martin Reinecke's avatar
Martin Reinecke committed
226
227
        Calculates the new position and gradient differences and enters them
        into the respective list.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
228
        """
Martin Reinecke's avatar
Martin Reinecke committed
229
230
231
        mmax = self.max_history_length
        self.s[self.k % mmax] = x - self.last_x
        self.y[self.k % mmax] = gradient - self.last_gradient
theos's avatar
theos committed
232

233
234
        self.last_x = x.copy()
        self.last_gradient = gradient.copy()
theos's avatar
theos committed
235

Martin Reinecke's avatar
fixes    
Martin Reinecke committed
236
        self.k += 1