vl_bfgs.py 8.13 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
Theo Steininger's avatar
Theo Steininger committed
13 14 15 16 17
#
# Copyright(C) 2013-2017 Max-Planck-Society
#
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik
# and financially supported by the Studienstiftung des deutschen Volkes.
Theo Steininger's avatar
Theo Steininger committed
18

Martin Reinecke's avatar
Martin Reinecke committed
19 20 21
from __future__ import division
from builtins import range
from builtins import object
22 23
import numpy as np

24
from .descent_minimizer import DescentMinimizer
25
from .line_searching import LineSearchStrongWolfe
Theo Steininger's avatar
Theo Steininger committed
26 27


28
class VL_BFGS(DescentMinimizer):
Martin Reinecke's avatar
Martin Reinecke committed
29 30
    def __init__(self, controller, line_searcher=LineSearchStrongWolfe(),
                 max_history_length=5):
31

Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
32 33
        super(VL_BFGS, self).__init__(controller=controller,
                                      line_searcher=line_searcher)
34 35 36

        self.max_history_length = max_history_length

37
    def __call__(self, energy):
38
        self._information_store = None
39
        return super(VL_BFGS, self).__call__(energy)
40

41
    def get_descent_direction(self, energy):
42 43 44 45 46 47 48 49
        """Implementation of the Vector-free L-BFGS minimization scheme.

        Find the descent direction by using the inverse Hessian.
        Instead of storing the whole matrix, it stores only the last few
        updates, which are used to do operations requiring the inverse
        Hessian product. The updates are represented in a new basis to optimize
        the algorithm.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
50 51
        Parameters
        ----------
52 53 54 55
        energy : Energy
            An instance of the Energy class which shall be minized. The
            position of `energy` is used as the starting point of minization.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
56 57
        Returns
        -------
58
        descent_direction : Field
59 60 61 62 63 64 65
            Returns the descent direction.

        References
        ----------
        W. Chen, Z. Wang, J. Zhou, "Large-scale L-BFGS using MapReduce", 2014,
        Microsoft

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
66
        """
67

68 69
        x = energy.position
        gradient = energy.gradient
70 71 72 73 74 75 76 77 78 79 80
        # initialize the information store if it doesn't already exist
        try:
            self._information_store.add_new_point(x, gradient)
        except AttributeError:
            self._information_store = InformationStore(self.max_history_length,
                                                       x0=x,
                                                       gradient=gradient)

        b = self._information_store.b
        delta = self._information_store.delta

81
        descent_direction = delta[0] * b[0]
Martin Reinecke's avatar
Martin Reinecke committed
82
        for i in range(1, len(delta)):
83
            descent_direction += delta[i] * b[i]
84

85
        return descent_direction
Theo Steininger's avatar
Theo Steininger committed
86 87 88


class InformationStore(object):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
89
    """Class for storing a list of past updates.
90

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
91 92 93 94 95 96 97 98
    Parameters
    ----------
    max_history_length : integer
        Maximum number of stored past updates.
    x0 : Field
        Initial position in variable space.
    gradient : Field
        Gradient at position x0.
99

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
100 101 102 103 104
    Attributes
    ----------
    max_history_length : integer
        Maximum number of stored past updates.
    s : List
Martin Reinecke's avatar
Martin Reinecke committed
105
        Circular buffer of past position differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
106
    y : List
Martin Reinecke's avatar
Martin Reinecke committed
107
        Circular buffer of past gradient differences, which are Fields.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
108
    last_x : Field
Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
109
        Latest position in variable space.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
110
    last_gradient : Field
Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
111
        Gradient at latest position.
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
112
    k : integer
Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
113
        Number of updates that have taken place
Martin Reinecke's avatar
Martin Reinecke committed
114
    ss : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
115
        2D circular buffer of scalar products between different elements of s.
Martin Reinecke's avatar
Martin Reinecke committed
116
    sy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
117
        2D circular buffer of scalar products between elements of s and y.
Martin Reinecke's avatar
Martin Reinecke committed
118
    yy : numpy.ndarray
Martin Reinecke's avatar
Martin Reinecke committed
119
        2D circular buffer of scalar products between different elements of y.
120

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
121
    """
122 123
    def __init__(self, max_history_length, x0, gradient):
        self.max_history_length = max_history_length
124 125
        self.s = [None]*max_history_length
        self.y = [None]*max_history_length
126 127
        self.last_x = x0.copy()
        self.last_gradient = gradient.copy()
Theo Steininger's avatar
Theo Steininger committed
128
        self.k = 0
129

Martin Reinecke's avatar
Martin Reinecke committed
130
        mmax = max_history_length
Martin Reinecke's avatar
Martin Reinecke committed
131 132 133
        self.ss = np.empty((mmax, mmax), dtype=np.float64)
        self.sy = np.empty((mmax, mmax), dtype=np.float64)
        self.yy = np.empty((mmax, mmax), dtype=np.float64)
134 135 136

    @property
    def history_length(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
137
        """Returns the number of currently stored updates.
138

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
139
        """
140 141 142 143
        return min(self.k, self.max_history_length)

    @property
    def b(self):
144 145
        """Combines s, y and gradient to form the new base vectors b.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
146 147 148 149
        Returns
        -------
        result : List
            List of new basis vectors.
150

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
151
        """
152 153
        result = []
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
154
        mmax = self.max_history_length
155

Martin Reinecke's avatar
Martin Reinecke committed
156
        k = self.k
157
        s = self.s
Martin Reinecke's avatar
Martin Reinecke committed
158

Martin Reinecke's avatar
Martin Reinecke committed
159
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
160
            result.append(s[(k-m+i) % mmax])
161 162

        y = self.y
Martin Reinecke's avatar
Martin Reinecke committed
163
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
164
            result.append(y[(k-m+i) % mmax])
165 166 167 168 169 170 171

        result.append(self.last_gradient)

        return result

    @property
    def b_dot_b(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
172
        """Generates the (2m+1) * (2m+1) scalar matrix.
173

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
174
        The i,j-th element of the matrix is a scalar product between the i-th
175 176
        and j-th base vector.

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
177 178
        Returns
        -------
179
        result : numpy.ndarray
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
180
            Scalar matrix.
181

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
182
        """
183
        m = self.history_length
Martin Reinecke's avatar
Martin Reinecke committed
184
        mmax = self.max_history_length
185 186 187
        k = self.k
        result = np.empty((2*m+1, 2*m+1), dtype=np.float)

Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
188
        # update the stores
Martin Reinecke's avatar
Martin Reinecke committed
189
        k1 = (k-1) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
190
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
191
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
192 193 194
            self.ss[kmi, k1] = self.ss[k1, kmi] = self.s[kmi].vdot(self.s[k1])
            self.yy[kmi, k1] = self.yy[k1, kmi] = self.y[kmi].vdot(self.y[k1])
            self.sy[kmi, k1] = self.s[kmi].vdot(self.y[k1])
Martin Reinecke's avatar
Martin Reinecke committed
195
        for j in range(m-1):
Martin Reinecke's avatar
Martin Reinecke committed
196 197
            kmj = (k-m+j) % mmax
            self.sy[k1, kmj] = self.s[k1].vdot(self.y[kmj])
Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
198

Martin Reinecke's avatar
Martin Reinecke committed
199
        for i in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
200
            kmi = (k-m+i) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
201
            for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
202
                kmj = (k-m+j) % mmax
Martin Reinecke's avatar
Martin Reinecke committed
203 204 205
                result[i, j] = self.ss[kmi, kmj]
                result[i, m+j] = result[m+j, i] = self.sy[kmi, kmj]
                result[m+i, m+j] = self.yy[kmi, kmj]
206

207
            sgrad_i = self.s[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
208
            result[2*m, i] = result[i, 2*m] = sgrad_i
209

Martin Reinecke's avatar
fix  
Martin Reinecke committed
210
            ygrad_i = self.y[kmi].vdot(self.last_gradient)
Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
211
            result[2*m, m+i] = result[m+i, 2*m] = ygrad_i
212

Martin Reinecke's avatar
tweaks  
Martin Reinecke committed
213
        result[2*m, 2*m] = self.last_gradient.norm()
214 215

        return result
Theo Steininger's avatar
Theo Steininger committed
216 217

    @property
218
    def delta(self):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
219
        """Calculates the new scalar coefficients (deltas).
220

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
221 222 223 224
        Returns
        -------
        delta : List
            List of the new scalar coefficients (deltas).
225

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
226
        """
227 228 229 230 231 232 233 234
        m = self.history_length
        b_dot_b = self.b_dot_b

        delta = np.zeros(2*m+1, dtype=np.float)
        delta[2*m] = -1

        alpha = np.empty(m, dtype=np.float)

Martin Reinecke's avatar
Martin Reinecke committed
235 236
        for j in range(m-1, -1, -1):
            delta_b_b = sum([delta[l] * b_dot_b[l, j] for l in range(2*m+1)])
237 238 239
            alpha[j] = delta_b_b/b_dot_b[j, m+j]
            delta[m+j] -= alpha[j]

Martin Reinecke's avatar
Martin Reinecke committed
240
        for i in range(2*m+1):
241 242
            delta[i] *= b_dot_b[m-1, 2*m-1]/b_dot_b[2*m-1, 2*m-1]

Martin Reinecke's avatar
Martin Reinecke committed
243
        for j in range(m):
Martin Reinecke's avatar
Martin Reinecke committed
244
            delta_b_b = sum([delta[l]*b_dot_b[m+j, l] for l in range(2*m+1)])
245 246 247 248 249
            beta = delta_b_b/b_dot_b[j, m+j]
            delta[j] += (alpha[j] - beta)

        return delta

Theo Steininger's avatar
Theo Steininger committed
250
    def add_new_point(self, x, gradient):
Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
251 252
        """Updates the s list and y list.

Martin Reinecke's avatar
Martin Reinecke committed
253 254
        Calculates the new position and gradient differences and enters them
        into the respective list.
255

Matevz, Sraml (sraml)'s avatar
Matevz, Sraml (sraml) committed
256
        """
Martin Reinecke's avatar
Martin Reinecke committed
257 258 259
        mmax = self.max_history_length
        self.s[self.k % mmax] = x - self.last_x
        self.y[self.k % mmax] = gradient - self.last_gradient
Theo Steininger's avatar
Theo Steininger committed
260

261 262
        self.last_x = x.copy()
        self.last_gradient = gradient.copy()
Theo Steininger's avatar
Theo Steininger committed
263

Martin Reinecke's avatar
fixes  
Martin Reinecke committed
264
        self.k += 1