kl_energies.py 20.2 KB
Newer Older
Philipp Frank's avatar
Philipp Frank committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Copyright(C) 2013-2020 Max-Planck-Society
# Authors: Philipp Frank
#
# NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik.

import numpy as np
from functools import reduce

from .. import random
from .. import utilities
from ..linearization import Linearization
from ..multi_field import MultiField
from ..operators.inversion_enabler import InversionEnabler
from ..operators.endomorphic_operator import EndomorphicOperator
from ..operators.energy_operators import StandardHamiltonian, GaussianEnergy
from ..operators.sandwich_operator import SandwichOperator
from ..operators.sampling_enabler import SamplingDtypeSetter
from ..operators.scaling_operator import ScalingOperator
from .energy_adapter import EnergyAdapter
from .quadratic_energy import QuadraticEnergy
from ..probing import approximation2endo
from ..sugar import makeOp
from ..operators.adder import Adder
from ..utilities import myassert
from .energy import Energy
from .descent_minimizers import DescentMinimizer, ConjugateGradient

def _is_prior_dtype_float(H):
    """Check whether the prior of `H` samples real floating-point numbers.

    Reads the sampling dtype stored in `H._prior._met._dtype`, which is
    either a single dtype or a dict mapping keys to dtypes.

    Parameters
    ----------
    H : object
        Object exposing `_prior._met._dtype` (in this module a
        `StandardHamiltonian`).

    Returns
    -------
    bool
        True if the dtype (or every dtype of the dict) is a sub-dtype of
        `numpy.floating`. An empty dict yields True.
    """
    # `np.float` was only an alias of the builtin `float`; it was deprecated
    # in NumPy 1.20 and removed in 1.24. `np.floating` is the abstract base
    # of all real floating-point dtypes and expresses the intended check.
    dts = H._prior._met._dtype
    if isinstance(dts, dict):
        return all(np.issubdtype(dt, np.floating) for dt in dts.values())
    return bool(np.issubdtype(dts, np.floating))


def _get_lo_hi(comm, n_samples):
    """Return the sample index range handled by the calling task.

    The task count and the rank of the caller are obtained from `comm` via
    `utilities.get_MPI_params_from_comm`; `utilities.shareRange` then yields
    the `(lo, hi)` pair for this rank, which callers use as
    `range(lo, hi)`.
    """
    n_tasks, my_rank, _unused = utilities.get_MPI_params_from_comm(comm)
    share = utilities.shareRange(n_samples, n_tasks, my_rank)
    return share


def _modify_sample_domain(sample, domain):
    """Restrict or extend `sample` so that it lives on `domain`.

    For a `Field` on a `DomainTuple` the sample must already live on exactly
    that domain and is returned unchanged. For a `MultiField` on a
    `MultiDomain`, only the keys of `sample` that are also in `domain` are
    kept, and zeros are inserted for keys of `domain` that are missing in
    `sample.domain`.

    Raises
    ------
    TypeError
        If a `Field` lives on a different domain than `domain`, or if the
        types of `sample` and `domain` do not match one of the two supported
        combinations.
    """
    from ..multi_domain import MultiDomain
    from ..field import Field
    from ..domain_tuple import DomainTuple
    from ..sugar import makeDomain

    domain = makeDomain(domain)

    if isinstance(domain, DomainTuple) and isinstance(sample, Field):
        if sample.domain is domain:
            return sample
        raise TypeError

    if isinstance(domain, MultiDomain) and isinstance(sample, MultiField):
        if sample.domain is domain:
            return sample
        wanted = domain.keys()
        kept = {key: val for key, val in sample.items() if key in wanted}
        return MultiField.from_dict(kept, domain)

    raise TypeError


def _reduce_by_keys(mean, hamiltonian, keys):
    """Fix the entries `keys` of `mean` inside `hamiltonian`.

    If `mean` is a `MultiField`, the Hamiltonian is simplified for the
    constant input `mean.extract_by_keys(keys)` and the returned mean only
    contains the remaining keys. Any other `mean` is passed through
    unchanged together with the original Hamiltonian.

    Returns
    -------
    tuple
        `(new_mean, new_hamiltonian)`.
    """
    if not isinstance(mean, MultiField):
        return mean, hamiltonian
    frozen_part = mean.extract_by_keys(keys)
    _, reduced_ham = hamiltonian.simplify_for_constant_input(frozen_part)
    free_keys = set(mean.keys()) - set(keys)
    reduced_mean = mean.extract_by_keys(free_keys)
    return reduced_mean, reduced_ham


class _KLMetric(EndomorphicOperator):
    """Operator wrapper around the metric of a sampled KL energy.

    The operator lives on the domain of `KL.position` and forwards every
    application to `KL.apply_metric`. Both `TIMES` and `ADJOINT_TIMES` are
    advertised and handled by the same call.
    """
    def __init__(self, KL):
        self._domain = KL.position.domain
        self._capability = self.TIMES | self.ADJOINT_TIMES
        self._KL = KL

    def apply(self, x, mode):
        self._check_input(x, mode)
        applied = self._KL.apply_metric(x)
        return applied


class _SampledKLEnergy(Energy):
    """Sample-averaged Kullback-Leibler energy.

    Base class for energies representing a sampled Kullback-Leibler
    divergence for the variational approximation of a distribution with
    another distribution.

    The Hamiltonian is evaluated at `mean + s` for every locally stored
    sample `s` (and additionally at `mean - s` if `mirror_samples` is set).
    Values and gradients are summed over all tasks of `comm` and divided by
    the effective number of samples. Both are computed once, in the
    constructor.
    """
    def __init__(self, mean, hamiltonian, n_samples, mirror_samples, comm,
                 local_samples, nanisinf):
        super().__init__(mean)
        myassert(mean.domain is hamiltonian.domain)
        self._hamiltonian = hamiltonian
        self._n_samples = int(n_samples)
        self._mirror_samples = bool(mirror_samples)
        self._comm = comm
        self._local_samples = local_samples
        self._nanisinf = bool(nanisinf)

        expansion = Linearization.make_var(mean)
        values, gradients = [], []
        for sample in self._local_samples:
            sample = _modify_sample_domain(sample, mean.domain)
            res = hamiltonian(expansion + sample)
            val = res.val.val
            grad = res.gradient
            if self._mirror_samples:
                # Mirrored sample: same residual with opposite sign.
                res = hamiltonian(expansion - sample)
                val = val + res.val.val
                grad = grad + res.gradient
            values.append(val)
            gradients.append(grad)

        # Reduce the value first, then the gradient, across all tasks.
        summed_val = utilities.allreduce_sum(values, self._comm)[()]
        self._val = summed_val/self.n_eff_samples
        if np.isnan(self._val) and self._nanisinf:
            self._val = np.inf
        summed_grad = utilities.allreduce_sum(gradients, self._comm)
        self._grad = summed_grad/self.n_eff_samples

    @property
    def value(self):
        return self._val

    @property
    def gradient(self):
        return self._grad

    def at(self, position):
        """Return a new energy at `position` that re-uses the same samples."""
        return _SampledKLEnergy(position,
                                self._hamiltonian,
                                self._n_samples,
                                self._mirror_samples,
                                self._comm,
                                self._local_samples,
                                self._nanisinf)

    def apply_metric(self, x):
        """Apply the sample-averaged metric at the current position to `x`."""
        expansion = Linearization.make_var(self.position, want_metric=True)
        contributions = []
        for sample in self._local_samples:
            sample = _modify_sample_domain(sample, self._hamiltonian.domain)
            part = self._hamiltonian(expansion + sample).metric(x)
            if self._mirror_samples:
                part = part + self._hamiltonian(expansion - sample).metric(x)
            contributions.append(part)
        total = utilities.allreduce_sum(contributions, self._comm)
        return total/self.n_eff_samples

    @property
    def n_eff_samples(self):
        """Number of samples entering the average (doubled when mirroring)."""
        if not self._mirror_samples:
            return self._n_samples
        return 2*self._n_samples

    @property
    def metric(self):
        return _KLMetric(self)

    @property
    def samples(self):
        """Generator over all samples, including mirrored ones.

        With a single task the local samples are yielded directly. With
        several tasks each sample is broadcast from the task that owns it,
        so every task yields the complete sequence.
        """
        ntask, rank, _ = utilities.get_MPI_params_from_comm(self._comm)
        if ntask == 1:
            for sample in self._local_samples:
                yield sample
                if self._mirror_samples:
                    yield -sample
            return

        ranges = [utilities.shareRange(self._n_samples, ntask, task)
                  for task in range(ntask)]
        my_lo, _ = _get_lo_hi(self._comm, self._n_samples)
        for owner, (first, last) in enumerate(ranges):
            for idx in range(first, last):
                # Only the owning task holds the data; others receive it.
                payload = self._local_samples[idx-my_lo] if rank == owner else None
                sample = self._comm.bcast(payload, root=owner)
                yield sample
                if self._mirror_samples:
                    yield -sample


class _GeoMetricSampler:
    """Draws the samples used by `GeoMetricKL`.

    Each sample is produced in two stages: `_draw_lin` draws random numbers
    from the prior and the linearized likelihood metric (optionally solving
    the linear system with conjugate gradient), and `_draw_nonlin` refines
    the result by minimizing a `GaussianEnergy` composed with the operator
    `self._g` using the user-supplied minimizer.
    """
    def __init__(self, position, H, minimizer, start_from_lin, 
                 n_samples, mirror_samples, napprox=0, want_error=False):
        """Set up the operators and seeds needed for sampling.

        Parameters
        ----------
        position : Field or MultiField
            Expansion point. A MultiField is restricted to `H.domain`.
        H : StandardHamiltonian
            Hamiltonian whose likelihood must provide a transformation via
            `H._lh.get_transformation()`.
        minimizer : object
            Callable applied to an `EnergyAdapter` in `_draw_nonlin`; it must
            return a pair whose first entry is the minimized energy.
        start_from_lin : bool
            If true, the nonlinear minimization starts from the solution of
            the linear problem, otherwise from the prior sample.
        n_samples : int
            Number of (unmirrored) samples.
        mirror_samples : bool
            Whether each sample gets a mirrored partner.
        napprox : int
            If >= 1, an approximation of the metric built from `napprox`
            probes is used as preconditioner in `_draw_lin`.
        want_error : bool
            If true, `_draw_nonlin` additionally returns an error estimate.

        Raises
        ------
        NotImplementedError
            If `H` is not a `StandardHamiltonian`.
        ValueError
            If the prior dtype is not real, or if the likelihood provides no
            transformation.
        """
        if not isinstance(H, StandardHamiltonian):
            raise NotImplementedError
        if not _is_prior_dtype_float(H):
            raise ValueError("_GeoMetricSampler only supports real valued latent DOFs.")
        if isinstance(position, MultiField):
            self._position = position.extract(H.domain)
        else:
            self._position = position
        # The transformation is either None or a pair (dtype, operator).
        tr = H._lh.get_transformation()
        if tr is None:
            raise ValueError("_GeoMetricSampler only works for likelihoods")
        dtype, f_lh = tr
        scale = ScalingOperator(f_lh.target, 1.)
        # Sampling from `scale` is only enabled if a dtype is given (for a
        # dict of dtypes: if none of the entries is None).
        if isinstance(dtype, dict):
            sampling = reduce((lambda a,b: a*b), 
                              [dtype[k] is not None for k in dtype.keys()])
        else:
            sampling = dtype is not None
        scale = SamplingDtypeSetter(scale, dtype) if sampling else scale
        
        # Linearize the transformation at the expansion point.
        fl = f_lh(Linearization.make_var(self._position))
        # Operator on absolute positions; presumably
        # x -> (x - position) + J^dagger (f_lh(x) - f_lh(position)),
        # assuming `Adder(a)` adds the constant `a` -- TODO confirm.
        self._g = (Adder(-self._position) + 
                   fl.jac.adjoint@Adder(-fl.val)@f_lh)
        # Likelihood part of the metric, built from the Jacobian and `scale`.
        self._likelihood = SandwichOperator.make(fl.jac, scale)
        # Prior part of the metric: unit scaling with float64 sampling dtype.
        self._prior = SamplingDtypeSetter(ScalingOperator(fl.domain,1.), np.float64)
        self._met = self._likelihood + self._prior
        if napprox >=1:
            self._approximation = makeOp(approximation2endo(self._met, napprox)).inverse
        else:
            self._approximation = None
        self._ic = H._ic_samp
        self._minimizer = minimizer
        self._start_from_lin = start_from_lin
        self._want_error = want_error

        # One seed per sample; when mirroring, every seed appears twice in a
        # row so that a sample and its mirror image share the same seed.
        sseq = random.spawn_sseq(n_samples)
        if mirror_samples:
            mysseq = []
            for seq in sseq:
                mysseq += [seq, seq]
        else:
            mysseq = sseq
        self._sseq = mysseq
        # Sign flag per effective sample: (False, True, False, True, ...)
        # when mirroring, all False otherwise.
        self._neg = (False, True)*n_samples if mirror_samples else (False, )*n_samples
        self._n_samples = n_samples
        self._mirror_samples = mirror_samples

    @property
    def n_eff_samples(self):
        """Total number of samples, i.e. `2*n_samples` when mirroring."""
        return 2*self._n_samples if self._mirror_samples else self._n_samples

    @property
    def position(self):
        """Expansion point (restricted to the Hamiltonian's domain)."""
        return self._position

    def _draw_lin(self, neg):
        """Draw the random data `y` and a starting point `yi`.

        `s` is drawn from the prior (with `from_inverse=True`) and `nj` from
        the likelihood metric; both are negated if `neg` is true. The data
        is `y = prior(s) + nj`. If `start_from_lin` is set, `yi` is the
        position returned by conjugate gradient for the quadratic energy
        defined by `self._met` and `y`, started at `s`; otherwise `yi = s`.

        Returns
        -------
        tuple
            `(y, yi)`.
        """
        # The order of the two draws determines the random numbers consumed.
        s = self._prior.draw_sample(from_inverse=True)
        s = -s if neg else s
        nj = self._likelihood.draw_sample()
        nj = -nj if neg else nj
        y = self._prior(s) + nj
        if self._start_from_lin:
            # Gradient at `s`: met(s) - y = likelihood(s) + prior(s) - y
            #                             = likelihood(s) - nj.
            energy = QuadraticEnergy(s, self._met, y,
                                     _grad=self._likelihood(s) - nj)
            inverter = ConjugateGradient(self._ic)
            energy, convergence = inverter(energy,
                                           preconditioner=self._approximation)
            yi = energy.position
        else:
            yi = s
        return y, yi

    def _draw_nonlin(self, y, yi):
        """Refine a sample by minimizing `GaussianEnergy(mean=y) @ self._g`.

        The minimization starts at `position + yi`. The returned sample is
        the offset of the minimizer's final position from the expansion
        point. If `want_error` was set, a pair `(sample, error)` is returned
        instead.
        """
        en = EnergyAdapter(self._position+yi, GaussianEnergy(mean=y)@self._g,
                           nanisinf=True, want_metric=True)
        en, _ = self._minimizer(en)
        sam = en.position - self._position
        if self._want_error:
            # NOTE(review): `self._g` is built on absolute positions (it
            # contains `Adder(-self._position)`), but is evaluated here at
            # the offset `sam` rather than at `en.position` -- confirm that
            # this is intended.
            er = y - self._g(sam)
            er = er.s_vdot(InversionEnabler(self._met, self._ic).inverse(er))
            return sam, er
        return sam

    def draw_samples(self, comm):
        """Draw the samples assigned to the calling task.

        The indices handled locally are `range(*_get_lo_hi(comm,
        n_eff_samples))`. Every draw happens inside
        `random.Context(self._sseq[i])`.

        Returns
        -------
        tuple
            The locally drawn samples (the return values of
            `_draw_nonlin`).
        """
        local_samples = []
        # Negated (y, yi) of the last unmirrored draw on this task, kept so
        # that its mirrored partner does not have to repeat `_draw_lin`.
        prev = None
        for i in range(*_get_lo_hi(comm, self.n_eff_samples)):
            with random.Context(self._sseq[i]):
                neg = self._neg[i]
                if (prev is None) or not self._mirror_samples:
                    # No cached partner on this task: draw from scratch. A
                    # mirrored index re-uses its partner's seed with
                    # `neg=True`.
                    y, yi = self._draw_lin(neg)
                    if not neg:
                        prev = (-y, -yi)
                else:
                    (y, yi) = prev
                    prev = None
                local_samples.append(self._draw_nonlin(y, yi))
        return tuple(local_samples)


def MetricGaussianKL(mean, hamiltonian, n_samples, mirror_samples, constants=[],
                     point_estimates=[], napprox=0, comm=None, nanisinf=False):
    """Provides the sampled Kullback-Leibler divergence between a distribution
    and a Metric Gaussian.

    A Metric Gaussian is used to approximate another probability distribution.
    It is a Gaussian distribution that uses the Fisher information metric of
    the other distribution at the location of its mean to approximate the
    variance. In order to infer the mean, a stochastic estimate of the
    Kullback-Leibler divergence is minimized. This estimate is obtained by
    sampling the Metric Gaussian at the current mean. During minimization
    these samples are kept constant; only the mean is updated. Due to the
    typically nonlinear structure of the true distribution these samples have
    to be updated eventually by instantiating `MetricGaussianKL` again. For
    the true probability distribution the standard parametrization is assumed.
    The samples of this class can be distributed among MPI tasks.

    Parameters
    ----------
    mean : Field
        Mean of the Gaussian probability distribution.
    hamiltonian : StandardHamiltonian
        Hamiltonian of the approximated probability distribution.
    n_samples : integer
        Number of samples used to stochastically estimate the KL.
    mirror_samples : boolean
        Whether the negative of the drawn samples are also used, as they are
        equally legitimate samples. If true, the number of used samples
        doubles. Mirroring samples stabilizes the KL estimate as extreme
        sample variation is counterbalanced. Since it improves stability in
        many cases, it is recommended to set `mirror_samples` to `True`.
    constants : list
        List of parameter keys that are kept constant during optimization.
        Default is no constants.
    point_estimates : list
        List of parameter keys for which no samples are drawn, but that are
        (possibly) optimized for, corresponding to point estimates of these.
        Default is to draw samples for the complete domain.
    napprox : int
        Number of samples for computing preconditioner for sampling. No
        preconditioning is done by default.
    comm : MPI communicator or None
        If not None, samples will be distributed as evenly as possible
        across this communicator. If `mirror_samples` is set, then a sample and
        its mirror image will always reside on the same task.
    nanisinf : bool
        If true, nan energies which can happen due to overflows in the forward
        model are interpreted as inf. Thereby, the code does not crash on
        these occasions but rather the minimizer is told that the position it
        has tried is not sensible.

    Raises
    ------
    TypeError
        If `hamiltonian` is not a `StandardHamiltonian`, `n_samples` is not
        an int or `mirror_samples` is not a bool.
    ValueError
        If `hamiltonian.domain` is not `mean.domain`.
    RuntimeError
        If `point_estimates` covers all keys of a MultiField `mean`.

    Notes
    -----
    The two lists `constants` and `point_estimates` are independent from each
    other. It is possible to sample along domains which are kept constant
    during minimization and vice versa.

    See also
    --------
    `Metric Gaussian Variational Inference`, Jakob Knollmüller,
    Torsten A. Enßlin, `<https://arxiv.org/abs/1901.11033>`_
    """
    if not isinstance(hamiltonian, StandardHamiltonian):
        raise TypeError
    if hamiltonian.domain is not mean.domain:
        raise ValueError
    if not (isinstance(n_samples, int) and isinstance(mirror_samples, bool)):
        raise TypeError
    if isinstance(mean, MultiField) and set(point_estimates) == set(mean.keys()):
        raise RuntimeError(
            'Point estimates for whole domain. Use EnergyAdapter instead.')
    n_samples = int(n_samples)
    mirror_samples = bool(mirror_samples)

    # Samples are only drawn along the directions that are not point
    # estimates, so the metric is taken from the reduced Hamiltonian.
    sampling_ham = _reduce_by_keys(mean, hamiltonian, point_estimates)[1]
    expansion = Linearization.make_var(mean.extract(sampling_ham.domain), True)
    metric = sampling_ham(expansion).metric
    if napprox >= 1:
        metric._approximation = makeOp(approximation2endo(metric, napprox))

    # Every sample gets its own seed; each task draws only its share.
    seeds = random.spawn_sseq(n_samples)
    lo, hi = _get_lo_hi(comm, n_samples)
    drawn = []
    for idx in range(lo, hi):
        with random.Context(seeds[idx]):
            drawn.append(metric.draw_sample(from_inverse=True))

    mean, hamiltonian = _reduce_by_keys(mean, hamiltonian, constants)
    return _SampledKLEnergy(mean, hamiltonian, n_samples, mirror_samples, comm,
                            tuple(drawn), nanisinf)


def GeoMetricKL(mean, hamiltonian, n_samples, minimizer_samp, mirror_samples, 
                start_from_lin = True, constants=[], point_estimates=[],
                napprox=0, comm=None, nanisinf=True):
    """Provides the sampled Kullback-Leibler used in geometric Variational
    Inference (geoVI).

    In geoVI a probability distribution is approximated with a standard normal
    distribution in the canonical coordinate system of the Riemannian manifold
    associated with the metric of the other distribution. The coordinate
    transformation is approximated by expanding around a point. In order to
    infer the expansion point, a stochastic estimate of the Kullback-Leibler
    divergence is minimized. This estimate is obtained by sampling from the
    approximation using the current expansion point. During minimization these
    samples are kept constant; only the expansion point is updated. Due to the
    typically nonlinear structure of the true distribution these samples have
    to be updated eventually by instantiating `GeoMetricKL` again. For the true
    probability distribution the standard parametrization is assumed.
    The samples of this class can be distributed among MPI tasks.

    Parameters
    ----------
    mean : Field
        Expansion point of the coordinate transformation.
    hamiltonian : StandardHamiltonian
        Hamiltonian of the approximated probability distribution.
    n_samples : integer
        Number of samples used to stochastically estimate the KL.
    minimizer_samp : DescentMinimizer
        Minimizer used to draw samples.
    mirror_samples : boolean
        Whether the mirrored version of the drawn samples are also used.
        If true, the number of used samples doubles.
        Mirroring samples stabilizes the KL estimate as extreme
        sample variation is counterbalanced.
    start_from_lin : boolean
        Whether the non-linear sampling should start using the inverse
        linearized transformation (i.e. the corresponding MGVI sample).
        If False, the minimization starts from the prior sample.
        Default is True.
    constants : list
        List of parameter keys that are kept constant during optimization.
        Default is no constants.
    point_estimates : list
        List of parameter keys for which no samples are drawn, but that are
        (possibly) optimized for, corresponding to point estimates of these.
        Default is to draw samples for the complete domain.
    napprox : int
        Number of samples for computing preconditioner for linear sampling.
        No preconditioning is done by default.
    comm : MPI communicator or None
        If not None, samples will be distributed as evenly as possible
        across this communicator. If `mirror_samples` is set, then a sample
        and its mirror image will preferably reside on the same task.
    nanisinf : bool
        If true, nan energies which can happen due to overflows in the forward
        model are interpreted as inf. Thereby, the code does not crash on
        these occasions but rather the minimizer is told that the position it
        has tried is not sensible.

    Raises
    ------
    TypeError
        If `hamiltonian` is not a `StandardHamiltonian`, `n_samples` is not
        an int, `mirror_samples` is not a bool or `minimizer_samp` is not a
        `DescentMinimizer`.
    ValueError
        If `hamiltonian.domain` is not `mean.domain`.
    RuntimeError
        If `point_estimates` covers all keys of a MultiField `mean`.

    Notes
    -----
    The two lists `constants` and `point_estimates` are independent from each
    other. It is possible to sample along domains which are kept constant
    during minimization and vice versa.

    Note on MPI and mirror_samples:
    As in MGVI, mirroring samples can help to stabilize the latent mean as it
    reduces sampling noise. But unlike MGVI a mirrored sample involves an
    additional solve of the non-linear transformation. Therefore, when using
    MPI, the mirrored samples also get distributed if enough tasks are available.
    If there are more total samples than tasks, the mirrored counterparts
    try to reside on the same task as their non mirrored partners. This ensures
    that at least the starting position can be re-used.

    See also
    --------
    `Geometric Variational Inference`, Philipp Frank, Reimar Leike,
    Torsten A. Enßlin, `<https://arxiv.org/abs/2105.10470>`_
    """
    if not isinstance(hamiltonian, StandardHamiltonian):
        raise TypeError
    if hamiltonian.domain is not mean.domain:
        raise ValueError
    if not isinstance(n_samples, int):
        raise TypeError
    if not isinstance(mirror_samples, bool):
        raise TypeError
    if not isinstance(minimizer_samp, DescentMinimizer):
        raise TypeError
    if isinstance(mean, MultiField) and set(point_estimates) == set(mean.keys()):
        raise RuntimeError(
            'Point estimates for whole domain. Use EnergyAdapter instead.')
    n_samples = int(n_samples)
    mirror_samples = bool(mirror_samples)

    _, ham_sampling = _reduce_by_keys(mean, hamiltonian, point_estimates)
    # Forward `napprox`: it was previously dropped here, so the documented
    # preconditioner for the linear sampling step was never built.
    sampler = _GeoMetricSampler(mean, ham_sampling, minimizer_samp,
                                start_from_lin, n_samples, mirror_samples,
                                napprox=napprox)
    local_samples = sampler.draw_samples(comm)
    mean, hamiltonian = _reduce_by_keys(mean, hamiltonian, constants)
    # Mirrored samples are already explicit entries of `local_samples`, so
    # the energy gets the effective sample count and must not mirror again.
    return _SampledKLEnergy(mean, hamiltonian, sampler.n_eff_samples, False,
                            comm, local_samples, nanisinf)