From 1520a2fa8a89bbd9686ecaa5beec3d1b382fdd0b Mon Sep 17 00:00:00 2001
From: Philipp Arras <parras@mpa-garching.mpg.de>
Date: Mon, 31 May 2021 12:14:38 +0200
Subject: [PATCH] Add docs and fix typos in docs

---
 docs/source/code.rst              |  9 +++--
 docs/source/ift.rst               | 35 ++++++++++--------
 src/minimization/kl_energies.py   | 59 ++++++++++++++++++++++---------
 src/operators/energy_operators.py | 23 ++++++------
 src/operators/operator.py         | 25 ++++++++-----
 5 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/docs/source/code.rst b/docs/source/code.rst
index 48f81534c..2d1695102 100644
--- a/docs/source/code.rst
+++ b/docs/source/code.rst
@@ -369,11 +369,10 @@ tackling new IFT problems. An example of concrete energy classes delivered with
 NIFTy7 is :class:`~minimization.quadratic_energy.QuadraticEnergy` (with
 position-independent metric, mainly used with conjugate gradient minimization).
 
-For MGVI and GeoVI, NIFTy provides the constructors
-:func:`~minimization.kl_energies.MetricGaussianKL` and
-:func:`~minimization.kl_energies.GeoMetricKL`, respectively, which instantiate an
-object containing the sampled estimate of the KL divergence, its gradient and the
-Fisher metric. Thes constructors require an instance
+For MGVI and GeoVI, NIFTy provides :func:`~minimization.kl_energies.MetricGaussianKL`
+and :func:`~minimization.kl_energies.GeoMetricKL` that instantiate objects
+containing the sampled estimate of the KL divergence, its gradient and the
+Fisher metric. These constructors require an instance
 of :class:`~operators.energy_operators.StandardHamiltonian`, an operator to
 compute the negative log-likelihood of the problem in standardized coordinates
 at a given position in parameter space.
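+
+A minimal sketch of an MGVI iteration built from these constructors (assuming
+a likelihood operator ``lh`` and a descent minimizer ``minimizer`` are already
+set up; the names are illustrative):
+
+.. code-block:: python
+
+    import nifty7 as ift
+
+    # Hamiltonian in standardized coordinates; `ic_samp` controls the
+    # conjugate-gradient runs used for drawing metric samples.
+    ic_samp = ift.GradientNormController(iteration_limit=100)
+    ham = ift.StandardHamiltonian(lh, ic_samp=ic_samp)
+    mean = ift.from_random(ham.domain)  # initial latent position
+    # Sampled estimate of the KL divergence around `mean`
+    kl = ift.MetricGaussianKL(mean, ham, n_samples=5, mirror_samples=True)
+    kl, _ = minimizer(kl)  # minimize the sampled KL
+    mean = kl.position     # updated latent mean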
diff --git a/docs/source/ift.rst b/docs/source/ift.rst
index eb533cac1..150808288 100644
--- a/docs/source/ift.rst
+++ b/docs/source/ift.rst
@@ -19,17 +19,16 @@ Also, there were certainly previous works in a similar spirit.
 Anyhow, in many cases IFT provides novel rigorous ways to extract information from data.
 NIFTy comes with reimplemented MAP and VI estimators.
 
-.. tip:: *In-a-nutshell introductions to information field theory* can be found in [2]_, [3]_, [4]_, and [5]_, with the latter probably being the most didactical.
+.. tip:: A didactic introduction to *information field theory* can be found in [2]_. Other resources include the `Wikipedia entry <https://en.wikipedia.org/wiki/Information_field_theory>`_, [3]_, and [4]_.
 
 .. [1] T.A. Enßlin et al. (2009), "Information field theory for cosmological perturbation reconstruction and nonlinear signal analysis", PhysRevD.80.105005, 09/2009; `[arXiv:0806.3474] <https://www.arxiv.org/abs/0806.3474>`_
 
-.. [2] T.A. Enßlin (2013), "Information field theory", proceedings of MaxEnt 2012 -- the 32nd International Workshop on Bayesian Inference and Maximum Entropy Methods in Science and Engineering; AIP Conference Proceedings, Volume 1553, Issue 1, p.184; `[arXiv:1301.2556] <https://arxiv.org/abs/1301.2556>`_
+.. [2] T.A. Enßlin (2019), "Information theory for fields", Annalen der Physik; `[DOI] <https://doi.org/10.1002/andp.201800127>`_, `[arXiv:1804.03350] <https://arxiv.org/abs/1804.03350>`_
 
-.. [3] T.A. Enßlin (2014), "Astrophysical data analysis with information field theory", AIP Conference Proceedings, Volume 1636, Issue 1, p.49; `[arXiv:1405.7701] <https://arxiv.org/abs/1405.7701>`_
+.. [3] T.A. Enßlin (2013), "Information field theory", proceedings of MaxEnt 2012 -- the 32nd International Workshop on Bayesian Inference and Maximum Entropy Methods in Science and Engineering; AIP Conference Proceedings, Volume 1553, Issue 1, p.184; `[arXiv:1301.2556] <https://arxiv.org/abs/1301.2556>`_
 
-.. [4] Wikipedia contributors (2018), `"Information field theory" <https://en.wikipedia.org/w/index.php?title=Information_field_theory&oldid=876731720>`_, Wikipedia, The Free Encyclopedia.
+.. [4] T.A. Enßlin (2014), "Astrophysical data analysis with information field theory", AIP Conference Proceedings, Volume 1636, Issue 1, p.49; `[arXiv:1405.7701] <https://arxiv.org/abs/1405.7701>`_
 
-.. [5] T.A. Enßlin (2019), "Information theory for fields", accepted by Annalen der Physik; `[DOI] <https://doi.org/10.1002/andp.201800127>`_, `[arXiv:1804.03350] <https://arxiv.org/abs/1804.03350>`_
 
 
 
@@ -133,8 +132,8 @@ NIFTy takes advantage of this formulation in several ways:
 
 3) The response can be non-linear, e.g. :math:`{R'(s)=R \exp(A\,\xi)}`, see `demos/getting_started_2.py`.
 
-4) The amplitude operator may depend on further parameters, e.g. :math:`A=A(\tau)= F\, \widehat{e^\tau}` represents an amplitude operator with a positive definite, unknown spectrum defined in the Fourier domain.
-   The amplitude field :math:`{\tau}` would get its own amplitude operator, with a cepstrum (spectrum of a log spectrum) defined in quefrency space (harmonic space of a logarithmically binned harmonic space) to regularize its degrees of freedom by imposing some (user-defined degree of) spectral smoothness.
+4) The amplitude operator may depend on further parameters, e.g. :math:`A=A(\tau)=e^{2\tau}` represents an amplitude operator with a positive definite, unknown spectrum.
+   The log-amplitude field :math:`{\tau}` is modelled by an integrated Wiener process in order to impose some (user-defined degree of) spectral smoothness; a minimal sketch of such a model follows after this list.
 
 5) NIFTy calculates the gradient of the information Hamiltonian and the Fisher information metric with respect to all unknown parameters, here :math:`{\xi}` and :math:`{\tau}`, by automatic differentiation.
    The gradients are used for MAP estimates, and the Fisher matrix is required in addition to the gradient by Metric Gaussian Variational Inference (MGVI), which is available in NIFTy as well.
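+
+A minimal sketch of such an amplitude model, using the correlated-field helper
+shipped with NIFTy7 (the parameter values below are purely illustrative):
+
+.. code-block:: python
+
+    import nifty7 as ift
+
+    position_space = ift.RGSpace(128)
+    # Correlated field with an unknown amplitude spectrum; its log-amplitude
+    # is regularized by an integrated Wiener process.
+    correlated_field = ift.SimpleCorrelatedField(
+        position_space, offset_mean=0., offset_std=(1e-3, 1e-6),
+        fluctuations=(1., 0.5), flexibility=(2., 1.), asperity=(0.5, 0.4),
+        loglogavgslope=(-3., 1.))
+    xi = ift.from_random(correlated_field.domain)  # latent parameters
+    prior_sample = correlated_field(xi)            # one prior realization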
@@ -176,7 +175,7 @@ It only requires minimizing the information Hamiltonian, e.g. by a gradient desc
 
 NIFTy7 automatically calculates the necessary gradient from a generative model of the signal and the data and uses this to minimize the Hamiltonian.
 
-However, MAP often provides unsatisfactory results in cases of deep hirachical Bayesian networks.
+However, MAP often provides unsatisfactory results in cases of deep hierarchical Bayesian networks.
 The reason for this is that MAP ignores the volume factors in parameter space, which are not to be neglected in deciding whether a solution is reasonable or not.
 In the high dimensional setting of field inference these volume factors can differ by large ratios.
 A MAP estimate, which is only representative for a tiny fraction of the parameter space, might be a poorer choice (with respect to an error norm) compared to a slightly worse location with slightly lower posterior probability, which, however, is associated with a much larger volume (of nearby locations with similar probability).
@@ -214,7 +213,7 @@ The only term within the KL-divergence that explicitly depends on it is the Hami
     \mathrm{KL}(m|d) \;\widehat{=}\;
     \left\langle  \mathcal{H}(\xi,d)    \right\rangle_{\mathcal{Q}(\xi)},
 
-where :math:`\widehat{=}` expresses equality up to irrelvant (here not :math:`m`-dependent) terms.
+where :math:`\widehat{=}` expresses equality up to irrelevant (here not :math:`m`-dependent) terms.
 
 Thus, only the gradient of the KL is needed with respect to this, which can be expressed as
 
@@ -227,30 +226,36 @@ The particular structure of the covariance allows us to draw independent samples
 This KL-divergence for MGVI is implemented by
 :func:`~nifty7.minimization.kl_energies.MetricGaussianKL` within NIFTy7.
 
-It should be noted that MGVI can typically only provide a lower bound on the variance.
+Note that MGVI typically provides only a lower bound on the variance.
 
 Geometric Variational Inference
 -------------------------------
 
 For non-linear posterior distributions :math:`\mathcal{P}(\xi|d)` an approximation with a Gaussian :math:`\mathcal{Q}(\xi)` in the coordinates :math:`\xi` is sub-optimal, as higher order interactions are ignored.
 A better approximation can be achieved by constructing a coordinate system :math:`y = g\left(\xi\right)` in which the posterior is close to a Gaussian, and perform VI with a Gaussian :math:`\mathcal{Q}(y)` in these coordinates.
+This approach is called Geometric Variational Inference (geoVI).
+It is discussed in detail in [6]_.
 
-One useful coordinate system is obtained in case the metric :math:`M` of the posterior can be expressed as the pullback of the Euclidean metric using an invertible transformation :math:`g`:
+One useful coordinate system is obtained if the metric :math:`M` of the posterior can be expressed as the pullback of the Euclidean metric by :math:`g`:
 
 .. math::
 
     M = \left(\frac{\partial g}{\partial \xi}\right)^T \frac{\partial g}{\partial \xi} \ .
 
-In general, such a transformation can only be constructed locally, i.E. in a neighbourhood of some expansion point :math:`\bar{\xi}`, denoted as :math:`g_{\bar{\xi}}\left(\xi\right)`. Using :math:`g_{\bar{\xi}}`, the Geometric Variational Inference [6]_ (GeoVI) scheme uses a zero mean, unit Gaussian :math:`\mathcal{Q}(y) = \mathcal{G}(y, 1)` approximation, which can be expressed in :math:`\xi` coordinates via the pushforward using the inverse transformation :math:`\xi = g_{\bar{\xi}}^{-1}(y)`:
+In general, such a transformation exists only locally, i.e. in a neighbourhood of some expansion point :math:`\bar{\xi}`, denoted as :math:`g_{\bar{\xi}}\left(\xi\right)`.
+Using :math:`g_{\bar{\xi}}`, the geoVI scheme employs a zero-mean, unit Gaussian approximation :math:`\mathcal{Q}(y) = \mathcal{G}(y, 1)`.
+It can be expressed in :math:`\xi` coordinates via the pushforward by the inverse transformation :math:`\xi = g_{\bar{\xi}}^{-1}(y)`:
 
 .. math::
 
     \mathcal{Q}_{\bar{\xi}}(\xi) = \left(g_{\bar{\xi}}^{-1} * \mathcal{Q}\right)(\xi) = \int \delta\left(\xi - g_{\bar{\xi}}^{-1}(y)\right) \ \mathcal{G}(y, 1) \ \mathcal{D}y \ ,
 
-where :math:`\delta` denotes the kronecker-delta.
+where :math:`\delta` denotes the Dirac delta distribution.
 
-The remaining task in geoVI is to obtain the optimal expansion point :math:`\bar{\xi}` suth that :math:`\mathcal{Q}_{\bar{\xi}}` matches the posterior as good as possible. Analogous to the MGVI algorithm, :math:`\bar{\xi}` is obtained by minimization of the KL-divergenge between :math:`\mathcal{P}` and :math:`\mathcal{Q}_{\bar{\xi}}` w.r.t. :math:`\bar{\xi}`. Furthermore the KL is represented as a stochastic estimate using a set of samples drawn from :math:`\mathcal{Q}_{\bar{\xi}}` which is implemented in NIFTy7 via :func:`~nifty7.minimization.kl_energies.GeoMetricKL`.
+The geoVI scheme obtains the optimal expansion point :math:`\bar{\xi}` such that :math:`\mathcal{Q}_{\bar{\xi}}` matches the posterior as well as possible.
+Analogous to the MGVI algorithm, :math:`\bar{\xi}` is obtained by minimization of the KL-divergence between :math:`\mathcal{P}` and :math:`\mathcal{Q}_{\bar{\xi}}` w.r.t. :math:`\bar{\xi}`.
+Furthermore, the KL is represented as a stochastic estimate using a set of samples drawn from :math:`\mathcal{Q}_{\bar{\xi}}`; this is implemented in NIFTy7 via :func:`~nifty7.minimization.kl_energies.GeoMetricKL`.
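+
+A minimal sketch of a single geoVI step (assuming a standard Hamiltonian
+``ham``, a descent minimizer ``minimizer`` and a sampling minimizer
+``minimizer_samp``, e.g. a NewtonCG instance, are already set up; the names
+are illustrative):
+
+.. code-block:: python
+
+    import nifty7 as ift
+
+    mean = ift.from_random(ham.domain)  # expansion point
+    kl = ift.GeoMetricKL(mean, ham, n_samples=4,
+                         minimizer_samp=minimizer_samp, mirror_samples=True)
+    kl, _ = minimizer(kl)  # minimize the KL w.r.t. the expansion point
+    mean = kl.position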
 
-A visual comparison of the MGVI and GeoVI algorithm can be found in the demo script `mgvi_visualized.py`.
+A visual comparison of the MGVI and geoVI algorithms can be found in `variational_inference_visualized.py <https://gitlab.mpcdf.mpg.de/ift/nifty/-/blob/NIFTy_7/demos/variational_inference_visualized.py>`_.
 
 .. [6] P. Frank, R. Leike, and T.A. Enßlin (2021), "Geometric Variational Inference"; `[arXiv:2105.10470] <https://arxiv.org/abs/2105.10470>`_
diff --git a/src/minimization/kl_energies.py b/src/minimization/kl_energies.py
index ef67185a5..501315c6a 100644
--- a/src/minimization/kl_energies.py
+++ b/src/minimization/kl_energies.py
@@ -66,12 +66,35 @@ def _modify_sample_domain(sample, domain):
     raise TypeError
 
 
-def _reduce_by_keys(mean, hamiltonian, keys):
-    if isinstance(mean, MultiField):
-        _, new_ham = hamiltonian.simplify_for_constant_input(mean.extract_by_keys(keys))
-        new_mean =  mean.extract_by_keys(set(mean.keys()) - set(keys))
-        return new_mean, new_ham
-    return mean, hamiltonian
+def _reduce_by_keys(field, operator, keys):
+    """Partially insert a field into an operator
+
+    If the domain of the operator is an instance of `DomainTuple`, `keys`
+    must be empty and `field` and `operator` are returned unchanged.
+
+    Parameters
+    ----------
+    field : Field or MultiField
+        Potentially partially constant input field.
+    operator : Operator
+        Operator into which `field` is partially inserted.
+    keys : list
+        List of keys of the `MultiDomain` that are treated as constant.
+
+    Returns
+    -------
+    tuple
+        The variable part of the field and the contracted operator.
+    """
+    from ..sugar import is_fieldlike, is_operator
+    myassert(is_fieldlike(field))
+    myassert(is_operator(operator))
+    if isinstance(field, MultiField):
+        cst_field = field.extract_by_keys(keys)
+        var_field = field.extract_by_keys(set(field.keys()) - set(keys))
+        _, new_ham = operator.simplify_for_constant_input(cst_field)
+        return var_field, new_ham
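+    # A plain Field has no keys, so there is nothing to hold constant.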
+    myassert(len(keys) == 0)
+    return field, operator
 
 
 class _KLMetric(EndomorphicOperator):
@@ -86,9 +109,11 @@ class _KLMetric(EndomorphicOperator):
 
 
 class _SampledKLEnergy(Energy):
-    """Base class for Energies representing a sampled Kullback-Leibler divergence
-    for the variational approximation of a distribution with another distribution.
-    """
+    """Base class for Energies representing a sampled Kullback-Leibler
+    divergence for the variational approximation of a distribution with another
+    distribution.
+
+    The samples can be distributed across MPI tasks."""
     def __init__(self, mean, hamiltonian, n_samples, mirror_samples, comm,
                  local_samples, nanisinf):
         super(_SampledKLEnergy, self).__init__(mean)
@@ -331,7 +356,7 @@ def MetricGaussianKL(mean, hamiltonian, n_samples, mirror_samples, constants=[],
     nanisinf : bool
         If true, nan energies which can happen due to overflows in the forward
         model are interpreted as inf. Thereby, the code does not crash on
-        these occaisions but rather the minimizer is told that the position it
+        these occasions but rather the minimizer is told that the position it
         has tried is not sensible.
 
     Notes
@@ -412,7 +437,7 @@ def GeoMetricKL(mean, hamiltonian, n_samples, minimizer_samp, mirror_samples,
         sample variation is counterbalanced.
     start_from_lin : boolean
         Whether the non-linear sampling should start using the inverse
-        linearized transformation (i.E. the corresponding MGVI sample).
+        linearized transformation (i.e. the corresponding MGVI sample).
         If False, the minimization starts from the prior sample.
         Default is True.
     constants : list
@@ -428,11 +453,11 @@ def GeoMetricKL(mean, hamiltonian, n_samples, minimizer_samp, mirror_samples,
     comm : MPI communicator or None
         If not None, samples will be distributed as evenly as possible
         across this communicator. If `mirror_samples` is set, then a sample and
-        its mirror image will preferably reside on the same task if neccessary.
+        its mirror image will reside on the same task if necessary.
     nanisinf : bool
         If true, nan energies which can happen due to overflows in the forward
         model are interpreted as inf. Thereby, the code does not crash on
-        these occaisions but rather the minimizer is told that the position it
+        these occasions but rather the minimizer is told that the position it
         has tried is not sensible.
 
     Notes
@@ -444,13 +469,14 @@ def GeoMetricKL(mean, hamiltonian, n_samples, minimizer_samp, mirror_samples,
     via the factory function :attr:`make`!
 
     Note on MPI and mirror_samples:
-    As in MGVI, mirroreing samples can help to stabilize the latent mean as it
+    As in MGVI, mirroring samples can help to stabilize the latent mean as it
     reduces sampling noise. But unlike MGVI a mirrored sample involves an
     additional solve of the non-linear transformation. Therefore, when using
     MPI, the mirrored samples also get distributed if enough tasks are available.
     If there are more total samples than tasks, the mirrored counterparts
     try to reside on the same task as their non mirrored partners. This ensures
     that at least the starting position can be re-used.
+
     See also
     --------
     `Geometric Variational Inference`, Philipp Frank, Reimar Leike,
@@ -467,8 +493,9 @@ def GeoMetricKL(mean, hamiltonian, n_samples, minimizer_samp, mirror_samples,
     if not isinstance(minimizer_samp, DescentMinimizer):
         raise TypeError
     if isinstance(mean, MultiField) and set(point_estimates) == set(mean.keys()):
-        raise RuntimeError(
-            'Point estimates for whole domain. Use EnergyAdapter instead.')
+        s = 'Point estimates for whole domain. Use EnergyAdapter instead.'
+        raise RuntimeError(s)
+
     n_samples = int(n_samples)
     mirror_samples = bool(mirror_samples)
 
diff --git a/src/operators/energy_operators.py b/src/operators/energy_operators.py
index d2b081a86..45691d43b 100644
--- a/src/operators/energy_operators.py
+++ b/src/operators/energy_operators.py
@@ -70,7 +70,9 @@ def _field_to_dtype(field):
 class EnergyOperator(Operator):
     """Operator which has a scalar domain as target domain.
 
-    It is intended as an objective function for field inference.
+    It is intended as an objective function for field inference. It can
+    implement a positive definite, symmetric form (called `metric`) that is
+    used as curvature for second-order minimization schemes.
 
     Examples
     --------
@@ -82,11 +84,13 @@ class EnergyOperator(Operator):
 
 
 class LikelihoodOperator(EnergyOperator):
-    """`EnergyOperator` representing a likelihood. The input to the Operator
-    are the parameters of the likelihood. Unlike a general `EnergyOperator`,
-    the metric of a `LikelihoodOperator` is the Fisher information metric of
-    the likelihood.
+    """Represent a log-likelihood.
+
+    The input to the Operator are the parameters of the likelihood. Unlike a
+    general `EnergyOperator`, the metric of a `LikelihoodOperator` is the
+    Fisher information metric of the likelihood.
     """
+
     def get_metric_at(self, x):
         """Computes the Fisher information metric for a `LikelihoodOperator`
         at `x` using the Jacobian of the coordinate transformation given by
@@ -176,10 +180,9 @@ class VariableCovarianceGaussianEnergy(LikelihoodOperator):
         Data type of the samples. Usually either 'np.float*' or 'np.complex*'
 
     use_full_fisher: boolean
-        Whether or not the proper Fisher information metric should be used as
-        a `metric`. If False the same approximation used in
-        `get_transformation` is used instead.
-        Default is True
+        Determines whether the proper Fisher information metric is used as
+        `metric`. If False, the same approximation as in `get_transformation`
+        is used. Default is True.
     """
 
     def __init__(self, domain, residual_key, inverse_covariance_key,
@@ -234,7 +237,7 @@ class VariableCovarianceGaussianEnergy(LikelihoodOperator):
     def get_transformation(self):
         """Note that for the metric of a `VariableCovarianceGaussianEnergy` no
         global transformation to Euclidean space exists. A local approximation
-        ivoking the resudual is used instead.
+        invoking the residual is used instead.
         """
         r = FieldAdapter(self._domain[self._kr], self._kr)
         ivar = FieldAdapter(self._domain[self._kr], self._ki)
diff --git a/src/operators/operator.py b/src/operators/operator.py
index 1e0dc1dc7..66abf7c9c 100644
--- a/src/operators/operator.py
+++ b/src/operators/operator.py
@@ -108,17 +108,26 @@ class Operator(metaclass=NiftyMeta):
         return None
 
     def get_transformation(self):
-        """The coordinate transformation that maps into a coordinate system
-        where the metric of a likelihood is the Euclidean metric.
-        This is `None`, except when the object an instance of
-        :class:`~nifty7.operators.energy_operators.LikelihoodOperator` or a
-        (nested) sum thereof.
+        """The coordinate transformation that maps into a coordinate system in
+        which the metric of a likelihood is the Euclidean metric. It is `None`,
+        except for instances of
+        :class:`~nifty7.operators.energy_operators.LikelihoodOperator` or
+        (nested) sums thereof.
 
         Returns
         -------
-        np.dtype, or dict of np.dtype : The dtype(s) of the target space of the transformation.
-
-        Operator : The transformation that maps from `domain` into the Euclidean target space.
+        np.dtype, or dict of np.dtype : The dtype(s) of the target space of the
+        transformation.
+
+        Operator : The transformation that maps from `domain` into the
+        Euclidean target space.
+
+        Note
+        ----
+        This Euclidean target space is the disjoint union of the Euclidean
+        target spaces of all summands. Therefore, the keys of `MultiDomains`
+        are prefixed with an index and `DomainTuples` are converted to
+        `MultiDomains` with the index as the key.
         """
         return None
 
-- 
GitLab