Commit 08279a38 authored by Martin Reinecke

Merge branch 'NIFTy_5' into privatization

parents bfdb0c7f e1e58be3
@@ -74,7 +74,7 @@ if __name__ == '__main__':
    ic_sampling = ift.GradientNormController(iteration_limit=100)
    # Minimize the Hamiltonian
-   H = ift.Hamiltonian(likelihood, ic_sampling)
+   H = ift.StandardHamiltonian(likelihood, ic_sampling)
    H = ift.EnergyAdapter(position, H, want_metric=True)
    # minimizer = ift.L_BFGS(ic_newton)
    H, convergence = minimizer(H)
@@ -99,7 +99,7 @@ if __name__ == '__main__':
    minimizer = ift.NewtonCG(ic_newton)
    # Compute MAP solution by minimizing the information Hamiltonian
-   H = ift.Hamiltonian(likelihood)
+   H = ift.StandardHamiltonian(likelihood)
    initial_position = ift.from_random('normal', domain)
    H = ift.EnergyAdapter(initial_position, H, want_metric=True)
    H, convergence = minimizer(H)
@@ -100,10 +100,10 @@ if __name__ == '__main__':
    # Set up likelihood and information Hamiltonian
    likelihood = ift.GaussianEnergy(mean=data, covariance=N)(signal_response)
-   H = ift.Hamiltonian(likelihood, ic_sampling)
-   initial_position = ift.MultiField.full(H.domain, 0.)
-   position = initial_position
+   H = ift.StandardHamiltonian(likelihood, ic_sampling)
+   initial_mean = ift.MultiField.full(H.domain, 0.)
+   mean = initial_mean
    plot = ift.Plot()
    plot.add(signal(mock_position), title='Ground Truth')
@@ -117,9 +117,9 @@ if __name__ == '__main__':
    # Draw new samples to approximate the KL five times
    for i in range(5):
        # Draw new samples and minimize KL
-       KL = ift.KL_Energy(position, H, N_samples)
+       KL = ift.MetricGaussianKL(mean, H, N_samples)
        KL, convergence = minimizer(KL)
-       position = KL.position
+       mean = KL.position
        # Plot current reconstruction
        plot = ift.Plot()
@@ -128,7 +128,7 @@ if __name__ == '__main__':
    plot.output(ny=1, ysize=6, xsize=16, name="loop-{:02}.png".format(i))
    # Draw posterior samples
-   KL = ift.KL_Energy(position, H, N_samples)
+   KL = ift.MetricGaussianKL(mean, H, N_samples)
    sc = ift.StatCalculator()
    for sample in KL.samples:
        sc.add(signal(sample + KL.position))
...@@ -103,7 +103,7 @@ N = ift.DiagonalOperator(ift.from_global_data(d_space, var)) ...@@ -103,7 +103,7 @@ N = ift.DiagonalOperator(ift.from_global_data(d_space, var))
IC = ift.DeltaEnergyController(tol_rel_deltaE=1e-12, iteration_limit=200) IC = ift.DeltaEnergyController(tol_rel_deltaE=1e-12, iteration_limit=200)
likelihood = ift.GaussianEnergy(d, N)(R) likelihood = ift.GaussianEnergy(d, N)(R)
Ham = ift.Hamiltonian(likelihood, IC) Ham = ift.StandardHamiltonian(likelihood, IC)
H = ift.EnergyAdapter(params, Ham, want_metric=True) H = ift.EnergyAdapter(params, Ham, want_metric=True)
# Minimize # Minimize
......
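Taken together, these hunks rename `ift.Hamiltonian` to `ift.StandardHamiltonian` and `ift.KL_Energy` to `ift.MetricGaussianKL`, with the variable `position` becoming `mean`. Below is a minimal sketch of the updated inference loop; only the renamed classes and the call patterns are taken from this commit, while the surrounding setup (space, response, data, noise level) is invented here for illustration and may not run verbatim against every NIFTy5 revision:

    import nifty5 as ift

    # Illustrative setup (not part of the commit): unit response, toy data.
    space = ift.RGSpace(128)
    R = ift.ScalingOperator(1., space)        # stand-in response
    data = ift.from_random('normal', space)   # stand-in data
    N = ift.ScalingOperator(0.1, space)       # stand-in noise covariance

    likelihood = ift.GaussianEnergy(mean=data, covariance=N)(R)
    ic_sampling = ift.GradientNormController(iteration_limit=100)
    ic_newton = ift.GradientNormController(iteration_limit=30)
    minimizer = ift.NewtonCG(ic_newton)

    # New names introduced by this merge:
    H = ift.StandardHamiltonian(likelihood, ic_sampling)
    mean = ift.full(H.domain, 0.)
    for _ in range(5):
        KL = ift.MetricGaussianKL(mean, H, 5)  # KL estimated from 5 samples
        KL, convergence = minimizer(KL)
        mean = KL.position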
# rm -rf docs/build docs/source/mod
sphinx-apidoc -e -o docs/source/mod nifty5
sphinx-build -b html docs/source/ docs/build/
@@ -5,11 +5,20 @@ Theoretical Background
 ----------------------

-`Information Field Theory <http://www.mpa-garching.mpg.de/ift/>`_ [1]_ (IFT) is information theory, the logic of reasoning under uncertainty, applied to fields. A field can be any quantity defined over some space, e.g. the air temperature over Europe, the magnetic field strength in the Milky Way, or the matter density in the Universe. IFT describes how data and knowledge can be used to infer field properties. Mathematically it is a statistical field theory and exploits many of the tools developed for such. Practically, it is a framework for signal processing and image reconstruction.
+`Information Field Theory <http://www.mpa-garching.mpg.de/ift/>`_ [1]_ (IFT) is information theory, the logic of reasoning under uncertainty, applied to fields.
+A field can be any quantity defined over some space, e.g. the air temperature over Europe, the magnetic field strength in the Milky Way, or the matter density in the Universe.
+IFT describes how data and knowledge can be used to infer field properties.
+Mathematically it is a statistical field theory and exploits many of the tools developed for such.
+Practically, it is a framework for signal processing and image reconstruction.

-IFT is fully Bayesian. How else could infinitely many field degrees of freedom be constrained by finite data?
+IFT is fully Bayesian.
+How else could infinitely many field degrees of freedom be constrained by finite data?

-There is a full toolbox of methods that can be used, like the classical approximation (= Maximum a posteriori = MAP), effective action (= Variational Bayes = VI), Feynman diagrams, renormalization, and more. IFT reproduces many known well working algorithms. This should be reassuring. Also, there were certainly previous works in a similar spirit. Anyhow, in many cases IFT provides novel rigorous ways to extract information from data. NIFTy comes with reimplemented MAP and VI estimators. It also provides a Hamiltonian Monte Carlo sampler for Fields (HMCF). (*FIXME* does it?)
+There is a full toolbox of methods that can be used, like the classical approximation (= Maximum a posteriori = MAP), effective action (= Variational Bayes = VI), Feynman diagrams, renormalization, and more.
+IFT reproduces many known, well-working algorithms, which is reassuring.
+Also, there were certainly previous works in a similar spirit.
+Anyhow, in many cases IFT provides novel rigorous ways to extract information from data.
+NIFTy comes with reimplemented MAP and VI estimators.

 .. tip:: *In-a-nutshell introductions to information field theory* can be found in [2]_, [3]_, [4]_, and [5]_, with the latter probably being the most didactical.
@@ -27,7 +36,8 @@ There is a full toolbox of methods that can be used, like the classical approxim
 Discretized continuum
 ---------------------

-The representation of fields that are mathematically defined on a continuous space in a finite computer environment is a common necessity. The goal hereby is to preserve the continuum limit in the calculus in order to ensure a resolution independent discretization.
+The representation of fields that are mathematically defined on a continuous space in a finite computer environment is a common necessity.
+The goal hereby is to preserve the continuum limit in the calculus in order to ensure a resolution-independent discretization.

 +-----------------------------+-----------------------------+
 | .. image:: images/42vs6.png | .. image:: images/42vs9.png |
@@ -43,7 +53,8 @@ Any partition of the continuous position space :math:`\Omega` (with volume :math
     V &\quad=\quad \int_\Omega \mathrm{d}x \quad=\quad \sum_{q=1}^Q \int_{\Omega_q} \mathrm{d}x \quad=\quad \sum_{q=1}^Q V_q
     .

-Here the number :math:`Q` characterizes the resolution of the pixelization and the continuum limit is described by :math:`Q \rightarrow \infty` and :math:`V_q \rightarrow 0` for all :math:`q \in \{1,\dots,Q\}` simultaneously. Moreover, the above equation defines a discretization of continuous integrals, :math:`\int_\Omega \mathrm{d}x \mapsto \sum_q V_q`.
+Here the number :math:`Q` characterizes the resolution of the pixelization and the continuum limit is described by :math:`Q \rightarrow \infty` and :math:`V_q \rightarrow 0` for all :math:`q \in \{1,\dots,Q\}` simultaneously.
+Moreover, the above equation defines a discretization of continuous integrals, :math:`\int_\Omega \mathrm{d}x \mapsto \sum_q V_q`.

 Any valid discretization scheme for a field :math:`{s}` can be described by a mapping,
@@ -52,39 +63,48 @@ Any valid discretization scheme for a field :math:`{s}` can be described by a ma
     s(x \in \Omega_q) \quad\mapsto\quad s_q \quad=\quad \int_{\Omega_q} \mathrm{d}x \; w_q(x) \; s(x)
     ,

-if the weighting function :math:`w_q(x)` is chosen appropriately. In order for the discretized version of the field to converge to the actual field in the continuum limit, the weighting functions need to be normalized in each subset; i.e., :math:`\forall q: \int_{\Omega_q} \mathrm{d}x \; w_q(x) = 1`. Choosing such a weighting function that is constant with respect to :math:`x` yields
+if the weighting function :math:`w_q(x)` is chosen appropriately.
+In order for the discretized version of the field to converge to the actual field in the continuum limit, the weighting functions need to be normalized in each subset; i.e., :math:`\forall q: \int_{\Omega_q} \mathrm{d}x \; w_q(x) = 1`.
+Choosing a weighting function that is constant with respect to :math:`x` yields

 .. math::

     s_q = \frac{\int_{\Omega_q} \mathrm{d}x \; s(x)}{\int_{\Omega_q} \mathrm{d}x} = \left< s(x) \right>_{\Omega_q}
     ,

-which corresponds to a discretization of the field by spatial averaging. Another common and equally valid choice is :math:`w_q(x) = \delta(x-x_q)`, which distinguishes some position :math:`x_q \in \Omega_q`, and evaluates the continuous field at this position,
+which corresponds to a discretization of the field by spatial averaging.
+Another common and equally valid choice is :math:`w_q(x) = \delta(x-x_q)`, which distinguishes some position :math:`x_q \in \Omega_q` and evaluates the continuous field at this position,

 .. math::

     s_q \quad=\quad \int_{\Omega_q} \mathrm{d}x \; \delta(x-x_q) \; s(x) \quad=\quad s(x_q)
     .

-In practice, one often makes use of the spatially averaged pixel position, :math:`x_q = \left< x \right>_{\Omega_q}`. If the resolution is high enough to resolve all features of the signal field :math:`{s}`, both of these discretization schemes approximate each other, :math:`\left< s(x) \right>_{\Omega_q} \approx s(\left< x \right>_{\Omega_q})`, since they approximate the continuum limit by construction. (The approximation of :math:`\left< s(x) \right>_{\Omega_q} \approx s(x_q \in \Omega_q)` marks a resolution threshold beyond which further refinement of the discretization reveals no new features; i.e., no new information content of the field :math:`{s}`.)
+In practice, one often makes use of the spatially averaged pixel position, :math:`x_q = \left< x \right>_{\Omega_q}`.
+If the resolution is high enough to resolve all features of the signal field :math:`{s}`, both of these discretization schemes approximate each other, :math:`\left< s(x) \right>_{\Omega_q} \approx s(\left< x \right>_{\Omega_q})`, since they approximate the continuum limit by construction.
+(The approximation :math:`\left< s(x) \right>_{\Omega_q} \approx s(x_q \in \Omega_q)` marks a resolution threshold beyond which further refinement of the discretization reveals no new features; i.e., no new information content of the field :math:`{s}`.)

-All operations involving position integrals can be normalized in accordance with the above definitions. For example, the scalar product between two fields :math:`{s}` and :math:`{u}` is defined as
+All operations involving position integrals can be normalized in accordance with the above definitions.
+For example, the scalar product between two fields :math:`{s}` and :math:`{u}` is defined as

 .. math::

     {s}^\dagger {u} \quad=\quad \int_\Omega \mathrm{d}x \; s^*(x) \; u(x) \quad\approx\quad \sum_{q=1}^Q V_q^{\phantom{*}} \; s_q^* \; u_q^{\phantom{*}}
     ,

-where :math:`\dagger` denotes adjunction and :math:`*` complex conjugation. Since the above approximation becomes an equality in the continuum limit, the scalar product is independent of the pixelization scheme and resolution, if the latter is sufficiently high.
+where :math:`\dagger` denotes adjunction and :math:`*` complex conjugation.
+Since the above approximation becomes an equality in the continuum limit, the scalar product is independent of the pixelization scheme and resolution, if the latter is sufficiently high.

-The above line of argumentation analogously applies to the discretization of operators. For a linear operator :math:`{A}` acting on some field :math:`{s}` as :math:`{A} {s} = \int_\Omega \mathrm{d}y \; A(x,y) \; s(y)`, a matrix representation discretized with constant weighting functions is given by
+The above line of argumentation applies analogously to the discretization of operators.
+For a linear operator :math:`{A}` acting on some field :math:`{s}` as :math:`{A} {s} = \int_\Omega \mathrm{d}y \; A(x,y) \; s(y)`, a matrix representation discretized with constant weighting functions is given by

 .. math::

     A(x \in \Omega_p, y \in \Omega_q) \quad\mapsto\quad A_{pq} \quad=\quad \frac{\iint_{\Omega_p \Omega_q} \mathrm{d}x \, \mathrm{d}y \; A(x,y)}{\iint_{\Omega_p \Omega_q} \mathrm{d}x \, \mathrm{d}y} \quad=\quad \big< \big< A(x,y) \big>_{\Omega_p} \big>_{\Omega_q}
     .

-The proper discretization of spaces, fields, and operators, as well as the normalization of position integrals, is essential for the conservation of the continuum limit. Their consistent implementation in NIFTy allows a pixelization independent coding of algorithms.
+The proper discretization of spaces, fields, and operators, as well as the normalization of position integrals, is essential for the conservation of the continuum limit.
+Their consistent implementation in NIFTy allows a pixelization-independent coding of algorithms.
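As a small numerical check of these conventions (plain NumPy, invented for this write-up, not NIFTy code): with the volume factors :math:`V_q` included, the discretized scalar product converges to the continuum integral and is insensitive to the pixelization:

    import numpy as np

    def scalar_product(s, u, V):
        # s^dagger u  ~  sum_q V_q s_q^* u_q
        return np.sum(V * np.conj(s) * u)

    for Q in (100, 1000, 10000):           # increasing resolution
        x = (np.arange(Q) + 0.5) / Q       # pixel centers on Omega = [0, 1)
        V = np.full(Q, 1.0 / Q)            # equal pixel volumes V_q
        s = np.sin(2 * np.pi * x)          # fields evaluated at pixel centers
        u = np.cos(2 * np.pi * x) + 1.0
        print(Q, scalar_product(s, u, V))  # approaches the continuum value 0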
Free Theory & Implicit Operators
--------------------------------
@@ -135,18 +155,24 @@ the posterior covariance operator, and

     j = R^\dagger N^{-1} d

-the information source. The operation in :math:`{m = D\,R^\dagger N^{-1} d}` is also called the generalized Wiener filter.
+the information source.
+The operation in :math:`{m = D\,R^\dagger N^{-1} d}` is also called the generalized Wiener filter.

 NIFTy permits defining the involved operators :math:`{R}`, :math:`{R^\dagger}`, :math:`{S}`, and :math:`{N}` implicitly, as routines that can be applied to vectors, but which do not require the explicit storage of the matrix elements of the operators.

-Some of these operators are diagonal in harmonic (Fourier) basis, and therefore only require the specification of a (power) spectrum and :math:`{S= F\,\widehat{P_s} F^\dagger}`. Here :math:`{F = \mathrm{HarmonicTransformOperator}}`, :math:`{\widehat{P_s} = \mathrm{DiagonalOperator}(P_s)}`, and :math:`{P_s(k)}` is the power spectrum of the process that generated :math:`{s}` as a function of the (absolute value of the) harmonic (Fourier) space koordinate :math:`{k}`. For those, NIFTy can easily also provide inverse operators, as :math:`{S^{-1}= F\,\widehat{\frac{1}{P_s}} F^\dagger}` in case :math:`{F}` is unitary, :math:`{F^\dagger=F^{-1}}`.
+Some of these operators are diagonal in harmonic (Fourier) basis, and therefore only require the specification of a (power) spectrum and :math:`{S= F\,\widehat{P_s} F^\dagger}`.
+Here :math:`{F = \mathrm{HarmonicTransformOperator}}`, :math:`{\widehat{P_s} = \mathrm{DiagonalOperator}(P_s)}`, and :math:`{P_s(k)}` is the power spectrum of the process that generated :math:`{s}` as a function of the (absolute value of the) harmonic (Fourier) space coordinate :math:`{k}`.
+For those, NIFTy can easily also provide inverse operators, as :math:`{S^{-1}= F\,\widehat{\frac{1}{P_s}} F^\dagger}` in case :math:`{F}` is unitary, :math:`{F^\dagger=F^{-1}}`.

 These implicit operators can be combined into new operators, e.g. to :math:`{D^{-1} = S^{-1} + R^\dagger N^{-1} R}`, as well as their inverses, e.g. :math:`{D = \left( D^{-1} \right)^{-1}}`.
 The invocation of an inverse operator applied to a vector might trigger the execution of a numerical linear algebra solver.

-Thus, when NIFTy calculates :math:`{m = D\, j}` it actually solves :math:`{D^{-1} m = j}` for :math:`{m}` behind the scenes. The advantage of implicit operators to explicit matrices is the reduced memory requirements. The reconstruction of only a Megapixel image would otherwithe require the storage and processing of matrices with sizes of several Terrabytes. Larger images could not be dealt with due to the quadratic memory requirements of explicit operator representations.
+Thus, when NIFTy calculates :math:`{m = D\, j}`, it actually solves :math:`{D^{-1} m = j}` for :math:`{m}` behind the scenes.
+The advantage of implicit operators over explicit matrices is the reduced memory requirement.
+The reconstruction of even a Megapixel image would otherwise require the storage and processing of matrices with sizes of several Terabytes.
+Larger images could not be dealt with due to the quadratic memory requirements of explicit operator representations.

-The demo codes demos/getting_started_1.py and demos/Wiener_Filter.ipynb illustrate this.
+The demo codes `demos/getting_started_1.py` and `demos/Wiener_Filter.ipynb` illustrate this.
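The idea can be illustrated with a toy sketch in plain NumPy/SciPy (invented for this write-up, deliberately not NIFTy code): :math:`D^{-1}` exists only as a matrix-vector routine, and :math:`m = D\,j` is obtained by conjugate gradients instead of ever forming a matrix:

    import numpy as np
    from scipy.sparse.linalg import LinearOperator, cg

    n = 1000                                  # an explicit D would need n*n entries
    k = np.fft.fftfreq(n) * n
    P_s = 1.0 / (1.0 + np.abs(k))**2          # toy power spectrum, symmetric in k
    noise_var = 0.1

    def S_inv(x):
        # S^{-1} = F (1/P_s) F^dagger, applied implicitly via FFTs
        return np.fft.ifft(np.fft.fft(x) / P_s).real

    def D_inv(x):
        # D^{-1} = S^{-1} + R^dagger N^{-1} R, with R = identity here
        return S_inv(x) + x / noise_var

    d = np.random.randn(n)                    # toy data
    j = d / noise_var                         # information source j = R^dagger N^{-1} d
    D_inv_op = LinearOperator((n, n), matvec=D_inv)
    m, info = cg(D_inv_op, j)                 # solves D^{-1} m = j, i.e. m = D j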
Generative Models
@@ -164,7 +190,7 @@ Let us rewrite the above free theory as a generative model:

 with :math:`{A}` the amplitude operator such that it generates signal field realizations with the correct covariance :math:`{S=A\,A^\dagger}` when being applied to a white Gaussian field :math:`{\xi}` with :math:`{\mathcal{P}(\xi)= \mathcal{G}(\xi, 1)}`.

-The joint information Hamiltonian for the whitened signal field :math:`{\xi}` reads:
+The joint information Hamiltonian for the standardized signal field :math:`{\xi}` reads:
 .. math::

@@ -172,26 +198,35 @@ The joint information Hamiltonian for the whitened signal field :math:`{\xi}` re

 NIFTy takes advantage of this formulation in several ways:

-1) All prior degrees of freedom have unit covariance which improves the condition number of operators which need to be inverted.
-2) The amplitude operator can be regarded as part of the response, :math:`{R'=R\,A}`. In general, more sophisticated responses can be constructed out of the composition of simpler operators.
-3) The response can be non-linear, e.g. :math:`{R'(s)=R \exp(A\,\xi)}`, see demos/getting_started_2.py.
-4) The amplitude operator can be made dependent on unknowns as well, e.g. :math:`A=A(\tau)= F\, \widehat{e^\tau}` represents an amplitude operator with a positive definite, unknown spectrum defined in the Fourier domain. The amplitude field :math:`{\tau}` would get its own amplitude operator, with a cepstrum (spectrum of a log spectrum) defined in quefrency space (harmonic space of a logarithmically binned harmonic space) to regularize its degrees of freedom by imposing some (user-defined degree of) spectral smoothness.
-5) NIFTy can calculate the gradient of the information Hamiltonian and the Fisher information metric with respect to all unknown parameters, here :math:`{\xi}` and :math:`{\tau}`, by automatic differentiation. The gradients are used for MAP and HMCF estimates, and the Fisher matrix is required in addition to the gradient by Metric Gaussian Variational Inference (MGVI), which is available in NIFTy as well. MGVI is an implicit operator extension of Automatic Differentiation Variational Inference (ADVI).
+1) All prior degrees of freedom have unit covariance, which improves the condition number of operators that need to be inverted.
+
+2) The amplitude operator can be regarded as part of the response, :math:`{R'=R\,A}`.
+   In general, more sophisticated responses can be constructed out of the composition of simpler operators.
+
+3) The response can be non-linear, e.g. :math:`{R'(s)=R \exp(A\,\xi)}`, see `demos/getting_started_2.py`.
+
+4) The amplitude operator may depend on further parameters, e.g. :math:`A=A(\tau)= F\, \widehat{e^\tau}` represents an amplitude operator with a positive definite, unknown spectrum defined in the Fourier domain.
+   The amplitude field :math:`{\tau}` would get its own amplitude operator, with a cepstrum (spectrum of a log spectrum) defined in quefrency space (harmonic space of a logarithmically binned harmonic space) to regularize its degrees of freedom by imposing some (user-defined degree of) spectral smoothness.
+
+5) NIFTy calculates the gradient of the information Hamiltonian and the Fisher information metric with respect to all unknown parameters, here :math:`{\xi}` and :math:`{\tau}`, by automatic differentiation.
+   The gradients are used for MAP and HMCF estimates, and the Fisher matrix is required in addition to the gradient by Metric Gaussian Variational Inference (MGVI), which is available in NIFTy as well.
+   MGVI is an implicit operator extension of Automatic Differentiation Variational Inference (ADVI).

-The reconstruction of a non-Gaussian signal with unknown covariance from a non-trivial (tomographic) response is demonstrated in demos/getting_started_3.py. Here, the uncertainty of the field and the power spectrum of its generating process are probed via posterior samples provided by the MGVI algorithm.
+The reconstruction of a non-Gaussian signal with unknown covariance from a non-trivial (tomographic) response is demonstrated in `demos/getting_started_3.py`.
+Here, the uncertainty of the field and the power spectrum of its generating process are probed via posterior samples provided by the MGVI algorithm.
 +----------------------------------------------------+
 | **Output of tomography demo getting_started_3.py** |
 +----------------------------------------------------+
 | .. image:: images/getting_started_3_setup.png      |
-|                                                    |
+|    :width: 50 %                                    |
 +----------------------------------------------------+
 | Non-Gaussian signal field,                         |
 | data backprojected into the image domain, power    |
 | spectrum of underlying Gaussian process.           |
 +----------------------------------------------------+
 | .. image:: images/getting_started_3_results.png    |
-|                                                    |
+|    :width: 50 %                                    |
 +----------------------------------------------------+
 | Posterior mean field signal                        |
 | reconstruction, its uncertainty, and the power     |
@@ -199,3 +234,73 @@ The reconstruction of a non-Gaussian signal with unknown covariance from a non-t
 | samples in comparison to the correct one (thick    |
 | orange line).                                      |
 +----------------------------------------------------+
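As a toy rendering of this generative chain in plain NumPy (invented for this write-up; the real demos build the same chain from NIFTy operators): a standardized field :math:`\xi` is colored by an amplitude operator and passed through a non-linear, masking response:

    import numpy as np

    n = 256
    k = np.fft.fftfreq(n) * n
    P_s = 1.0 / (1.0 + np.abs(k))**3           # toy power spectrum
    A = lambda xi: np.fft.ifft(np.sqrt(P_s) * np.fft.fft(xi)).real  # S = A A^dagger

    xi = np.random.randn(n)                    # standardized d.o.f., P(xi) = G(xi, 1)
    s = A(xi)                                  # correlated Gaussian field
    signal = np.exp(s)                         # non-linear (log-normal) field
    mask = np.random.rand(n) < 0.5             # toy response: masked observation
    data = signal[mask] + 0.05 * np.random.randn(mask.sum())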
+Maximum a Posteriori
+--------------------
+
+One popular field estimation method is Maximum a Posteriori (MAP).
+It only requires minimizing the information Hamiltonian, e.g. by a gradient descent method that stops when
+
+.. math::
+
+    \frac{\partial \mathcal{H}(d,\xi)}{\partial \xi} = 0.
+
+NIFTy5 automatically calculates the necessary gradient from a generative model of the signal and the data, and uses this to minimize the Hamiltonian.
+
+However, MAP often provides unsatisfactory results in cases of deep hierarchical Bayesian networks.
+The reason for this is that MAP ignores the volume factors in parameter space, which are not to be neglected in deciding whether a solution is reasonable or not.
+In the high-dimensional setting of field inference these volume factors can differ by large ratios.
+A MAP estimate, which is only representative for a tiny fraction of the parameter space, might be a poorer choice (with respect to an error norm) compared to a slightly worse location with slightly lower posterior probability, which, however, is associated with a much larger volume (of nearby locations with similar probability).
+
+This causes MAP signal estimates to be more prone to overfitting the noise as well as to perception thresholds than methods that take volume effects into account.
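A minimal numerical sketch of a MAP estimate (plain NumPy, with a toy one-parameter model invented here: standard-normal prior on :math:`\xi`, Gaussian noise on a log-normal signal), found by gradient descent on the Hamiltonian:

    import numpy as np

    d, sigma = 1.3, 0.1                        # toy scalar data and noise level

    def grad_H(xi):
        # H(d, xi) = xi^2/2 + (d - exp(xi))^2 / (2 sigma^2), up to constants
        return xi - (d - np.exp(xi)) * np.exp(xi) / sigma**2

    xi = 0.0
    for _ in range(1000):                      # descend until dH/dxi ~ 0
        xi -= 1e-3 * grad_H(xi)
    print("MAP estimate:", xi, "gradient:", grad_H(xi))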
+Variational Inference
+---------------------
+
+One method that takes volume effects into account is Variational Inference (VI).
+In VI, the posterior :math:`\mathcal{P}(\xi|d)` is approximated by a simpler, parametrized distribution, often a Gaussian :math:`\mathcal{Q}(\xi)=\mathcal{G}(\xi-m,D)`.
+The parameters of :math:`\mathcal{Q}`, the mean :math:`m` and its covariance :math:`D`, are obtained by minimization of an appropriate information distance measure between :math:`\mathcal{Q}` and :math:`\mathcal{P}`.
+As a compromise between being optimal and being computationally affordable, the variational Kullback-Leibler (KL) divergence is used:
+
+.. math::
+
+    \mathrm{KL}(m,D|d) = \mathcal{D}_\mathrm{KL}(\mathcal{Q}||\mathcal{P}) =
+    \int \mathcal{D}\xi \, \mathcal{Q}(\xi) \log \left( \frac{\mathcal{Q}(\xi)}{\mathcal{P}(\xi)} \right)
+
+Minimizing this with respect to all entries of the covariance :math:`D` is infeasible for fields.
+Therefore, Metric Gaussian Variational Inference (MGVI) approximates the precision matrix :math:`M=D^{-1}` at the location of the current mean by the Bayesian Fisher information metric,
+
+.. math::
+
+    M \approx \left\langle \frac{\partial \mathcal{H}(d,\xi)}{\partial \xi} \, \frac{\partial \mathcal{H}(d,\xi)}{\partial \xi}^\dagger \right\rangle_{(d,\xi)}.
+
+In practice the average is performed over :math:`\mathcal{P}(d,\xi) \approx \mathcal{P}(d|\xi)\,\delta(\xi-m)` by evaluating the expression at the current mean :math:`m`.
+This results in a Fisher information metric of the likelihood evaluated at the mean plus the prior information metric.
+Therefore we only have to infer the mean of the approximate distribution.
+The only term within the KL-divergence that explicitly depends on it is the Hamiltonian of the true problem averaged over the approximation:
+
+.. math::
+
+    \mathrm{KL}(m|d) \;\widehat{=}\;
+    \left\langle \mathcal{H}(\xi,d) \right\rangle_{\mathcal{Q}(\xi)},
+
+where :math:`\widehat{=}` expresses equality up to irrelevant (here not :math:`m`-dependent) terms.
+Thus, only the gradient of the KL with respect to :math:`m` is needed, which can be expressed as
+
+.. math::
+
+    \frac{\partial \mathrm{KL}(m|d)}{\partial m} = \left\langle \frac{\partial \mathcal{H}(d,\xi)}{\partial \xi} \right\rangle_{\mathcal{G}(\xi-m,D)}.
+
+We stochastically estimate the KL-divergence and its gradient with a set of samples drawn from the approximate posterior distribution.
+The particular structure of the covariance allows us to draw independent samples by solving a certain system of equations.
+This KL-divergence for MGVI is implemented by the class `MetricGaussianKL` within NIFTy5.
+
+The demo `getting_started_3.py` for example not only infers a field this way, but also the power spectrum of the process that has generated the field.
+The cross-correlation of field and power spectrum is taken care of in this process.
+Posterior samples can be obtained to study this cross-correlation.
+
+It should be noted that MGVI, as any VI method, can typically only provide a lower bound on the variance.
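The sampled KL gradient can be illustrated with a toy NumPy sketch (invented for this write-up, reusing the one-parameter Hamiltonian from the MAP sketch above; the real `MetricGaussianKL` works on fields and implicit covariances):

    import numpy as np

    d, sigma = 1.3, 0.1

    def grad_H(xi):
        # same toy Hamiltonian as in the MAP sketch
        return xi - (d - np.exp(xi)) * np.exp(xi) / sigma**2

    m = 0.0
    for _ in range(200):
        # Fisher metric of the toy likelihood at m, plus unit prior metric
        M = 1.0 + np.exp(2 * m) / sigma**2
        samples = m + np.random.randn(20) / np.sqrt(M)   # xi ~ Q = G(xi - m, M^{-1})
        kl_grad = np.mean([grad_H(s) for s in samples])  # <dH/dxi>_Q
        m -= kl_grad / M                                 # Newton-like, stochastic step
    print("approximate posterior mean:", m)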
@@ -19,6 +19,7 @@ from .field import Field
 from .multi_field import MultiField

 from .operators.operator import Operator
+from .operators.adder import Adder
 from .operators.diagonal_operator import DiagonalOperator
 from .operators.distributors import DOFDistributor, PowerDistributor
 from .operators.domain_tuple_field_inserter import DomainTupleFieldInserter
@@ -33,7 +34,6 @@ from .operators.field_zero_padder import FieldZeroPadder
 from .operators.inversion_enabler import InversionEnabler
 from .operators.linear_operator import LinearOperator
 from .operators.mask_operator import MaskOperator
-from .operators.offset_operator import OffsetOperator
 from .operators.qht_operator import QHTOperator
 from .operators.regridding_operator import RegriddingOperator
 from .operators.sampling_enabler import SamplingEnabler
@@ -49,7 +49,7 @@ from .operators.simple_linear_operators import (
 from .operators.value_inserter import ValueInserter
 from .operators.energy_operators import (
     EnergyOperator, GaussianEnergy, PoissonianEnergy, InverseGammaLikelihood,
-    BernoulliEnergy, Hamiltonian, AveragedEnergy)
+    BernoulliEnergy, StandardHamiltonian, AveragedEnergy)
 from .probing import probe_with_posterior_samples, probe_diagonal, \
     StatCalculator
@@ -68,7 +68,7 @@ from .minimization.scipy_minimizer import L_BFGS_B
 from .minimization.energy import Energy
 from .minimization.quadratic_energy import QuadraticEnergy
 from .minimization.energy_adapter import EnergyAdapter
-from .minimization.kl_energy import KL_Energy
+from .minimization.metric_gaussian_kl import MetricGaussianKL
 from .sugar import *
 from .plot import Plot
@@ -16,10 +16,9 @@
 # NIFTy is being developed at the Max-Planck-Institut fuer Astrophysik.

 from ..minimization.energy_adapter import EnergyAdapter
-from ..multi_domain import MultiDomain
 from ..multi_field import MultiField
 from ..operators.distributors import PowerDistributor
-from ..operators.energy_operators import Hamiltonian, InverseGammaLikelihood
+from ..operators.energy_operators import StandardHamiltonian, InverseGammaLikelihood
 from ..operators.scaling_operator import ScalingOperator
 from ..operators.simple_linear_operators import ducktape
@@ -35,25 +34,27 @@ def make_adjust_variances(a,
     Constructs a Hamiltonian to solve constant likelihood optimizations of the
     form phi = a * xi under the constraint that phi remains constant.

+    FIXME xi is white.
+
     Parameters
     ----------
     a : Operator
-        Operator which gives the amplitude when evaluated at a position
+        Gives the amplitude when evaluated at a position.
     xi : Operator
-        Operator which gives the excitation when evaluated at a position
+        Gives the excitation when evaluated at a position.
     position : Field, MultiField
-        Position of the whole problem
+        Position of the entire problem.
     samples : Field, MultiField
-        Residual samples of the whole problem
+        Residual samples of the whole problem.
     scaling : Float
-        Optional rescaling of the Likelihood
+        Optional rescaling of the Likelihood.
     ic_samp : Controller
-        Iteration Controller for Hamiltonian
+        Iteration Controller for Hamiltonian.

     Returns
     -------
-    Hamiltonian
-        A Hamiltonian that can be used for further minimization
+    StandardHamiltonian
+        A Hamiltonian that can be used for further minimization.
     """

     d = a*xi

@@ -71,7 +72,7 @@ def make_adjust_variances(a,
     if scaling is not None:
         x = ScalingOperator(scaling, x.target)(x)

-    return Hamiltonian(InverseGammaLikelihood(d_eval)(x), ic_samp=ic_samp)
+    return StandardHamiltonian(InverseGammaLikelihood(d_eval)(x), ic_samp=ic_samp)
 def do_adjust_variances(position,
@@ -79,6 +80,9 @@ def do_adjust_variances(position,
                         minimizer,
                         xi_key='xi',
                         samples=[]):
+    '''
+    FIXME
+    '''
     h_space = position[xi_key].domain[0]
     pd = PowerDistributor(h_space, amplitude_operator.target[0])
@@ -24,7 +24,7 @@ from ..operators.harmonic_operators import HarmonicTransformOperator
 from ..operators.simple_linear_operators import ducktape

-def CorrelatedField(target, amplitude_operator, name='xi'):
+def CorrelatedField(target, amplitude_operator, name='xi', codomain=None):
     """Constructs an operator which turns a white Gaussian excitation field
     into a correlated field.

@@ -42,16 +42,21 @@ def CorrelatedField(target, amplitude_operator, name='xi'):
     amplitude_operator: Operator
     name : string
         :class:`MultiField` key for the xi-field.
+    codomain : Domain
+        The codomain for target[0]. If not supplied, it is inferred.

     Returns
     -------
-    Correlated field : Operator
+    Operator
+        Correlated field

     """
     tgt = DomainTuple.make(target)
     if len(tgt) > 1:
         raise ValueError
-    h_space = tgt[0].get_default_codomain()
-    ht = HarmonicTransformOperator(h_space, tgt[0])
+    if codomain is None:
+        codomain = tgt[0].get_default_codomain()
+    h_space = codomain
+    ht = HarmonicTransformOperator(h_space, target=tgt[0])
     p_space = amplitude_operator.target[0]
     power_distributor = PowerDistributor(h_space, p_space)
     A = power_distributor(amplitude_operator)
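A hedged usage sketch of the new `codomain` argument (the setup is hypothetical; `amplitude_op` is a placeholder for a suitable amplitude operator and is not defined by this commit):

    import nifty5 as ift

    target = ift.RGSpace(256)
    codomain = target.get_default_codomain()   # may now also be passed explicitly
    amplitude_op = ...                          # placeholder: an amplitude operator
                                                # whose target is a matching power space
    cf = ift.CorrelatedField(target, amplitude_op, name='xi', codomain=codomain)
    correlated = cf(ift.from_random('normal', cf.domain))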
@@ -70,7 +75,7 @@ def MfCorrelatedField(target, amplitudes, name='xi'):
     Parameters
     ----------
     target : Domain, DomainTuple or tuple of Domain