diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000000000000000000000000000000000000..921ffeda512e71d1a70c2797e5c676f80967aede
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+All people who contributed to bfps, in order of the date of their first
+contribution.
+
+Cristian C Lalescu <Cristian.Lalescu@ds.mpg.de>
+Dimitar Vlaykov
+Berenger Bramas
diff --git a/README.rst b/README.rst
index 0379bc61d93b1a88baaf8c0d757c0092dbb6361a..ddb9f2447db919248100368a9a08b13297d5e3a4 100644
--- a/README.rst
+++ b/README.rst
@@ -12,11 +12,14 @@ Parameters and statistics are stored in HDF5 format, together with code
 information, so simulation data should be "future proof" --- suggestions
 of possible improvements to the current approach are always welcome.
 
+The primary aim of bfps is to reduce the time spent setting up and
+babysitting DNS, as well as to simplify the analysis of the generated
+data.
 The wish is that this Python package provides an easy and general way
 of constructing efficient specialized DNS C++ codes for different
 turbulence problems encountered in research.
 At the same time, the package should provide a unified way of
-postprocessing data, and accessing the postprocessing results.
+postprocessing and accessing the postprocessing results.
 The code therefore consists of two main parts: the pure C++ code, a set
 of loosely related "building blocks", and the Python code, which can
 generate C++ code using the pure classes, but with a significant degree
@@ -34,10 +37,10 @@ the user's machine, or submitted to a queue on a cluster.
 Installation
 ------------
 
-So far, the code has been run on an ubuntu 14.04 machine, an opensuse
-13.2 desktop, and a reasonably standard linux cluster (biggest run so
-far was 1344^3 on 16 nodes of 12 cores each, with about 24 seconds per
-time step).
+So far, the code has been run on laptops, desktops, and a couple of
+clusters (biggest run so far was 1536^3 on 16 nodes of 32 cores each,
+with about 11 seconds per time step, for a simple incompressible
+Navier-Stokes problem).
 Postprocessing data may not be very computationally intensive, depending
 on the amount of data involved.
 
@@ -55,21 +58,21 @@ Use a console; navigate to the ``bfps`` folder, and type:
 **Full installation**
 
 If you want to run simulations on the machine where you're installing,
-you will need to call `build` before installing.
+you will need to call `compile_library` before installing.
 Your machine needs to have an MPI compiler installed, the HDF5 C library
 and FFTW >= 3.4.
 The file `machine_settings_py.py` should be modified
-appropriately for your machine (otherwise the `build` command will most
+appropriately for your machine (otherwise the `compile_library` command will most
 likely fail).
 This file will be copied the first time you run `setup.py` into
-`$HOME/.config/bfps/machine_settings.py`, where it will be imported from
-afterwards.
-You may, obviously, edit it afterwards and rerun the build command as
+`$HOME/.config/bfps/machine_settings.py`, **where it will be imported from
+afterwards** --- any future edits **must** be made to the new file.
+You may, obviously, edit it afterwards and rerun the `compile_library` command as
 needed.
 
 .. code:: bash
 
-    python setup.py build
+    python setup.py compile_library
     python setup.py install
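+
+For reference, the settings file mainly lists compiler flags and
+library locations. A minimal sketch follows (the variable names and
+paths below are illustrative --- consult the ``machine_settings_py.py``
+template shipped with the sources for the authoritative list):
+
+.. code:: python
+
+    # illustrative sketch only --- see the machine_settings_py.py
+    # template in the bfps sources for the expected variable names
+    compiler = 'mpicxx'
+    extra_compile_args = ['-O2']
+    include_dirs = ['/opt/hdf5/include', '/opt/fftw/include']
+    library_dirs = ['/opt/hdf5/lib', '/opt/fftw/lib']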
 
 -------------
@@ -99,9 +102,7 @@ Comments
 * particles: initialization of multistep solvers is done with lower
   order methods, so direct convergence tests will fail.
 
-* Code is used mainly with Python 3.4, but Python 2.7
-  compatibility should be kept since mayavi (well, vtk actually) only
-  works on Python 2.
-  Until vtk becomes compatible with Python 3.x, any Python 2.7
-  incompatibilites can be reported as bugs.
+* Code is used mainly with Python 3.4 and 3.5.
+  In principle it should be easy to maintain compatibility with Python
+  2.7.x, but as of `bfps 1.8` this is no longer a main concern.
 
diff --git a/bfps/FluidConvert.py b/bfps/FluidConvert.py
index 14be9b985139fabf3b7e1cda1b5f9ee9618a8307..d924f2a1d5ed411855ca13687aa716fa3aa31dc5 100644
--- a/bfps/FluidConvert.py
+++ b/bfps/FluidConvert.py
@@ -43,7 +43,7 @@ class FluidConvert(_fluid_particle_base):
             work_dir = './',
             simname = 'test',
             fluid_precision = 'single',
-            use_fftw_wisdom = True):
+            use_fftw_wisdom = False):
         _fluid_particle_base.__init__(
                 self,
                 name = name + '-' + fluid_precision,
@@ -98,7 +98,7 @@ class FluidConvert(_fluid_particle_base):
                         nx, ny, nz,
                         dkx, dky, dkz,
                         dealias_type,
-                        FFTW_ESTIMATE);
+                        DEFAULT_FFTW_FLAG);
                 //endcpp
                 """.format(self.C_dtype)
         self.fluid_loop += """
@@ -109,11 +109,13 @@ class FluidConvert(_fluid_particle_base):
                 """
         self.fluid_end += 'delete fs;\n'
         return None
-    def add_parser_arguments(
+    def specific_parser_arguments(
             self,
             parser):
-        _fluid_particle_base.add_parser_arguments(self, parser)
-        self.parameters_to_parser_arguments(parser, parameters = self.spec_parameters)
+        _fluid_particle_base.specific_parser_arguments(self, parser)
+        self.parameters_to_parser_arguments(
+                parser,
+                parameters = self.spec_parameters)
         return None
     def launch(
             self,
@@ -125,13 +127,13 @@ class FluidConvert(_fluid_particle_base):
         self.pars_from_namespace(
                 opt,
                 parameters = self.spec_parameters)
-        self.set_host_info(bfps.host_info)
         self.rewrite_par(
                 group = 'conversion_parameters',
                 parameters = self.spec_parameters)
-        self.run(
-                ncpu = opt.ncpu,
-                err_file = 'err_convert',
-                out_file = 'out_convert')
+        self.run(ncpu = opt.ncpu,
+                 hours = opt.minutes // 60,
+                 minutes = opt.minutes % 60,
+                 err_file = 'err_convert',
+                 out_file = 'out_convert')
         return None
 
diff --git a/bfps/FluidResize.py b/bfps/FluidResize.py
index be0af1fe8228ffd31f42c08b5d0fca45dadbf8b2..fb5e26208f6960d447bc927bd9e207354620d188 100644
--- a/bfps/FluidResize.py
+++ b/bfps/FluidResize.py
@@ -136,6 +136,8 @@ class FluidResize(_fluid_particle_base):
         for k in ['dst_nx', 'dst_ny', 'dst_nz']:
             if type(cmd_line_pars[k]) == type(None):
                 cmd_line_pars[k] = opt.m
+        # the three dst_n{x,y,z} values have already been updated in
+        # opt itself at this point; I'm not sure this code is future-proof...
         self.parameters['niter_todo'] = 0
         self.pars_from_namespace(opt)
         src_file = os.path.join(
@@ -144,10 +146,11 @@ class FluidResize(_fluid_particle_base):
         read_file = os.path.join(
                 self.work_dir,
                 opt.src_simname + '_cvorticity_i{0:0>5x}'.format(opt.src_iteration))
-        self.set_host_info(bfps.host_info)
         self.write_par(iter0 = opt.src_iteration)
         if not os.path.exists(read_file):
             os.symlink(src_file, read_file)
-        self.run(ncpu = opt.ncpu)
+        self.run(ncpu = opt.ncpu,
+                 hours = opt.minutes // 60,
+                 minutes = opt.minutes % 60)
         return None
 
diff --git a/bfps/NSVorticityEquation.py b/bfps/NSVorticityEquation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f67ba6aee16e93d1c4a8a9a710c3136eae678398
--- /dev/null
+++ b/bfps/NSVorticityEquation.py
@@ -0,0 +1,849 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+import sys
+import os
+import numpy as np
+import h5py
+import argparse
+
+import bfps
+import bfps.tools
+from bfps._code import _code
+from bfps._fluid_base import _fluid_particle_base
+
+class NSVorticityEquation(_fluid_particle_base):
+    def __init__(
+            self,
+            name = 'NSVE-v' + bfps.__version__,
+            work_dir = './',
+            simname = 'test',
+            fluid_precision = 'single',
+            fftw_plan_rigor = 'FFTW_MEASURE',
+            use_fftw_wisdom = True):
+        """
+            This code uses checkpoints for DNS restarts, and it can be stopped
+            by creating the file "stop_<simname>" in the working directory.
+            For postprocessing of field snapshots, consider creating a separate
+            HDF5 file (from the python wrapper) which contains links to all the
+            different snapshots.
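+            For example, with ``simname = 'test'``, running
+            ``touch stop_test`` in the working directory makes the code
+            exit cleanly at the next statistics step.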
+        """
+        self.fftw_plan_rigor = fftw_plan_rigor
+        _fluid_particle_base.__init__(
+                self,
+                name = name + '-' + fluid_precision,
+                work_dir = work_dir,
+                simname = simname,
+                dtype = fluid_precision,
+                use_fftw_wisdom = use_fftw_wisdom)
+        self.parameters['nu'] = float(0.1)
+        self.parameters['fmode'] = 1
+        self.parameters['famplitude'] = float(0.5)
+        self.parameters['fk0'] = float(2.0)
+        self.parameters['fk1'] = float(4.0)
+        self.parameters['forcing_type'] = 'linear'
+        self.parameters['histogram_bins'] = int(256)
+        self.parameters['max_velocity_estimate'] = float(1)
+        self.parameters['max_vorticity_estimate'] = float(1)
+        self.parameters['checkpoints_per_file'] = int(1)
+        self.file_datasets_grow = """
+                //begincpp
+                hid_t group;
+                group = H5Gopen(stat_file, "/statistics", H5P_DEFAULT);
+                H5Ovisit(group, H5_INDEX_NAME, H5_ITER_NATIVE, grow_statistics_dataset, NULL);
+                H5Gclose(group);
+                //endcpp
+                """
+        self.style = {}
+        self.statistics = {}
+        self.fluid_output = """
+                fs->io_checkpoint(false);
+                """
+        # vorticity_equation specific things
+        self.includes += '#include "vorticity_equation.hpp"\n'
+        self.store_kspace = """
+                //begincpp
+                if (myrank == 0 && iteration == 0)
+                {
+                    TIMEZONE("fluid_base::store_kspace");
+                    hsize_t dims[4];
+                    hid_t space, dset;
+                    // store kspace information
+                    dset = H5Dopen(stat_file, "/kspace/kshell", H5P_DEFAULT);
+                    space = H5Dget_space(dset);
+                    H5Sget_simple_extent_dims(space, dims, NULL);
+                    H5Sclose(space);
+                    if (fs->kk->nshells != dims[0])
+                    {
+                        DEBUG_MSG(
+                            "ERROR: computed nshells %ld not equal to data file nshells %ld\\n",
+                            (long)fs->kk->nshells, (long)dims[0]);
+                    }
+                    H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->kshell.front());
+                    H5Dclose(dset);
+                    dset = H5Dopen(stat_file, "/kspace/nshell", H5P_DEFAULT);
+                    H5Dwrite(dset, H5T_NATIVE_INT64, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->nshell.front());
+                    H5Dclose(dset);
+                    dset = H5Dopen(stat_file, "/kspace/kM", H5P_DEFAULT);
+                    H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->kM);
+                    H5Dclose(dset);
+                    dset = H5Dopen(stat_file, "/kspace/dk", H5P_DEFAULT);
+                    H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL, H5P_DEFAULT, &fs->kk->dk);
+                    H5Dclose(dset);
+                }
+                //endcpp
+                """
+        return None
+    def add_particles(
+            self,
+            integration_steps = 2,
+            neighbours = 1,
+            smoothness = 1):
+        assert(integration_steps > 0 and integration_steps < 6)
+        self.particle_species = 1
+        self.parameters['tracers0_integration_steps'] = int(integration_steps)
+        self.parameters['tracers0_neighbours'] = int(neighbours)
+        self.parameters['tracers0_smoothness'] = int(smoothness)
+        self.parameters['tracers0_interpolator'] = 'spline'
+        self.particle_includes += """
+                #include "particles/particles_system_builder.hpp"
+                #include "particles/particles_output_hdf5.hpp"
+                """
+        ## initialize
+        self.particle_start += """
+            sprintf(fname, "%s_particles.h5", simname);
+            std::unique_ptr<abstract_particles_system<double>> ps = particles_system_builder(
+                    fs->cvorticity,             // (field object)
+                    fs->kk,                     // (kspace object, contains dkx, dky, dkz)
+                    tracers0_integration_steps, // to check coherence between parameters and HDF5 input file (number of rhs datasets)
+                    nparticles,                 // to check coherence between parameters and HDF5 input file
+                    fname,                      // particles input filename
+                    std::string("/tracers0/state/0"),                 // dataset name for initial state input
+                    std::string("/tracers0/rhs/0"),                   // dataset name for initial rhs input
+                    tracers0_neighbours,        // parameter (number of interpolation neighbours)
+                    tracers0_smoothness,        // parameter
+                    MPI_COMM_WORLD);
+            particles_output_hdf5<double,3,3> particles_output_writer_mpi(MPI_COMM_WORLD, fname, nparticles, tracers0_integration_steps,
+                                                                          "/tracers0/state/", "/tracers0/rhs/");
+                    """
+        self.particle_loop += """
+                fs->compute_velocity(fs->cvorticity);
+                fs->cvelocity->ift();
+                ps->completeLoop(dt);
+                """
+        output_particles = """
+                particles_output_writer_mpi.save(ps->getParticlesPositions(),
+                                                 ps->getParticlesRhs(),
+                                                 ps->getParticlesIndexes(),
+                                                 ps->getLocalNbParticles(),
+                                                 iteration+1);
+                           """
+        self.fluid_output += output_particles
+        self.particle_end += 'ps.release();\n'
+        return None
+    def create_stat_output(
+            self,
+            dset_name,
+            data_buffer,
+            data_type = 'H5T_NATIVE_DOUBLE',
+            size_setup = None,
+            close_spaces = True):
+        new_stat_output_txt = 'Cdset = H5Dopen(stat_file, "{0}", H5P_DEFAULT);\n'.format(dset_name)
+        if not type(size_setup) == type(None):
+            new_stat_output_txt += (
+                    size_setup +
+                    'wspace = H5Dget_space(Cdset);\n' +
+                    'ndims = H5Sget_simple_extent_dims(wspace, dims, NULL);\n' +
+                    'mspace = H5Screate_simple(ndims, count, NULL);\n' +
+                    'H5Sselect_hyperslab(wspace, H5S_SELECT_SET, offset, NULL, count, NULL);\n')
+        new_stat_output_txt += ('H5Dwrite(Cdset, {0}, mspace, wspace, H5P_DEFAULT, {1});\n' +
+                                'H5Dclose(Cdset);\n').format(data_type, data_buffer)
+        if close_spaces:
+            new_stat_output_txt += ('H5Sclose(mspace);\n' +
+                                    'H5Sclose(wspace);\n')
+        return new_stat_output_txt
+    def write_fluid_stats(self):
+        self.fluid_includes += '#include <cmath>\n'
+        self.fluid_includes += '#include "fftw_tools.hpp"\n'
+        self.stat_src += """
+                //begincpp
+                hid_t stat_group;
+                if (myrank == 0)
+                    stat_group = H5Gopen(stat_file, "statistics", H5P_DEFAULT);
+                fs->compute_velocity(fs->cvorticity);
+                *tmp_vec_field = fs->cvelocity->get_cdata();
+                tmp_vec_field->compute_stats(
+                    fs->kk,
+                    stat_group,
+                    "velocity",
+                    fs->iteration / niter_stat,
+                    max_velocity_estimate/sqrt(3));
+                //endcpp
+                """
+        self.stat_src += """
+                //begincpp
+                *tmp_vec_field = fs->cvorticity->get_cdata();
+                tmp_vec_field->compute_stats(
+                    fs->kk,
+                    stat_group,
+                    "vorticity",
+                    fs->iteration / niter_stat,
+                    max_vorticity_estimate/sqrt(3));
+                //endcpp
+                """
+        self.stat_src += """
+                //begincpp
+                if (myrank == 0)
+                    H5Gclose(stat_group);
+                if (myrank == 0)
+                {
+                    hid_t Cdset, wspace, mspace;
+                    int ndims;
+                    hsize_t count[4], offset[4], dims[4];
+                    offset[0] = fs->iteration/niter_stat;
+                    offset[1] = 0;
+                    offset[2] = 0;
+                    offset[3] = 0;
+                //endcpp
+                """
+        if self.dtype == np.float32:
+            field_H5T = 'H5T_NATIVE_FLOAT'
+        elif self.dtype == np.float64:
+            field_H5T = 'H5T_NATIVE_DOUBLE'
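+        # the first output call below leaves the HDF5 dataspaces open
+        # (close_spaces = False) so that the second call can reuse
+        # them; the second call closes both when it is done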
+        self.stat_src += self.create_stat_output(
+                '/statistics/xlines/velocity',
+                'fs->rvelocity->get_rdata()',
+                data_type = field_H5T,
+                size_setup = """
+                    count[0] = 1;
+                    count[1] = nx;
+                    count[2] = 3;
+                    """,
+                close_spaces = False)
+        self.stat_src += self.create_stat_output(
+                '/statistics/xlines/vorticity',
+                'fs->rvorticity->get_rdata()',
+                data_type = field_H5T)
+        self.stat_src += '}\n'
+        ## checkpoint
+        self.stat_src += """
+                //begincpp
+                if (myrank == 0)
+                {
+                    std::string fname = (
+                        std::string("stop_") +
+                        std::string(simname));
+                    {
+                        struct stat file_buffer;
+                        stop_code_now = (stat(fname.c_str(), &file_buffer) == 0);
+                    }
+                }
+                MPI_Bcast(&stop_code_now, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
+                //endcpp
+                """
+        return None
+    def fill_up_fluid_code(self):
+        self.fluid_includes += '#include <cstring>\n'
+        self.fluid_variables += (
+                'vorticity_equation<{0}, FFTW> *fs;\n'.format(self.C_dtype) +
+                'field<{0}, FFTW, THREE> *tmp_vec_field;\n'.format(self.C_dtype) +
+                'field<{0}, FFTW, ONE> *tmp_scal_field;\n'.format(self.C_dtype))
+        self.fluid_definitions += """
+                    typedef struct {{
+                        {0} re;
+                        {0} im;
+                    }} tmp_complex_type;
+                    """.format(self.C_dtype)
+        self.write_fluid_stats()
+        if self.dtype == np.float32:
+            field_H5T = 'H5T_NATIVE_FLOAT'
+        elif self.dtype == np.float64:
+            field_H5T = 'H5T_NATIVE_DOUBLE'
+        self.variables += 'int checkpoint;\n'
+        self.variables += 'bool stop_code_now;\n'
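+        # the checkpoint id is stored in the main HDF5 file; only rank
+        # 0 touches the dataset, and on restart the value is broadcast
+        # to all ranks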
+        self.read_checkpoint = """
+                //begincpp
+                if (myrank == 0)
+                {
+                    hid_t dset = H5Dopen(stat_file, "checkpoint", H5P_DEFAULT);
+                    H5Dread(
+                        dset,
+                        H5T_NATIVE_INT,
+                        H5S_ALL,
+                        H5S_ALL,
+                        H5P_DEFAULT,
+                        &checkpoint);
+                    H5Dclose(dset);
+                }
+                MPI_Bcast(&checkpoint, 1, MPI_INT, 0, MPI_COMM_WORLD);
+                fs->checkpoint = checkpoint;
+                //endcpp
+        """
+        self.store_checkpoint = """
+                //begincpp
+                checkpoint = fs->checkpoint;
+                if (myrank == 0)
+                {
+                    hid_t dset = H5Dopen(stat_file, "checkpoint", H5P_DEFAULT);
+                    H5Dwrite(
+                        dset,
+                        H5T_NATIVE_INT,
+                        H5S_ALL,
+                        H5S_ALL,
+                        H5P_DEFAULT,
+                        &checkpoint);
+                    H5Dclose(dset);
+                }
+                //endcpp
+        """
+        self.fluid_start += """
+                //begincpp
+                char fname[512];
+                fs = new vorticity_equation<{0}, FFTW>(
+                        simname,
+                        nx, ny, nz,
+                        dkx, dky, dkz,
+                        {1});
+                tmp_vec_field = new field<{0}, FFTW, THREE>(
+                        nx, ny, nz,
+                        MPI_COMM_WORLD,
+                        {1});
+                tmp_scal_field = new field<{0}, FFTW, ONE>(
+                        nx, ny, nz,
+                        MPI_COMM_WORLD,
+                        {1});
+                fs->checkpoints_per_file = checkpoints_per_file;
+                fs->nu = nu;
+                fs->fmode = fmode;
+                fs->famplitude = famplitude;
+                fs->fk0 = fk0;
+                fs->fk1 = fk1;
+                strncpy(fs->forcing_type, forcing_type, 128);
+                fs->iteration = iteration;
+                {2}
+                fs->cvorticity->real_space_representation = false;
+                fs->io_checkpoint();
+                //endcpp
+                """.format(
+                        self.C_dtype,
+                        self.fftw_plan_rigor,
+                        self.read_checkpoint)
+        self.fluid_start += self.store_kspace
+        self.fluid_start += 'stop_code_now = false;\n'
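+        # main loop: advance one time step; every niter_out iterations
+        # write fields and update the checkpoint id; break out early if
+        # the "stop_<simname>" file was detected during statistics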
+        self.fluid_loop = 'fs->step(dt);\n'
+        self.fluid_loop += ('if (fs->iteration % niter_out == 0)\n{\n' +
+                            self.fluid_output +
+                            self.store_checkpoint +
+                            '\n}\n' +
+                            'if (stop_code_now){\n' +
+                            'iteration = fs->iteration;\n' +
+                            'break;\n}\n')
+        self.fluid_end = ('if (fs->iteration % niter_out != 0)\n{\n' +
+                          self.fluid_output +
+                          self.store_checkpoint +
+                          'DEBUG_MSG("checkpoint value is %d\\n", checkpoint);\n' +
+                          '\n}\n' +
+                          'delete fs;\n' +
+                          'delete tmp_vec_field;\n' +
+                          'delete tmp_scal_field;\n')
+        return None
+    def get_postprocess_file_name(self):
+        return os.path.join(self.work_dir, self.simname + '_postprocess.h5')
+    def get_postprocess_file(self):
+        return h5py.File(self.get_postprocess_file_name(), 'r')
+    def compute_statistics(self, iter0 = 0, iter1 = None):
+        """Run basic postprocessing on raw data.
+        The energy spectrum :math:`E(t, k)` and the enstrophy spectrum
+        :math:`\\frac{1}{2}\\omega^2(t, k)` are computed from the
+
+        .. math::
+
+            \\sum_{k \\leq \\|\\mathbf{k}\\| \\leq k+dk}\\hat{u_i} \\hat{u_j}^*, \\hskip .5cm
+            \\sum_{k \\leq \\|\\mathbf{k}\\| \\leq k+dk}\\hat{\\omega_i} \\hat{\\omega_j}^*
+
+        tensors, and the enstrophy spectrum is also used to
+        compute the dissipation :math:`\\varepsilon(t)`.
+        These basic quantities are stored in a newly created HDF5 file,
+        ``simname_postprocess.h5``.
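+
+        A minimal postprocessing session (a sketch; ``work_dir`` and
+        ``simname`` must point to an existing simulation) would be::
+
+            c = NSVorticityEquation(
+                    work_dir = '/path/to/data',
+                    simname = 'test')
+            c.compute_statistics()
+            print(c.statistics['Rlambda'])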
+        """
+        if len(list(self.statistics.keys())) > 0:
+            return None
+        self.read_parameters()
+        with self.get_data_file() as data_file:
+            if 'moments' not in data_file['statistics'].keys():
+                return None
+            iter0 = min((data_file['statistics/moments/velocity'].shape[0] *
+                         self.parameters['niter_stat']-1),
+                        iter0)
+            if type(iter1) == type(None):
+                iter1 = data_file['iteration'].value
+            else:
+                iter1 = min(data_file['iteration'].value, iter1)
+            ii0 = iter0 // self.parameters['niter_stat']
+            ii1 = iter1 // self.parameters['niter_stat']
+            self.statistics['kshell'] = data_file['kspace/kshell'].value
+            self.statistics['kM'] = data_file['kspace/kM'].value
+            self.statistics['dk'] = data_file['kspace/dk'].value
+            computation_needed = True
+            pp_file = h5py.File(self.get_postprocess_file_name(), 'a')
+            if 'ii0' in pp_file.keys():
+                computation_needed =  not (ii0 == pp_file['ii0'].value and
+                                           ii1 == pp_file['ii1'].value)
+                if computation_needed:
+                    for k in pp_file.keys():
+                        del pp_file[k]
+            if computation_needed:
+                pp_file['iter0'] = iter0
+                pp_file['iter1'] = iter1
+                pp_file['ii0'] = ii0
+                pp_file['ii1'] = ii1
+                pp_file['t'] = (self.parameters['dt']*
+                                self.parameters['niter_stat']*
+                                (np.arange(ii0, ii1+1).astype(np.float)))
+                pp_file['energy(t, k)'] = (
+                    data_file['statistics/spectra/velocity_velocity'][ii0:ii1+1, :, 0, 0] +
+                    data_file['statistics/spectra/velocity_velocity'][ii0:ii1+1, :, 1, 1] +
+                    data_file['statistics/spectra/velocity_velocity'][ii0:ii1+1, :, 2, 2])/2
+                pp_file['enstrophy(t, k)'] = (
+                    data_file['statistics/spectra/vorticity_vorticity'][ii0:ii1+1, :, 0, 0] +
+                    data_file['statistics/spectra/vorticity_vorticity'][ii0:ii1+1, :, 1, 1] +
+                    data_file['statistics/spectra/vorticity_vorticity'][ii0:ii1+1, :, 2, 2])/2
+                pp_file['vel_max(t)'] = data_file['statistics/moments/velocity'][ii0:ii1+1, 9, 3]
+                pp_file['renergy(t)'] = data_file['statistics/moments/velocity'][ii0:ii1+1, 2, 3]/2
+            for k in ['t',
+                      'energy(t, k)',
+                      'enstrophy(t, k)',
+                      'vel_max(t)',
+                      'renergy(t)']:
+                if k in pp_file.keys():
+                    self.statistics[k] = pp_file[k].value
+            self.compute_time_averages()
+        return None
+    def compute_time_averages(self):
+        """Compute easy stats.
+
+        Further computation of statistics based on the contents of
+        ``simname_postprocess.h5``.
+        Standard quantities are as follows
+        (consistent with [Ishihara]_):
+
+        .. math::
+
+            U_{\\textrm{int}}(t) = \\sqrt{\\frac{2E(t)}{3}}, \\hskip .5cm
+            L_{\\textrm{int}}(t) = \\frac{\\pi}{2U_{\\textrm{int}}^2(t)} \\int \\frac{dk}{k} E(t, k), \\hskip .5cm
+            T_{\\textrm{int}}(t) =
+            \\frac{L_{\\textrm{int}}(t)}{U_{\\textrm{int}}(t)}
+
+            \\eta_K = \\left(\\frac{\\nu^3}{\\varepsilon}\\right)^{1/4}, \\hskip .5cm
+            \\tau_K = \\left(\\frac{\\nu}{\\varepsilon}\\right)^{1/2}, \\hskip .5cm
+            \\lambda = \\sqrt{\\frac{15 \\nu U_{\\textrm{int}}^2}{\\varepsilon}}
+
+            Re = \\frac{U_{\\textrm{int}} L_{\\textrm{int}}}{\\nu}, \\hskip
+            .5cm
+            R_{\\lambda} = \\frac{U_{\\textrm{int}} \\lambda}{\\nu}
+
+        .. [Ishihara] T. Ishihara et al,
+                      *Small-scale statistics in high-resolution direct numerical
+                      simulation of turbulence: Reynolds number dependence of
+                      one-point velocity gradient statistics*.
+                      J. Fluid Mech.,
+                      **592**, 335-366, 2007
+        """
+        for key in ['energy', 'enstrophy']:
+            self.statistics[key + '(t)'] = (self.statistics['dk'] *
+                                            np.sum(self.statistics[key + '(t, k)'], axis = 1))
+        self.statistics['Uint(t)'] = np.sqrt(2*self.statistics['energy(t)'] / 3)
+        self.statistics['Lint(t)'] = ((self.statistics['dk']*np.pi /
+                                       (2*self.statistics['Uint(t)']**2)) *
+                                      np.nansum(self.statistics['energy(t, k)'] /
+                                                self.statistics['kshell'][None, :], axis = 1))
+        for key in ['energy',
+                    'enstrophy',
+                    'vel_max',
+                    'Uint',
+                    'Lint']:
+            if key + '(t)' in self.statistics.keys():
+                self.statistics[key] = np.average(self.statistics[key + '(t)'], axis = 0)
+        for suffix in ['', '(t)']:
+            self.statistics['diss'    + suffix] = (self.parameters['nu'] *
+                                                   self.statistics['enstrophy' + suffix]*2)
+            self.statistics['etaK'    + suffix] = (self.parameters['nu']**3 /
+                                                   self.statistics['diss' + suffix])**.25
+            self.statistics['tauK'    + suffix] =  (self.parameters['nu'] /
+                                                    self.statistics['diss' + suffix])**.5
+            self.statistics['Re' + suffix] = (self.statistics['Uint' + suffix] *
+                                              self.statistics['Lint' + suffix] /
+                                              self.parameters['nu'])
+            self.statistics['lambda' + suffix] = (15 * self.parameters['nu'] *
+                                                  self.statistics['Uint' + suffix]**2 /
+                                                  self.statistics['diss' + suffix])**.5
+            self.statistics['Rlambda' + suffix] = (self.statistics['Uint' + suffix] *
+                                                   self.statistics['lambda' + suffix] /
+                                                   self.parameters['nu'])
+            self.statistics['kMeta' + suffix] = (self.statistics['kM'] *
+                                                 self.statistics['etaK' + suffix])
+            if self.parameters['dealias_type'] == 1:
+                self.statistics['kMeta' + suffix] *= 0.8
+        self.statistics['Tint'] = self.statistics['Lint'] / self.statistics['Uint']
+        self.statistics['Taylor_microscale'] = self.statistics['lambda']
+        return None
+    def set_plt_style(
+            self,
+            style = {'dashes' : (None, None)}):
+        self.style.update(style)
+        return None
+    def convert_complex_from_binary(
+            self,
+            field_name = 'vorticity',
+            iteration = 0,
+            file_name = None):
+        """read the Fourier representation of a vector field.
+
+        Read the binary file containing iteration ``iteration`` of the
+        field ``field_name``, and write it in a ``.h5`` file.
+        """
+        data = np.memmap(
+                os.path.join(self.work_dir,
+                             self.simname + '_{0}_i{1:0>5x}'.format('c' + field_name, iteration)),
+                dtype = self.ctype,
+                mode = 'r',
+                shape = (self.parameters['ny'],
+                         self.parameters['nz'],
+                         self.parameters['nx']//2+1,
+                         3))
+        if type(file_name) == type(None):
+            file_name = self.simname + '_{0}_i{1:0>5x}.h5'.format('c' + field_name, iteration)
+            file_name = os.path.join(self.work_dir, file_name)
+        f = h5py.File(file_name, 'a')
+        f[field_name + '/complex/{0}'.format(iteration)] = data
+        f.close()
+        return None
+    def write_par(
+            self,
+            iter0 = 0,
+            particle_ic = None):
+        _fluid_particle_base.write_par(self, iter0 = iter0)
+        with h5py.File(self.get_data_file_name(), 'r+') as ofile:
+            kspace = self.get_kspace()
+            nshells = kspace['nshell'].shape[0]
+            vec_stat_datasets = ['velocity', 'vorticity']
+            scal_stat_datasets = []
+            for k in vec_stat_datasets:
+                time_chunk = 2**20//(8*3*self.parameters['nx']) # FIXME: use proper size of self.dtype
+                time_chunk = max(time_chunk, 1)
+                ofile.create_dataset('statistics/xlines/' + k,
+                                     (1, self.parameters['nx'], 3),
+                                     chunks = (time_chunk, self.parameters['nx'], 3),
+                                     maxshape = (None, self.parameters['nx'], 3),
+                                     dtype = self.dtype)
+            for k in vec_stat_datasets:
+                time_chunk = 2**20//(8*3*3*nshells)
+                time_chunk = max(time_chunk, 1)
+                ofile.create_dataset('statistics/spectra/' + k + '_' + k,
+                                     (1, nshells, 3, 3),
+                                     chunks = (time_chunk, nshells, 3, 3),
+                                     maxshape = (None, nshells, 3, 3),
+                                     dtype = np.float64)
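+                # aim for ~1 MiB chunks: 8-byte floats, 10 moments,
+                # 4 components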
+                time_chunk = 2**20//(8*4*10)
+                time_chunk = max(time_chunk, 1)
+                a = ofile.create_dataset('statistics/moments/' + k,
+                                     (1, 10, 4),
+                                     chunks = (time_chunk, 10, 4),
+                                     maxshape = (None, 10, 4),
+                                     dtype = np.float64)
+                time_chunk = 2**20//(8*4*self.parameters['histogram_bins'])
+                time_chunk = max(time_chunk, 1)
+                ofile.create_dataset('statistics/histograms/' + k,
+                                     (1,
+                                      self.parameters['histogram_bins'],
+                                      4),
+                                     chunks = (time_chunk,
+                                               self.parameters['histogram_bins'],
+                                               4),
+                                     maxshape = (None,
+                                                 self.parameters['histogram_bins'],
+                                                 4),
+                                     dtype = np.int64)
+            ofile['checkpoint'] = int(0)
+        if self.particle_species == 0:
+            return None
+
+        if type(particle_ic) == type(None):
+            pbase_shape = (self.parameters['nparticles'],)
+            number_of_particles = self.parameters['nparticles']
+        else:
+            pbase_shape = particle_ic.shape[:-1]
+            assert(particle_ic.shape[-1] == 3)
+            number_of_particles = 1
+            for val in pbase_shape[1:]:
+                number_of_particles *= val
+        with h5py.File(self.get_particle_file_name(), 'a') as ofile:
+            s = 0
+            ofile.create_group('tracers{0}'.format(s))
+            ofile.create_group('tracers{0}/rhs'.format(s))
+            ofile.create_group('tracers{0}/state'.format(s))
+            ofile['tracers{0}/rhs'.format(s)].create_dataset(
+                    '0',
+                    shape = (
+                        (self.parameters['tracers{0}_integration_steps'.format(s)],) +
+                        pbase_shape +
+                        (3,)),
+                    dtype = np.float)
+            ofile['tracers{0}/state'.format(s)].create_dataset(
+                    '0',
+                    shape = (
+                        pbase_shape +
+                        (3,)),
+                    dtype = np.float)
+        return None
+    def specific_parser_arguments(
+            self,
+            parser):
+        _fluid_particle_base.specific_parser_arguments(self, parser)
+        parser.add_argument(
+                '--src-wd',
+                type = str,
+                dest = 'src_work_dir',
+                default = '')
+        parser.add_argument(
+                '--src-simname',
+                type = str,
+                dest = 'src_simname',
+                default = '')
+        parser.add_argument(
+                '--src-iteration',
+                type = int,
+                dest = 'src_iteration',
+                default = 0)
+        parser.add_argument(
+               '--njobs',
+               type = int, dest = 'njobs',
+               default = 1)
+        parser.add_argument(
+               '--kMeta',
+               type = float,
+               dest = 'kMeta',
+               default = 2.0)
+        parser.add_argument(
+               '--dtfactor',
+               type = float,
+               dest = 'dtfactor',
+               default = 0.5,
+               help = 'dt is computed as DTFACTOR / N')
+        parser.add_argument(
+               '--particle-rand-seed',
+               type = int,
+               dest = 'particle_rand_seed',
+               default = None)
+        parser.add_argument(
+               '--pclouds',
+               type = int,
+               dest = 'pclouds',
+               default = 1,
+               help = ('number of particle clouds. Particle "clouds" '
+                       'consist of particles distributed according to '
+                       'pcloud-type.'))
+        parser.add_argument(
+                '--pcloud-type',
+                choices = ['random-cube',
+                           'regular-cube'],
+                dest = 'pcloud_type',
+                default = 'random-cube')
+        parser.add_argument(
+               '--particle-cloud-size',
+               type = float,
+               dest = 'particle_cloud_size',
+               default = 2*np.pi)
+        parser.add_argument(
+                '--neighbours',
+                type = int,
+                dest = 'neighbours',
+                default = 1)
+        parser.add_argument(
+                '--smoothness',
+                type = int,
+                dest = 'smoothness',
+                default = 1)
+        return None
+    def prepare_launch(
+            self,
+            args = []):
+        """Set up reasonable parameters.
+
+        With the default Lundgren forcing applied in the band [2, 4],
+        we can estimate the dissipation, therefore we can estimate
+        :math:`k_M \\eta_K` and constrain the viscosity.
+
+        In brief, the command line parameter :math:`k_M \\eta_K` is
+        used in the following formula for :math:`\\nu` (:math:`N` is the
+        number of real space grid points per coordinate):
+
+        .. math::
+
+            \\nu = \\left(\\frac{2 k_M \\eta_K}{N} \\right)^{4/3}
+
+        With this choice, the average dissipation :math:`\\varepsilon`
+        will be close to 0.4, and the integral scale velocity will be
+        close to 0.77, yielding the approximate value for the Taylor
+        microscale and corresponding Reynolds number:
+
+        .. math::
+
+            \\lambda \\approx 4.75\\left(\\frac{2 k_M \\eta_K}{N} \\right)^{4/6}, \\hskip .5in
+            R_\\lambda \\approx 3.7 \\left(\\frac{N}{2 k_M \\eta_K} \\right)^{4/6}
+
+        """
+        opt = _code.prepare_launch(self, args = args)
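+        # choose nu so that kM * etaK matches the command line value
+        # (see the docstring): nu = (2 * kMeta / N)**(4/3)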
+        self.parameters['nu'] = (opt.kMeta * 2 / opt.n)**(4./3)
+        self.parameters['dt'] = (opt.dtfactor / opt.n)
+        # custom famplitude for 288 and 576
+        if opt.n == 288:
+            self.parameters['famplitude'] = 0.45
+        elif opt.n == 576:
+            self.parameters['famplitude'] = 0.47
+        if ((self.parameters['niter_todo'] % self.parameters['niter_out']) != 0):
+            self.parameters['niter_out'] = self.parameters['niter_todo']
+        if len(opt.src_work_dir) == 0:
+            opt.src_work_dir = os.path.realpath(opt.work_dir)
+        self.pars_from_namespace(opt)
+        return opt
+    def launch(
+            self,
+            args = [],
+            **kwargs):
+        opt = self.prepare_launch(args = args)
+        if type(opt.nparticles) != type(None):
+            if opt.nparticles > 0:
+                self.name += '-particles'
+                self.add_particles(
+                    integration_steps = 4,
+                    neighbours = opt.neighbours,
+                    smoothness = opt.smoothness)
+        self.fill_up_fluid_code()
+        self.finalize_code()
+        self.launch_jobs(opt = opt)
+        return None
+    def generate_tracer_state(
+            self,
+            rseed = None,
+            iteration = 0,
+            species = 0,
+            write_to_file = False,
+            ncomponents = 3,
+            testing = False,
+            data = None):
+        if (type(data) == type(None)):
+            if not type(rseed) == type(None):
+                np.random.seed(rseed)
+            #point with problems: 5.37632864e+00,   6.10414710e+00,   6.25256493e+00]
+            data = np.zeros(self.parameters['nparticles']*ncomponents).reshape(-1, ncomponents)
+            data[:, :3] = np.random.random((self.parameters['nparticles'], 3))*2*np.pi
+        if testing:
+            #data[0] = np.array([3.26434, 4.24418, 3.12157])
+            data[:] = np.array([ 0.72086101,  2.59043666,  6.27501953])
+        with h5py.File(self.get_particle_file_name(), 'r+') as data_file:
+            data_file['tracers{0}/state/0'.format(species)][:] = data
+        if write_to_file:
+            data.tofile(
+                    os.path.join(
+                        self.work_dir,
+                        "tracers{0}_state_i{1:0>5x}".format(species, iteration)))
+        return data
+    def launch_jobs(
+            self,
+            opt = None):
+        if not os.path.exists(os.path.join(self.work_dir, self.simname + '.h5')):
+            particle_initial_condition = None
+            if opt.pclouds > 1:
+                np.random.seed(opt.particle_rand_seed)
+                if opt.pcloud_type == 'random-cube':
+                    particle_initial_condition = (
+                        np.random.random((opt.pclouds, 1, 3))*2*np.pi +
+                        np.random.random((1, self.parameters['nparticles'], 3))*opt.particle_cloud_size)
+                elif opt.pcloud_type == 'regular-cube':
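+                    # one random center per cloud, plus a regular grid
+                    # of offsets added along each axis via broadcasting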
+                    onedarray = np.linspace(
+                            -opt.particle_cloud_size/2,
+                            opt.particle_cloud_size/2,
+                            self.parameters['nparticles'])
+                    particle_initial_condition = np.zeros(
+                            (opt.pclouds,
+                             self.parameters['nparticles'],
+                             self.parameters['nparticles'],
+                             self.parameters['nparticles'], 3),
+                            dtype = np.float64)
+                    particle_initial_condition[:] = \
+                        np.random.random((opt.pclouds, 1, 1, 1, 3))*2*np.pi
+                    particle_initial_condition[..., 0] += onedarray[None, None, None, :]
+                    particle_initial_condition[..., 1] += onedarray[None, None, :, None]
+                    particle_initial_condition[..., 2] += onedarray[None, :, None, None]
+            self.write_par(
+                    particle_ic = particle_initial_condition)
+            if self.parameters['nparticles'] > 0:
+                data = self.generate_tracer_state(
+                        species = 0,
+                        rseed = opt.particle_rand_seed,
+                        data = particle_initial_condition)
+                for s in range(1, self.particle_species):
+                    self.generate_tracer_state(species = s, data = data)
+            init_condition_file = os.path.join(
+                    self.work_dir,
+                    self.simname + '_checkpoint_0.h5')
+            if not os.path.exists(init_condition_file):
+                f = h5py.File(init_condition_file, 'w')
+                if len(opt.src_simname) > 0:
+                    source_cp = 0
+                    src_file = 'not_a_file'
+                    while True:
+                        src_file = os.path.join(
+                            os.path.realpath(opt.src_work_dir),
+                            opt.src_simname + '_checkpoint_{0}.h5'.format(source_cp))
+                        f0 = h5py.File(src_file, 'r')
+                        if '{0}'.format(opt.src_iteration) in f0['vorticity/complex'].keys():
+                            f0.close()
+                            break
+                        f0.close()
+                        source_cp += 1
+                    f['vorticity/complex/{0}'.format(0)] = h5py.ExternalLink(
+                            src_file,
+                            'vorticity/complex/{0}'.format(opt.src_iteration))
+                else:
+                    data = self.generate_vector_field(
+                           write_to_file = False,
+                           spectra_slope = 2.0,
+                           amplitude = 0.05)
+                    f['vorticity/complex/{0}'.format(0)] = data
+                f.close()
+        self.run(
+                nb_processes = opt.nb_processes,
+                nb_threads_per_process = opt.nb_threads_per_process,
+                njobs = opt.njobs,
+                hours = opt.minutes // 60,
+                minutes = opt.minutes % 60,
+                no_submit = opt.no_submit)
+        return None
+
+if __name__ == '__main__':
+    pass
+
diff --git a/bfps/NavierStokes.py b/bfps/NavierStokes.py
index af1982a60b0c2f35c3d5d53f81e0ac6a1cb6a94b..7ff89ebb6599264dec802272222471000ec79161 100644
--- a/bfps/NavierStokes.py
+++ b/bfps/NavierStokes.py
@@ -31,6 +31,7 @@ import h5py
 import argparse
 
 import bfps
+import bfps.tools
 from ._code import _code
 from ._fluid_base import _fluid_particle_base
 
@@ -262,20 +263,6 @@ class NavierStokes(_fluid_particle_base):
             field_H5T = 'H5T_NATIVE_FLOAT'
         elif self.dtype == np.float64:
             field_H5T = 'H5T_NATIVE_DOUBLE'
-        self.stat_src += self.create_stat_output(
-                '/statistics/xlines/velocity',
-                'fs->rvelocity',
-                data_type = field_H5T,
-                size_setup = """
-                    count[0] = 1;
-                    count[1] = nx;
-                    count[2] = 3;
-                    """,
-                close_spaces = False)
-        self.stat_src += self.create_stat_output(
-                '/statistics/xlines/vorticity',
-                'fs->rvorticity',
-                data_type = field_H5T)
         if self.QR_stats_on:
             self.stat_src += self.create_stat_output(
                     '/statistics/moments/trS2_Q_R',
@@ -615,7 +602,9 @@ class NavierStokes(_fluid_particle_base):
                 computation_needed =  not (ii0 == pp_file['ii0'].value and
                                            ii1 == pp_file['ii1'].value)
                 if computation_needed:
-                    for k in pp_file.keys():
+                    for k in ['t', 'vel_max(t)', 'renergy(t)',
+                              'energy(t, k)', 'enstrophy(t, k)',
+                              'ii0', 'ii1', 'iter0', 'iter1']:
                         del pp_file[k]
             if computation_needed:
                 pp_file['iter0'] = iter0
@@ -751,12 +740,14 @@ class NavierStokes(_fluid_particle_base):
             vec_stat_datasets = ['velocity', 'vorticity']
             scal_stat_datasets = []
             for k in vec_stat_datasets:
-                time_chunk = 2**20//(8*3*self.parameters['nx']) # FIXME: use proper size of self.dtype
+                time_chunk = 2**20 // (
+                        self.dtype.itemsize*3*
+                        self.parameters['nx']*self.parameters['ny'])
                 time_chunk = max(time_chunk, 1)
-                ofile.create_dataset('statistics/xlines/' + k,
-                                     (1, self.parameters['nx'], 3),
-                                     chunks = (time_chunk, self.parameters['nx'], 3),
-                                     maxshape = (None, self.parameters['nx'], 3),
+                ofile.create_dataset('statistics/0slices/' + k + '/real',
+                                     (1, self.parameters['ny'], self.parameters['nx'], 3),
+                                     chunks = (time_chunk, self.parameters['ny'], self.parameters['nx'], 3),
+                                     maxshape = (None, self.parameters['ny'], self.parameters['nx'], 3),
                                      dtype = self.dtype)
             if self.Lag_acc_stats_on:
                 vec_stat_datasets += ['Lagrangian_acceleration']
@@ -873,33 +864,6 @@ class NavierStokes(_fluid_particle_base):
                                      dtype = np.int64)
         if self.particle_species == 0:
             return None
-        def create_particle_dataset(
-                data_file,
-                dset_name,
-                dset_shape,
-                dset_maxshape,
-                dset_chunks,
-                # maybe something more general can be used here
-                dset_dtype = h5py.h5t.IEEE_F64LE):
-            # create the dataspace.
-            space_id = h5py.h5s.create_simple(
-                    dset_shape,
-                    dset_maxshape)
-            # create the dataset creation property list.
-            dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE)
-            # set the allocation time to "early".
-            dcpl.set_alloc_time(h5py.h5d.ALLOC_TIME_EARLY)
-            dcpl.set_chunk(dset_chunks)
-            # and now create dataset
-            if sys.version_info[0] == 3:
-                dset_name = dset_name.encode()
-            return h5py.h5d.create(
-                    data_file.id,
-                    dset_name,
-                    dset_dtype,
-                    space_id,
-                    dcpl,
-                    h5py.h5p.DEFAULT)
 
         if type(particle_ic) == type(None):
             pbase_shape = (self.parameters['nparticles'],)
@@ -920,30 +884,41 @@ class NavierStokes(_fluid_particle_base):
                          self.parameters['tracers{0}_integration_steps'.format(s)]) +
                         pbase_shape + (3,))
                 maxshape = (h5py.h5s.UNLIMITED,) + dims[1:]
-                chunks = (time_chunk, 1, 1) + dims[3:]
-                create_particle_dataset(
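+                # chunk over time (and over the cloud index, when
+                # pbase_shape has more than one dimension)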
+                if len(pbase_shape) > 1:
+                    chunks = (time_chunk, 1, 1) + dims[3:]
+                else:
+                    chunks = (time_chunk, 1) + dims[2:]
+                bfps.tools.create_alloc_early_dataset(
                         ofile,
                         '/tracers{0}/rhs'.format(s),
                         dims, maxshape, chunks)
-                create_particle_dataset(
+                if len(pbase_shape) > 1:
+                    chunks = (time_chunk, 1) + pbase_shape[1:] + (3,)
+                else:
+                    chunks = (time_chunk, pbase_shape[0], 3)
+                bfps.tools.create_alloc_early_dataset(
                         ofile,
                         '/tracers{0}/state'.format(s),
                         (1,) + pbase_shape + (3,),
                         (h5py.h5s.UNLIMITED,) + pbase_shape + (3,),
-                        (time_chunk, 1) + pbase_shape[1:] + (3,))
-                create_particle_dataset(
+                        chunks)
+                # "velocity" is sampled, single precision is enough
+                # for the results we are interested in.
+                bfps.tools.create_alloc_early_dataset(
                         ofile,
                         '/tracers{0}/velocity'.format(s),
                         (1,) + pbase_shape + (3,),
                         (h5py.h5s.UNLIMITED,) + pbase_shape + (3,),
-                        (time_chunk, 1) + pbase_shape[1:] + (3,))
+                        chunks,
+                        dset_dtype = h5py.h5t.IEEE_F32LE)
                 if self.parameters['tracers{0}_acc_on'.format(s)]:
-                    create_particle_dataset(
+                    bfps.tools.create_alloc_early_dataset(
                             ofile,
                             '/tracers{0}/acceleration'.format(s),
                             (1,) + pbase_shape + (3,),
                             (h5py.h5s.UNLIMITED,) + pbase_shape + (3,),
-                            (time_chunk, 1) + pbase_shape[1:] + (3,))
+                            chunks,
+                            dset_dtype = h5py.h5t.IEEE_F32LE)
         return None
     def add_particle_fields(
             self,
@@ -1058,6 +1033,16 @@ class NavierStokes(_fluid_particle_base):
                type = float,
                dest = 'particle_cloud_size',
                default = 2*np.pi)
+        parser.add_argument(
+                '--neighbours',
+                type = int,
+                dest = 'neighbours',
+                default = 1)
+        parser.add_argument(
+                '--smoothness',
+                type = int,
+                dest = 'smoothness',
+                default = 1)
         return None
     def prepare_launch(
             self,
@@ -1128,12 +1113,13 @@ class NavierStokes(_fluid_particle_base):
             opt.nparticles = 0
         elif type(opt.nparticles) == int:
             if opt.nparticles > 0:
+                self.name += '-particles'
                 self.add_3D_rFFTW_field(
                         name = 'rFFTW_acc')
                 self.add_interpolator(
                         name = 'cubic_spline',
-                        neighbours = 1,
-                        smoothness = 1,
+                        neighbours = opt.neighbours,
+                        smoothness = opt.smoothness,
                         class_name = 'rFFTW_interpolator')
                 self.add_particles(
                         integration_steps = [4],
@@ -1193,8 +1179,12 @@ class NavierStokes(_fluid_particle_base):
                            write_to_file = True,
                            spectra_slope = 2.0,
                            amplitude = 0.05)
-        self.run(
-                ncpu = opt.ncpu,
-                njobs = opt.njobs)
+        self.run(
+                nb_processes = opt.nb_processes,
+                nb_threads_per_process = opt.nb_threads_per_process,
+                njobs = opt.njobs,
+                hours = opt.minutes // 60,
+                minutes = opt.minutes % 60,
+                no_submit = opt.no_submit)
         return None
 
diff --git a/bfps/__init__.py b/bfps/__init__.py
index 4a90f95268cffe3b0c2e1d68d7f4763a4c142e84..09663e1da56539eb51d257032444b38ba7096bc9 100644
--- a/bfps/__init__.py
+++ b/bfps/__init__.py
@@ -49,4 +49,5 @@ from host_information import host_info
 from .FluidConvert import FluidConvert
 from .FluidResize import FluidResize
 from .NavierStokes import NavierStokes
+from .NSVorticityEquation import NSVorticityEquation
 
diff --git a/bfps/__main__.py b/bfps/__main__.py
index a26d84d0e918cebe1a9351ca20b5249418d6a3c6..9db5e350340e67dfe99c5a40e3027b489399a42e 100644
--- a/bfps/__main__.py
+++ b/bfps/__main__.py
@@ -29,6 +29,7 @@ import argparse
 
 import bfps
 from .NavierStokes import NavierStokes
+from .NSVorticityEquation import NSVorticityEquation
 from .FluidResize import FluidResize
 from .FluidConvert import FluidConvert
 from .NSManyParticles import NSManyParticles
@@ -45,6 +46,12 @@ def main():
                  'NS',
                  'NS-single',
                  'NS-double']
+    NSVEoptions = ['NSVorticityEquation',
+                 'NSVorticityEquation-single',
+                 'NSVorticityEquation-double',
+                 'NSVE',
+                 'NSVE-single',
+                 'NSVE-double']
     FRoptions = ['FluidResize',
                  'FluidResize-single',
                  'FluidResize-double',
@@ -57,7 +64,7 @@ def main():
                'NSManyParticles-double']
     parser.add_argument(
             'base_class',
-            choices = NSoptions + FRoptions + FCoptions + NSMPopt,
+            choices = NSoptions + NSVEoptions + FRoptions + FCoptions + NSMPopt,
             type = str)
     # first option is the choice of base class or -h or -v
     # all other options are passed on to the base_class instance
@@ -70,6 +77,8 @@ def main():
         precision = 'single'
     if opt.base_class in NSoptions:
         base_class = NavierStokes
+    elif opt.base_class in NSVEoptions:
+        base_class = NSVorticityEquation
     elif opt.base_class in FRoptions:
         base_class = FluidResize
     elif opt.base_class in FCoptions:
diff --git a/bfps/_base.py b/bfps/_base.py
index 2204fe666402eeccc4d815b6381d6b5060a0e7ac..1a112baa3775842f013640596768ad0597eaa187 100644
--- a/bfps/_base.py
+++ b/bfps/_base.py
@@ -94,11 +94,10 @@ class _base(object):
             elif type(parameters[key[i]]) == str:
                 src_txt += ('space = H5Dget_space(dset);\n' +
                             'memtype = H5Dget_type(dset);\n' +
-                            'H5Sget_simple_extent_dims(space, dims, NULL);\n' +
-                            'string_data = (char*)malloc(dims[0]*sizeof(char));\n' +
+                            'string_data = (char*)malloc(256);\n' +
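+                            # note: the fixed-size buffer assumes parameter
+                            # strings are shorter than 256 bytes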
                             'H5Dread(dset, memtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &string_data);\n' +
                             'sprintf({0}, "%s", string_data);\n'.format(key[i]) +
                             'free(string_data);\n' +
                             'H5Sclose(space);\n' +
                             'H5Tclose(memtype);\n')
             elif type(parameters[key[i]]) == np.ndarray:
@@ -123,7 +122,7 @@ class _base(object):
             elif type(self.parameters[key[i]]) == str:
                 src_txt += 'DEBUG_MSG("'+ key[i] + ' = %s\\n", ' + key[i] + ');\n'
             elif type(self.parameters[key[i]]) == np.ndarray:
-                src_txt += ('for (int array_counter=0; array_counter<' +
+                src_txt += ('for (unsigned int array_counter=0; array_counter<' +
                             key[i] +
                             '.size(); array_counter++)\n' +
                             '{\n' +
@@ -250,8 +249,27 @@ class _base(object):
                help = 'code is run by default in a grid of NxNxN')
         parser.add_argument(
                 '--ncpu',
-                type = int, dest = 'ncpu',
-                default = 2)
+                type = int,
+                dest = 'ncpu',
+                default = -1)
+        parser.add_argument(
+                '--np', '--nprocesses',
+                metavar = 'NPROCESSES',
+                help = 'number of mpi processes to use',
+                type = int,
+                dest = 'nb_processes',
+                default = 4)
+        parser.add_argument(
+                '--ntpp', '--nthreads-per-process',
+                type = int,
+                dest = 'nb_threads_per_process',
+                metavar = 'NTHREADS_PER_PROCESS',
+                help = 'number of threads to use per MPI process',
+                default = 1)
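+        # e.g. "--np 4 --ntpp 2" launches 4 MPI processes with 2 OpenMP threads each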
+        parser.add_argument(
+                '--no-submit',
+                action = 'store_true',
+                dest = 'no_submit')
         parser.add_argument(
                 '--simname',
                 type = str, dest = 'simname',
@@ -265,6 +283,13 @@ class _base(object):
                 '--wd',
                 type = str, dest = 'work_dir',
                 default = './')
+        parser.add_argument(
+                '--minutes',
+                type = int,
+                dest = 'minutes',
+                default = 5,
+                help = 'If the environment supports it, this is the requested wall-clock limit.')
+
         return None
     def parameters_to_parser_arguments(
             self,
diff --git a/bfps/_code.py b/bfps/_code.py
index 314681ada3bb81e5700fdb7f1307c9af96fa5011..faf151559f25078ffbb214659a81f0a2f418b177 100644
--- a/bfps/_code.py
+++ b/bfps/_code.py
@@ -32,6 +32,7 @@ import argparse
 import h5py
 from datetime import datetime
 import math
+import warnings
 
 import bfps
 from ._base import _base
@@ -45,19 +46,25 @@ class _code(_base):
             work_dir = './',
             simname = 'test'):
         _base.__init__(self, work_dir = work_dir, simname = simname)
-        self.version_message = ('/***********************************************************************\n' +
-                                '* this code automatically generated by bfps\n' +
-                                '* version {0}\n'.format(bfps.__version__) +
-                                '***********************************************************************/\n\n\n')
+        self.version_message = (
+                '/***********************************************************************\n' +
+                '* this code automatically generated by bfps\n' +
+                '* version {0}\n'.format(bfps.__version__) +
+                '***********************************************************************/\n\n\n')
         self.includes = """
                 //begincpp
                 #include "base.hpp"
                 #include "fluid_solver.hpp"
+                #include "scope_timer.hpp"
+                #include "fftw_interface.hpp"
                 #include <iostream>
                 #include <hdf5.h>
                 #include <string>
                 #include <cstring>
                 #include <fftw3-mpi.h>
+                #include <omp.h>
+                #include <fenv.h>
+                #include <cstdlib>
                 //endcpp
                 """
         self.variables = 'int myrank, nprocs;\n'
@@ -69,23 +76,58 @@ class _code(_base):
                 //begincpp
                 int main(int argc, char *argv[])
                 {
-                    MPI_Init(&argc, &argv);
-                    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-                    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-                    fftw_mpi_init();
-                    fftwf_mpi_init();
+                    if(getenv("BFPS_FPE_OFF") == nullptr || getenv("BFPS_FPE_OFF") != std::string("TRUE")){
+                        feenableexcept(FE_INVALID | FE_OVERFLOW);
+                    }
+                    else{
+                        std::cout << "FPE have been turned OFF" << std::endl;
+                    }
+                #ifdef NO_FFTWOMP
+                    MPI_Init(&argc, &argv);
+                    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+                    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+                    fftw_mpi_init();
+                    fftwf_mpi_init();
+                    DEBUG_MSG("There are %d processes\\n", nprocs);
+                #else
+                    int mpiprovided;
+                    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &mpiprovided);
+                    assert(mpiprovided >= MPI_THREAD_FUNNELED);
+                    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+                    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+                    const int nbThreads = omp_get_max_threads();
+                    DEBUG_MSG("Number of threads for the FFTW = %d\\n", nbThreads);
+                    if (nbThreads > 1){
+                        fftw_init_threads();
+                        fftwf_init_threads();
+                    }
+                    fftw_mpi_init();
+                    fftwf_mpi_init();
+                    DEBUG_MSG("There are %d processes and %d threads\\n", nprocs, nbThreads);
+                    if (nbThreads > 1){
+                        fftw_plan_with_nthreads(nbThreads);
+                        fftwf_plan_with_nthreads(nbThreads);
+                    }
+                #endif
+                    // MPI must be initialized before the early exit below,
+                    // which calls MPI_Finalize().
                     if (argc != 2)
                     {
                         std::cerr << "Wrong number of command line arguments. Stopping." << std::endl;
                         MPI_Finalize();
                         return EXIT_SUCCESS;
                     }
                     strcpy(simname, argv[1]);
                     sprintf(fname, "%s.h5", simname);
                     parameter_file = H5Fopen(fname, H5F_ACC_RDONLY, H5P_DEFAULT);
                     Cdset = H5Dopen(parameter_file, "iteration", H5P_DEFAULT);
-                    H5Dread(Cdset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &iteration);
-                    DEBUG_MSG("simname is %s and iteration is %d\\n", simname, iteration);
+                    H5Dread(
+                        Cdset,
+                        H5T_NATIVE_INT,
+                        H5S_ALL,
+                        H5S_ALL,
+                        H5P_DEFAULT,
+                        &iteration);
+                    DEBUG_MSG("simname is %s and iteration is %d\\n",
+                              simname, iteration);
                     H5Dclose(Cdset);
                     H5Fclose(parameter_file);
                     read_parameters();
@@ -97,12 +139,16 @@ class _code(_base):
                         DEBUG_MSG("when setting stat_file cache I got %d\\n", cache_err);
                         stat_file = H5Fopen(fname, H5F_ACC_RDWR, fapl);
                     }
+                    {
+                        TIMEZONE("code::main_start");
                 //endcpp
                 """
         for ostream in ['cout', 'cerr']:
-            self.main_start += 'if (myrank == 0) std::{1} << "{0}" << std::endl;'.format(self.version_message, ostream).replace('\n', '\\n') + '\n'
+            self.main_start += 'if (myrank == 0) std::{1} << "{0}" << std::endl;'.format(
+                    self.version_message, ostream).replace('\n', '\\n') + '\n'
         self.main_end = """
                 //begincpp
+                    }
                     // clean up
                     if (myrank == 0)
                     {
@@ -113,6 +159,17 @@ class _code(_base):
                     }
                     fftwf_mpi_cleanup();
                     fftw_mpi_cleanup();
+                #ifndef NO_FFTWOMP
+                    if (nbThreads > 1){
+                        fftw_cleanup_threads();
+                        fftwf_cleanup_threads();
+                    }
+                #endif
+                    #ifdef USE_TIMINGOUTPUT
+                    global_timer_manager.show(MPI_COMM_WORLD);
+                    global_timer_manager.showMpi(MPI_COMM_WORLD);
+                    global_timer_manager.showHtml(MPI_COMM_WORLD);
+                    #endif
                     MPI_Finalize();
                     return EXIT_SUCCESS;
                 }
@@ -147,15 +204,21 @@ class _code(_base):
         libraries = ['bfps']
         libraries += bfps.install_info['libraries']
 
-        command_strings = ['g++']
+        command_strings = [bfps.install_info['compiler']]
         command_strings += [self.name + '.cpp', '-o', self.name]
         command_strings += bfps.install_info['extra_compile_args']
         command_strings += ['-I' + idir for idir in bfps.install_info['include_dirs']]
         command_strings.append('-I' + bfps.header_dir)
         command_strings += ['-L' + ldir for ldir in bfps.install_info['library_dirs']]
+        command_strings += ['-Wl,-rpath=' + ldir for ldir in bfps.install_info['library_dirs']]
         command_strings.append('-L' + bfps.lib_dir)
+        command_strings.append('-Wl,-rpath=' + bfps.lib_dir)
+
         for libname in libraries:
             command_strings += ['-l' + libname]
+
+        command_strings += ['-fopenmp']
+
         self.write_src()
         print('compiling code with command\n' + ' '.join(command_strings))
         return subprocess.call(command_strings)
@@ -165,12 +228,14 @@ class _code(_base):
         self.host_info.update(host_info)
         return None
     def run(self,
-            ncpu = 2,
+            nb_processes,
+            nb_threads_per_process,
             out_file = 'out_file',
             err_file = 'err_file',
-            hours = 1,
-            minutes = 0,
-            njobs = 1):
+            hours = 0,
+            minutes = 10,
+            njobs = 1,
+            no_submit = False):
         self.read_parameters()
         with h5py.File(os.path.join(self.work_dir, self.simname + '.h5'), 'r') as data_file:
             iter0 = data_file['iteration'].value
@@ -190,7 +255,9 @@ class _code(_base):
         os.chdir(current_dir)
         command_atoms = ['mpirun',
                          '-np',
-                         '{0}'.format(ncpu),
+                         '{0}'.format(nb_processes),
+                         '-x',
+                         'OMP_NUM_THREADS={0}'.format(nb_threads_per_process),
                          './' + self.name,
                          self.simname]
         if self.host_info['type'] == 'cluster':
@@ -200,9 +267,9 @@ class _code(_base):
                 qsub_script_name = 'run_' + suffix + '.sh'
                 self.write_sge_file(
                     file_name     = os.path.join(self.work_dir, qsub_script_name),
-                    nprocesses    = ncpu,
+                    nprocesses    = nb_processes*nb_threads_per_process,
                     name_of_run   = suffix,
-                    command_atoms = command_atoms[3:],
+                    command_atoms = command_atoms[5:],
                     hours         = hours,
                     minutes       = minutes,
                     out_file      = out_file + '_' + suffix,
@@ -214,6 +281,65 @@ class _code(_base):
                 subprocess.call(qsub_atoms + [qsub_script_name])
                 os.chdir(current_dir)
                 job_name_list.append(suffix)
+        elif self.host_info['type'] == 'SLURM':
+            job_id_list = []
+            for j in range(njobs):
+                suffix = self.simname + '_{0}'.format(iter0 + j*self.parameters['niter_todo'])
+                qsub_script_name = 'run_' + suffix + '.sh'
+                self.write_slurm_file(
+                    file_name     = os.path.join(self.work_dir, qsub_script_name),
+                    name_of_run   = suffix,
+                    command_atoms = command_atoms[5:],
+                    hours         = hours,
+                    minutes       = minutes,
+                    out_file      = out_file + '_' + suffix,
+                    err_file      = err_file + '_' + suffix,
+                    nb_mpi_processes = nb_processes,
+                    nb_threads_per_process = nb_threads_per_process)
+                os.chdir(self.work_dir)
+                qsub_atoms = ['sbatch']
+
+                if not no_submit:
+                    if len(job_id_list) >= 1:
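+                        # chain this job after the previously submitted one,
+                        # so restarts run in order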
+                        qsub_atoms += ['--dependency=afterok:{0}'.format(job_id_list[-1])]
+                    p = subprocess.Popen(
+                        qsub_atoms + [qsub_script_name],
+                        stdout = subprocess.PIPE)
+                    out, err = p.communicate()
+                    p.terminate()
+                    job_id_list.append(int(out.split()[-1]))
+                os.chdir(current_dir)
+        elif self.host_info['type'] == 'IBMLoadLeveler':
+            suffix = self.simname + '_{0}'.format(iter0)
+            job_script_name = 'run_' + suffix + '.sh'
+            if (njobs == 1):
+                self.write_IBMLoadLeveler_file_single_job(
+                    file_name     = os.path.join(self.work_dir, job_script_name),
+                    name_of_run   = suffix,
+                    command_atoms = command_atoms[5:],
+                    hours         = hours,
+                    minutes       = minutes,
+                    out_file      = out_file + '_' + suffix,
+                    err_file      = err_file + '_' + suffix,
+                    nb_mpi_processes = nb_processes,
+                    nb_threads_per_process = nb_threads_per_process)
+            else:
+                self.write_IBMLoadLeveler_file_many_job(
+                    file_name     = os.path.join(self.work_dir, job_script_name),
+                    name_of_run   = suffix,
+                    command_atoms = command_atoms[5:],
+                    hours         = hours,
+                    minutes       = minutes,
+                    out_file      = out_file + '_' + suffix,
+                    err_file      = err_file + '_' + suffix,
+                    njobs = njobs,
+                    nb_mpi_processes = nb_processes,
+                    nb_threads_per_process = nb_threads_per_process)
+            submit_atoms = ['llsubmit']
+
+            if not no_submit:
+                subprocess.call(submit_atoms + [os.path.join(self.work_dir, job_script_name)])
+
         elif self.host_info['type'] == 'pc':
             os.chdir(self.work_dir)
             os.environ['LD_LIBRARY_PATH'] += ':{0}'.format(bfps.lib_dir)
@@ -226,6 +352,195 @@ class _code(_base):
                                 stderr = open(err_file + '_' + suffix, 'w'))
             os.chdir(current_dir)
         return None
+    def write_IBMLoadLeveler_file_single_job(
+            self,
+            file_name = None,
+            nprocesses = None,
+            name_of_run = None,
+            command_atoms = [],
+            hours = None,
+            minutes = None,
+            out_file = None,
+            err_file = None,
+            nb_mpi_processes = None,
+            nb_threads_per_process = None):
+
+        script_file = open(file_name, 'w')
+        script_file.write('# @ shell=/bin/bash\n')
+        # error file
+        if type(err_file) == type(None):
+            err_file = 'err.job.$(jobid)'
+        script_file.write('# @ error = ' + os.path.join(self.work_dir, err_file) + '\n')
+        # output file
+        if type(out_file) == type(None):
+            out_file = 'out.job.$(jobid)'
+        script_file.write('# @ output = ' + os.path.join(self.work_dir, out_file) + '\n')
+
+        # if IBM MPI is used, this should be: script_file.write('# @ job_type = parallel\n')
+        script_file.write('# @ job_type = MPICH\n')
+
+        script_file.write('# @ node_usage = not_shared\n')
+        script_file.write('# @ notification = complete\n')
+        script_file.write('# @ notify_user = $(user)@rzg.mpg.de\n')
+
+        nb_cpus_per_node = self.host_info['deltanprocs']
+        assert isinstance(nb_cpus_per_node, int) and nb_cpus_per_node >= 1, \
+                'nb_cpus_per_node is {}'.format(nb_cpus_per_node)
+
+        # No more threads than the number of cores
+        assert nb_threads_per_process <= nb_cpus_per_node, \
+               "Cannot use more threads ({} asked) than the number of cores ({})".format(
+                   nb_threads_per_process, nb_cpus_per_node)
+        # Warn if some cores will not be used
+        if nb_cpus_per_node%nb_threads_per_process != 0:
+            warnings.warn("nb_threads_per_process does not divide the number of cores per node (some cores will be idle)",
+                    UserWarning)
+
+        nb_cpus = nb_mpi_processes*nb_threads_per_process
+        if (nb_cpus < nb_cpus_per_node):
+            # in case we use only a few processes on a single node
+            nb_nodes = 1
+            nb_processes_per_node = nb_mpi_processes
+            first_node_tasks = nb_mpi_processes
+        else:
+            nb_nodes = int((nb_cpus+nb_cpus_per_node-1) // nb_cpus_per_node)
+            # with more than one node, we require a multiple of deltanprocs
+            nb_processes_per_node = int(nb_cpus_per_node // nb_threads_per_process)
+            first_node_tasks = int(nb_mpi_processes - (nb_nodes-1)*nb_processes_per_node)
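+            # e.g. with deltanprocs = 16 cores/node, 2 threads per process and
+            # 12 MPI processes: nb_cpus = 24, nb_nodes = 2,
+            # nb_processes_per_node = 8, first_node_tasks = 4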
+
+        script_file.write('# @ resources = ConsumableCpus({})\n'.format(nb_threads_per_process))
+        script_file.write('# @ network.MPI = sn_all,not_shared,us\n')
+        script_file.write('# @ wall_clock_limit = {0}:{1:0>2d}:00\n'.format(hours, minutes))
+        assert(type(self.host_info['environment']) != type(None))
+        script_file.write('# @ node = {0}\n'.format(nb_nodes))
+        script_file.write('# @ tasks_per_node = {0}\n'.format(nb_processes_per_node))
+        if (first_node_tasks > 0):
+            script_file.write('# @ first_node_tasks = {0}\n'.format(first_node_tasks))
+        script_file.write('# @ queue\n')
+
+        script_file.write('source ~/.config/bfps/bashrc\n')
+        script_file.write('module li\n')
+        script_file.write('export OMP_NUM_THREADS={}\n'.format(nb_threads_per_process))
+
+        script_file.write('LD_LIBRARY_PATH=' +
+                          ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                          ':${LD_LIBRARY_PATH}\n')
+        script_file.write('echo "Start time is `date`"\n')
+        script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
+        script_file.write('cd ' + self.work_dir + '\n')
+
+        script_file.write('export KMP_AFFINITY=compact,verbose\n')
+        script_file.write('export I_MPI_PIN_DOMAIN=omp\n')
+        script_file.write('mpiexec.hydra '
+            + ' -np {} '.format(nb_mpi_processes)
+            + ' -ppn {} '.format(nb_processes_per_node)
+            + ' -ordered-output -prepend-rank '
+            + os.path.join(
+                self.work_dir,
+                command_atoms[0]) +
+            ' ' +
+            ' '.join(command_atoms[1:]) +
+            '\n')
+
+        script_file.write('echo "End time is `date`"\n')
+        script_file.write('exit 0\n')
+        script_file.close()
+        return None
+    def write_IBMLoadLeveler_file_many_job(
+            self,
+            file_name = None,
+            nprocesses = None,
+            name_of_run = None,
+            command_atoms = [],
+            hours = None,
+            minutes = None,
+            out_file = None,
+            err_file = None,
+            njobs = 2,
+            nb_mpi_processes = None,
+            nb_threads_per_process = None):
+        assert(type(self.host_info['environment']) != type(None))
+        script_file = open(file_name, 'w')
+        script_file.write('# @ shell=/bin/bash\n')
+        # error file
+        if type(err_file) == type(None):
+            err_file = 'err.job.$(jobid).$(stepid)'
+        script_file.write('# @ error = ' + os.path.join(self.work_dir, err_file) + '\n')
+        # output file
+        if type(out_file) == type(None):
+            out_file = 'out.job.$(jobid).$(stepid)'
+        script_file.write('# @ output = ' + os.path.join(self.work_dir, out_file) + '\n')
+        # if IBM MPI is used, this should be: script_file.write('# @ job_type = parallel\n')
+        script_file.write('# @ job_type = MPICH\n')
+        script_file.write('# @ node_usage = not_shared\n')
+        script_file.write('#\n')
+
+        nb_cpus_per_node = self.host_info['deltanprocs']
+        assert isinstance(nb_cpus_per_node, int) and nb_cpus_per_node >= 1, \
+                'nb_cpus_per_node is {}'.format(nb_cpus_per_node)
+
+        # No more threads than the number of cores
+        assert nb_threads_per_process <= nb_cpus_per_node, \
+               "Cannot use more threads ({} asked) than the number of cores ({})".format(
+                   nb_threads_per_process, nb_cpus_per_node)
+        # Warn if some cores will not be used
+        if nb_cpus_per_node%nb_threads_per_process != 0:
+            warnings.warn("nb_threads_per_process does not divide the number of cores per node (some cores will be idle)",
+                    UserWarning)
+
+        nb_cpus = nb_mpi_processes*nb_threads_per_process
+        if (nb_cpus < nb_cpus_per_node):
+            # in case we use only a few processes on a single node
+            nb_nodes = 1
+            nb_processes_per_node = nb_mpi_processes
+            first_node_tasks = nb_mpi_processes
+        else:
+            nb_nodes = int((nb_cpus+nb_cpus_per_node-1) // nb_cpus_per_node)
+            # with more than one node, we require a multiple of deltanprocs
+            nb_processes_per_node = int(nb_cpus_per_node // nb_threads_per_process)
+            first_node_tasks = int(nb_mpi_processes - (nb_nodes-1)*nb_processes_per_node)
+
+        for job in range(njobs):
+            script_file.write('# @ step_name = {0}.$(stepid)\n'.format(self.simname))
+            script_file.write('# @ resources = ConsumableCpus({})\n'.format(nb_threads_per_process))
+            script_file.write('# @ network.MPI = sn_all,not_shared,us\n')
+            script_file.write('# @ wall_clock_limit = {0}:{1:0>2d}:00\n'.format(hours, minutes))
+            assert(type(self.host_info['environment']) != type(None))
+            script_file.write('# @ node = {0}\n'.format(nb_nodes))
+            script_file.write('# @ tasks_per_node = {0}\n'.format(nb_processes_per_node))
+            if (first_node_tasks > 0):
+                script_file.write('# @ first_node_tasks = {0}\n'.format(first_node_tasks))
+            script_file.write('# @ queue\n')
+
+        script_file.write('source ~/.config/bfps/bashrc\n')
+        script_file.write('module li\n')
+        script_file.write('export OMP_NUM_THREADS={}\n'.format(nb_threads_per_process))
+
+        script_file.write('LD_LIBRARY_PATH=' +
+                          ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                          ':${LD_LIBRARY_PATH}\n')
+        script_file.write('echo "Start time is `date`"\n')
+        script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
+        script_file.write('cd ' + self.work_dir + '\n')
+
+        script_file.write('export KMP_AFFINITY=compact,verbose\n')
+        script_file.write('export I_MPI_PIN_DOMAIN=omp\n')
+
+        script_file.write('mpiexec.hydra '
+            + ' -np {} '.format(nb_mpi_processes)
+            + ' -ppn {} '.format(nb_processes_per_node)
+            + ' -ordered-output -prepend-rank '
+            + os.path.join(
+                self.work_dir,
+                command_atoms[0]) +
+            ' ' +
+            ' '.join(command_atoms[1:]) +
+            '\n')
+
+        script_file.write('echo "End time is `date`"\n')
+        script_file.write('exit 0\n')
+        script_file.close()
+        return None
     def write_sge_file(
             self,
             file_name = None,
@@ -267,6 +582,79 @@ class _code(_base):
         script_file.write('exit 0\n')
         script_file.close()
         return None
+    def write_slurm_file(
+            self,
+            file_name = None,
+            name_of_run = None,
+            command_atoms = [],
+            hours = None,
+            minutes = None,
+            out_file = None,
+            err_file = None,
+            nb_mpi_processes = None,
+            nb_threads_per_process = None):
+        script_file = open(file_name, 'w')
+        script_file.write('#!/bin/bash -l\n')
+        # job name
+        script_file.write('#SBATCH -J {0}\n'.format(name_of_run))
+        # use current working directory
+        script_file.write('#SBATCH -D ./\n')
+        # error file
+        if not type(err_file) == type(None):
+            script_file.write('#SBATCH -e ' + err_file + '\n')
+        # output file
+        if not type(out_file) == type(None):
+            script_file.write('#SBATCH -o ' + out_file + '\n')
+        script_file.write('#SBATCH --partition={0}\n'.format(
+                self.host_info['environment']))
+
+        nb_cpus_per_node = self.host_info['deltanprocs']
+        assert isinstance(nb_cpus_per_node, int) and nb_cpus_per_node >= 1, \
+               'nb_cpus_per_node is {}'.format(nb_cpus_per_node)
+
+        # No more threads than the number of cores
+        assert nb_threads_per_process <= nb_cpus_per_node, \
+               "Cannot use more threads ({} asked) than the number of cores ({})".format(
+                   nb_threads_per_process, nb_cpus_per_node)
+        # Warn if some cores will not be used
+        if nb_cpus_per_node%nb_threads_per_process != 0:
+            warnings.warn(
+                    "nb_threads_per_process does not divide the number of cores per node (some cores will be idle)",
+                    UserWarning)
+
+        nb_cpus = nb_mpi_processes*nb_threads_per_process
+        if (nb_cpus < nb_cpus_per_node):
+            # in case we use only a few processes on a single node
+            nb_nodes = 1
+            nb_processes_per_node = nb_mpi_processes
+        else:
+            nb_nodes = int((nb_cpus+nb_cpus_per_node-1) // nb_cpus_per_node)
+            # with more than one node, we require a multiple of deltanprocs
+            nb_processes_per_node = int(nb_cpus_per_node // nb_threads_per_process)
+
+        script_file.write('#SBATCH --nodes={0}\n'.format(nb_nodes))
+        script_file.write('#SBATCH --ntasks-per-node={0}\n'.format(nb_processes_per_node))
+        script_file.write('#SBATCH --cpus-per-task={0}\n'.format(nb_threads_per_process))
+
+        script_file.write('#SBATCH --mail-type=none\n')
+        script_file.write('#SBATCH --time={0}:{1:0>2d}:00\n'.format(hours, minutes))
+        script_file.write('source ~/.config/bfps/bashrc\n')
+        if nb_threads_per_process > 1:
+            script_file.write('export OMP_NUM_THREADS={0}\n'.format(nb_threads_per_process))
+            script_file.write('export OMP_PLACES=cores\n')
+
+        script_file.write('LD_LIBRARY_PATH=' +
+                          ':'.join([bfps.lib_dir] + bfps.install_info['library_dirs']) +
+                          ':${LD_LIBRARY_PATH}\n')
+        script_file.write('echo "Start time is `date`"\n')
+        script_file.write('cd ' + self.work_dir + '\n')
+        script_file.write('export HTMLOUTPUT={}.html\n'.format(command_atoms[-1]))
+        script_file.write('srun {0}\n'.format(' '.join(command_atoms)))
+        script_file.write('echo "End time is `date`"\n')
+        script_file.write('exit 0\n')
+        script_file.close()
+        return None
     def prepare_launch(
             self,
             args = [],
@@ -274,6 +662,14 @@ class _code(_base):
         parser = argparse.ArgumentParser('bfps ' + type(self).__name__)
         self.add_parser_arguments(parser)
         opt = parser.parse_args(args)
+
+        if opt.ncpu != -1:
+            warnings.warn(
+                    'ncpu should be replaced by np/ntpp',
+                    DeprecationWarning)
+            opt.nb_processes = opt.ncpu
+            opt.nb_threads_per_process = 1
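+            # e.g. the old "--ncpu 8" now behaves like "--np 8 --ntpp 1"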
+
         self.set_host_info(bfps.host_info)
         if type(opt.environment) != type(None):
             self.host_info['environment'] = opt.environment
diff --git a/bfps/_fluid_base.py b/bfps/_fluid_base.py
index 7eef1e1569cf3f5f66b5adcf52494be8de2fbe49..dac5a581c73bc456adcbd82112a29b7353635075 100644
--- a/bfps/_fluid_base.py
+++ b/bfps/_fluid_base.py
@@ -95,6 +95,7 @@ class _fluid_particle_base(_code):
                 //begincpp
                 if (myrank == 0 && iteration == 0)
                 {
+                    TIMEZONE("fuild_base::store_kspace");
                     hsize_t dims[4];
                     hid_t space, dset;
                     // store kspace information
@@ -142,7 +143,7 @@ class _fluid_particle_base(_code):
         self.includes   += self.fluid_includes
         self.includes   += '#include <ctime>\n'
         self.variables  += (self.fluid_variables +
-                            'hid_t particle_file;\n')
+                            '//hid_t particle_file;\n')
         self.definitions += ('int grow_single_dataset(hid_t dset, int tincrement)\n{\n' +
                              'int ndims;\n' +
                              'hsize_t space;\n' +
@@ -217,7 +218,7 @@ class _fluid_particle_base(_code):
                         """.format(fftw_prefix) + self.main_end
         if self.particle_species > 0:
             self.main_start += """
-                if (myrank == 0)
+                /*if (myrank == 0)
                 {
                     // set caching parameters
                     hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
@@ -225,12 +226,12 @@ class _fluid_particle_base(_code):
                     DEBUG_MSG("when setting cache for particles I got %d\\n", cache_err);
                     sprintf(fname, "%s_particles.h5", simname);
                     particle_file = H5Fopen(fname, H5F_ACC_RDWR, fapl);
-                }
+                }*/
                 """
-            self.main_end = ('if (myrank == 0)\n' +
+            self.main_end = ('/*if (myrank == 0)\n' +
                              '{\n' +
                              'H5Fclose(particle_file);\n' +
-                             '}\n') + self.main_end
+                             '}*/\n') + self.main_end
         self.main        = """
                            //begincpp
                            int data_file_problem;
@@ -263,8 +264,15 @@ class _fluid_particle_base(_code):
                                       '<< time_difference/nprocs << " seconds" << std::endl;\n' +
                                   'time0 = time1;\n')
         if not postprocess_mode:
-            self.main       += 'for (int max_iter = iteration+niter_todo; iteration < max_iter; iteration++)\n'
+            self.main       += 'for (int max_iter = iteration+niter_todo-iteration%niter_todo; iteration < max_iter; iteration++)\n'
             self.main       += '{\n'
+
+            self.main       += """
+                                #ifdef USE_TIMINGOUTPUT
+                                const std::string loopLabel = "code::main_start::loop-" + std::to_string(iteration);
+                                TIMEZONE(loopLabel.c_str());
+                                #endif
+                                """
             self.main       += 'if (iteration % niter_stat == 0) do_stats();\n'
             if self.particle_species > 0:
                 self.main       += 'if (iteration % niter_part == 0) do_particle_stats();\n'
@@ -278,6 +286,12 @@ class _fluid_particle_base(_code):
         else:
             self.main       += 'for (int frame_index = iter0; frame_index <= iter1; frame_index += niter_out)\n'
             self.main       += '{\n'
+            self.main       += """
+                                #ifdef USE_TIMINGOUTPUT
+                                const std::string loopLabel = "code::main_start::loop-" + std::to_string(frame_index);
+                                TIMEZONE(loopLabel.c_str());
+                                #endif
+                                """
             if self.particle_species > 0:
                 self.main   += self.particle_loop
             self.main       += self.fluid_loop
@@ -292,6 +306,9 @@ class _fluid_particle_base(_code):
             field = 'velocity',
             iteration = 0,
             filename = None):
+        """
+            :note: assumes field is a vector field
+        """
         if type(filename) == type(None):
             filename = os.path.join(
                     self.work_dir,
@@ -299,6 +316,7 @@ class _fluid_particle_base(_code):
         return np.memmap(
                 filename,
                 dtype = self.dtype,
+                mode = 'r',
                 shape = (self.parameters['nz'],
                          self.parameters['ny'],
                          self.parameters['nx'], 3))
@@ -437,7 +455,7 @@ class _fluid_particle_base(_code):
             #data[0] = np.array([3.26434, 4.24418, 3.12157])
             data[0] = np.array([ 0.72086101,  2.59043666,  6.27501953])
         with h5py.File(self.get_particle_file_name(), 'r+') as data_file:
-            data_file['tracers{0}/state'.format(species)][0] = data
+            data_file['tracers{0}/state/0'.format(species)][0] = data
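+            # note: tracer states now appear to be stored per iteration,
+            # under "tracers<s>/state/<iteration>"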
         if write_to_file:
             data.tofile(
                     os.path.join(
diff --git a/bfps/cpp/base.hpp b/bfps/cpp/base.hpp
index ee2d74d5b751451e9bb34600a0e2b09891a73d1f..adfdd62f772795269cbcc5241dcb881677e38e72 100644
--- a/bfps/cpp/base.hpp
+++ b/bfps/cpp/base.hpp
@@ -42,6 +42,9 @@ inline int MOD(int a, int n)
     return ((a%n) + n) % n;
 }
 
+/////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
+
 #ifdef OMPI_MPI_H
 
 #define BFPS_MPICXX_DOUBLE_COMPLEX MPI_DOUBLE_COMPLEX
@@ -52,6 +55,37 @@ inline int MOD(int a, int n)
 
 #endif//OMPI_MPI_H
 
+template <class realtype>
+class mpi_real_type;
+
+template <>
+class mpi_real_type<float>
+{
+public:
+    static constexpr MPI_Datatype real(){
+        return MPI_FLOAT;
+    }
+
+    static constexpr MPI_Datatype complex(){
+        return MPI_COMPLEX;
+    }
+};
+
+template <>
+class mpi_real_type<double>
+{
+public:
+    static constexpr MPI_Datatype real(){
+        return MPI_DOUBLE;
+    }
+
+    static constexpr MPI_Datatype complex(){
+        return BFPS_MPICXX_DOUBLE_COMPLEX;
+    }
+};
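+
+// Usage sketch (hypothetical helper, not part of bfps): templated code can
+// pick the matching MPI datatype at compile time, e.g.
+//
+//     template <class rnumber>
+//     void sum_over_ranks(rnumber *data, int count, MPI_Comm comm)
+//     {
+//         MPI_Allreduce(
+//                 MPI_IN_PLACE,
+//                 data,
+//                 count,
+//                 mpi_real_type<rnumber>::real(),
+//                 MPI_SUM,
+//                 comm);
+//     }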
+
+/////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
 
 #ifndef NDEBUG
 
@@ -99,5 +133,7 @@ inline void DEBUG_MSG_WAIT(MPI_Comm communicator, const char * format, ...)
 
 #endif//NDEBUG
 
+#define variable_used_only_in_assert(x) ((void)(x))
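+// usage sketch (hypothetical): keeps values that are only checked in assert()
+// from triggering unused-variable warnings in NDEBUG builds, e.g.
+//     int info = MPI_Barrier(comm);
+//     variable_used_only_in_assert(info);
+//     assert(info == MPI_SUCCESS);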
+
 #endif//BASE
 
diff --git a/bfps/cpp/bfps_timer.hpp b/bfps/cpp/bfps_timer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bec3cb681aed06d04f789bbe6e335f59958266be
--- /dev/null
+++ b/bfps/cpp/bfps_timer.hpp
@@ -0,0 +1,104 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+#ifndef BFPS_TIMER_HPP
+#define BFPS_TIMER_HPP
+
+#include <chrono>
+
+/**
+ * @file
+ *
+ * Each section to measure should be bracketed by start/stop.
+ * The measured time is given by "getElapsed".
+ * The total time measured by a timer is given by "getCumulated".
+ * Example:
+ * @code
+ * bfps_timer tm; // implicit start
+ * ...
+ * tm.stop();               // stop the timer
+ * tm.getElapsed();         // return the duration in s [A]
+ * tm.start();              // restart the timer
+ * ...
+ * tm.stopAndGetElapsed();  // stop the timer and return the duration in s [B]
+ * tm.getCumulated();       // equal to [A] + [B]
+ * @endcode
+ */
+class bfps_timer {
+    using double_second_time = std::chrono::duration<double, std::ratio<1, 1>>;
+
+    std::chrono::high_resolution_clock::time_point m_start;    ///< start time
+    std::chrono::high_resolution_clock::time_point m_end;      ///< stop time
+    std::chrono::nanoseconds m_cumulate;                       ///< cumulated time
+
+public:
+    /// Constructor
+    bfps_timer() { start(); }
+
+    /// Copy constructor (deleted)
+    bfps_timer(const bfps_timer& other) = delete;
+    /// Copy assignment (deleted)
+    bfps_timer& operator=(const bfps_timer& other) = delete;
+    /// Move constructor (deleted)
+    bfps_timer(bfps_timer&& other) = delete;
+    /// Move assignment (deleted)
+    bfps_timer& operator=(bfps_timer&& other) = delete;
+
+    /** Reset all the values and restart the timer */
+    void reset() {
+        m_start = std::chrono::high_resolution_clock::time_point();
+        m_end = std::chrono::high_resolution_clock::time_point();
+        m_cumulate = std::chrono::nanoseconds();
+        start();
+    }
+
+    /** Start the timer */
+    void start() {
+        m_start = std::chrono::high_resolution_clock::now();
+    }
+
+    /** Stop the current timer */
+    void stop() {
+        m_end = std::chrono::high_resolution_clock::now();
+        m_cumulate += std::chrono::duration_cast<std::chrono::nanoseconds>(m_end - m_start);
+    }
+
+    /** Return the elapsed time between start and stop (in second) */
+    double getElapsed() const {
+        return std::chrono::duration_cast<double_second_time>(
+                    std::chrono::duration_cast<std::chrono::nanoseconds>(m_end - m_start)).count();
+    }
+
+    /** Return the total counted time */
+    double getCumulated() const {
+        return std::chrono::duration_cast<double_second_time>(m_cumulate).count();
+    }
+
+    /** End the current counter (stop) and return the elapsed time */
+    double stopAndGetElapsed() {
+        stop();
+        return getElapsed();
+    }
+};
+
+#endif
diff --git a/bfps/cpp/distributed_particles.cpp b/bfps/cpp/distributed_particles.cpp
index 7d0808419cc0c7c001e37f38e25395fe3fd559b1..73fd0275d8138d41bb4ee7fbc28e2d41e8017661 100644
--- a/bfps/cpp/distributed_particles.cpp
+++ b/bfps/cpp/distributed_particles.cpp
@@ -24,17 +24,19 @@
 
 
 
-#define NDEBUG
+//#define NDEBUG
 
 #include <cmath>
 #include <cassert>
 #include <cstring>
 #include <string>
 #include <sstream>
+#include <array>
 
 #include "base.hpp"
 #include "distributed_particles.hpp"
 #include "fftw_tools.hpp"
+#include "scope_timer.hpp"
 
 
 extern int myrank, nprocs;
@@ -43,17 +45,17 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 distributed_particles<particle_type, rnumber, interp_neighbours>::distributed_particles(
         const char *NAME,
         const hid_t data_file_id,
-        interpolator<rnumber, interp_neighbours> *FIELD,
+        interpolator<rnumber, interp_neighbours> *VEL,
         const int TRAJ_SKIP,
         const int INTEGRATION_STEPS) : particles_io_base<particle_type>(
             NAME,
             TRAJ_SKIP,
             data_file_id,
-            FIELD->descriptor->comm)
+            VEL->descriptor->comm)
 {
     assert((INTEGRATION_STEPS <= 6) &&
            (INTEGRATION_STEPS >= 1));
-    this->vel = FIELD;
+    this->vel = VEL;
     this->rhs.resize(INTEGRATION_STEPS);
     this->integration_steps = INTEGRATION_STEPS;
     this->state.reserve(2*this->nparticles / this->nprocs);
@@ -72,14 +74,13 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::sample(
         const std::unordered_map<int, single_particle_state<particle_type>> &x,
         std::unordered_map<int, single_particle_state<POINT3D>> &y)
 {
-    double *yy = new double[3];
+    std::array<double, 3> yy;
     y.clear();
     for (auto &pp: x)
     {
-        (*field)(pp.second.data, yy);
-        y[pp.first] = yy;
+        (*field)(pp.second.data, &yy.front());
+        y[pp.first] = &yy.front();
     }
-    delete[] yy;
 }
 
 template <particle_types particle_type, class rnumber, int interp_neighbours>
@@ -121,6 +122,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::redistrib
         std::unordered_map<int, single_particle_state<particle_type>> &x,
         std::vector<std::unordered_map<int, single_particle_state<particle_type>>> &vals)
 {
+    TIMEZONE("distributed_particles::redistribute");
     //DEBUG_MSG("entered redistribute\n");
     /* neighbouring rank offsets */
     int ro[2];
@@ -312,6 +314,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::AdamsBash
 template <particle_types particle_type, class rnumber, int interp_neighbours>
 void distributed_particles<particle_type, rnumber, interp_neighbours>::step()
 {
+    TIMEZONE("distributed_particles::step");
     this->AdamsBashforth((this->iteration < this->integration_steps) ?
                             this->iteration+1 :
                             this->integration_steps);
@@ -368,6 +371,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::write(
         const char *dset_name,
         std::unordered_map<int, single_particle_state<POINT3D>> &y)
 {
+    TIMEZONE("distributed_particles::write");
     double *data = new double[this->nparticles*3];
     double *yy = new double[this->nparticles*3];
     for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++)
@@ -399,6 +403,7 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 void distributed_particles<particle_type, rnumber, interp_neighbours>::write(
         const bool write_rhs)
 {
+    TIMEZONE("distributed_particles::write2");
     double *temp0 = new double[this->chunk_size*state_dimension(particle_type)];
     double *temp1 = new double[this->chunk_size*state_dimension(particle_type)];
     for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++)
@@ -411,7 +416,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::write(
             if (pp != this->state.end())
                 std::copy(pp->second.data,
                           pp->second.data + state_dimension(particle_type),
-                          temp0 + pp->first*state_dimension(particle_type));
+                          temp0 + p*state_dimension(particle_type));
         }
         MPI_Allreduce(
                 temp0,
@@ -433,7 +438,7 @@ void distributed_particles<particle_type, rnumber, interp_neighbours>::write(
                     if (pp != this->rhs[i].end())
                         std::copy(pp->second.data,
                                   pp->second.data + state_dimension(particle_type),
-                                  temp0 + pp->first*state_dimension(particle_type));
+                                  temp0 + p*state_dimension(particle_type));
                 }
                 MPI_Allreduce(
                         temp0,
diff --git a/bfps/cpp/fftw_interface.hpp b/bfps/cpp/fftw_interface.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b2e5074b2dc346b00dcfab0090598b486234bb5
--- /dev/null
+++ b/bfps/cpp/fftw_interface.hpp
@@ -0,0 +1,170 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+#ifndef FFTW_INTERFACE_HPP
+#define FFTW_INTERFACE_HPP
+
+#include <fftw3-mpi.h>
+
+#ifdef USE_FFTWESTIMATE
+#define DEFAULT_FFTW_FLAG FFTW_ESTIMATE
+#warning You are using FFTW estimate
+#else
+#define DEFAULT_FFTW_FLAG FFTW_PATIENT
+#endif
+
+template <class realtype>
+class fftw_interface;
+
+template <>
+class fftw_interface<float>
+{
+public:
+    using real = float;
+    using complex = fftwf_complex;
+    using plan = fftwf_plan;
+    using iodim = fftwf_iodim;
+
+    static complex* alloc_complex(const size_t in_size){
+        return fftwf_alloc_complex(in_size);
+    }
+
+    static real* alloc_real(const size_t in_size){
+        return fftwf_alloc_real(in_size);
+    }
+
+    static void free(void* ptr){
+        fftwf_free(ptr);
+    }
+
+    static void execute(plan in_plan){
+        fftwf_execute(in_plan);
+    }
+
+    static void destroy_plan(plan in_plan){
+        fftwf_destroy_plan(in_plan);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_transpose(Params ... params){
+        return fftwf_mpi_plan_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_transpose(Params ... params){
+        return fftwf_mpi_plan_many_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_r2r(Params ... params){
+        return fftwf_plan_guru_r2r(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_dft(Params ... params){
+        return fftwf_plan_guru_dft(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_c2r(Params ... params){
+        return fftwf_mpi_plan_many_dft_c2r(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_r2c(Params ... params){
+        return fftwf_mpi_plan_many_dft_r2c(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_dft_c2r_3d(Params ... params){
+        return fftwf_mpi_plan_dft_c2r_3d(params...);
+    }
+};
+
+template <>
+class fftw_interface<double>
+{
+public:
+    using real = double;
+    using complex = fftw_complex;
+    using plan = fftw_plan;
+    using iodim = fftw_iodim;
+
+    static complex* alloc_complex(const size_t in_size){
+        return fftw_alloc_complex(in_size);
+    }
+
+    static real* alloc_real(const size_t in_size){
+        return fftw_alloc_real(in_size);
+    }
+
+    static void free(void* ptr){
+        fftw_free(ptr);
+    }
+
+    static void execute(plan in_plan){
+        fftw_execute(in_plan);
+    }
+
+    static void destroy_plan(plan in_plan){
+        fftw_destroy_plan(in_plan);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_transpose(Params ... params){
+        return fftw_mpi_plan_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_transpose(Params ... params){
+        return fftw_mpi_plan_many_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_r2r(Params ... params){
+        return fftw_plan_guru_r2r(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_dft(Params ... params){
+        return fftw_plan_guru_dft(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_c2r(Params ... params){
+        return fftw_mpi_plan_many_dft_c2r(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_r2c(Params ... params){
+        return fftw_mpi_plan_many_dft_r2c(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_dft_c2r_3d(Params ... params){
+        return fftw_mpi_plan_dft_c2r_3d(params...);
+    }
+};
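+
+// Usage sketch (hypothetical, not part of the header): precision-agnostic
+// buffer handling through the interface, e.g.
+//
+//     template <class rnumber>
+//     typename fftw_interface<rnumber>::real* make_buffer(const size_t n)
+//     {
+//         return fftw_interface<rnumber>::alloc_real(n);
+//     }
+//     // ... use the buffer with plan_guru_r2r etc., then release it with
+//     // fftw_interface<rnumber>::free(buf);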
+
+#endif // FFTW_INTERFACE_HPP
diff --git a/bfps/cpp/fftw_tools.cpp b/bfps/cpp/fftw_tools.cpp
index f6eacbf1dfe2dfe31e603e9239c42d4639327d3d..61e03d292f81aed1fa4b2dfcab880fb7105b676e 100644
--- a/bfps/cpp/fftw_tools.cpp
+++ b/bfps/cpp/fftw_tools.cpp
@@ -27,6 +27,7 @@
 #include <iostream>
 #include "base.hpp"
 #include "fftw_tools.hpp"
+#include "fftw_interface.hpp"
 
 #define NDEBUG
 
@@ -51,150 +52,171 @@ int clip_zero_padding(
     return EXIT_SUCCESS;
 }
 
+template
+int clip_zero_padding<float>(
+        field_descriptor<float> *f,
+        float *a,
+        int howmany);
 
+template
+int clip_zero_padding<double>(
+        field_descriptor<double> *f,
+        double *a,
+        int howmany);
+
+template <class rnumber>
+int copy_complex_array(
+        field_descriptor<rnumber> *fi,
+        rnumber (*ai)[2],
+        field_descriptor<rnumber> *fo,
+        rnumber (*ao)[2],
+        int howmany)
+{
+    DEBUG_MSG("entered copy_complex_array\n");
+    typename fftw_interface<rnumber>::complex *buffer;
+    buffer = fftw_interface<rnumber>::alloc_complex(fi->slice_size*howmany);
+
+    int min_fast_dim;
+    min_fast_dim =
+            (fi->sizes[2] > fo->sizes[2]) ?
+                fo->sizes[2] : fi->sizes[2];
 
-#define TOOLS_IMPLEMENTATION(FFTW, R, MPI_RNUM, MPI_CNUM) \
-template <> \
-int copy_complex_array<R>( \
-        field_descriptor<R> *fi, \
-        R (*ai)[2], \
-        field_descriptor<R> *fo, \
-        R (*ao)[2], \
-        int howmany) \
-{ \
-    DEBUG_MSG("entered copy_complex_array\n"); \
-    FFTW(complex) *buffer; \
-    buffer = FFTW(alloc_complex)(fi->slice_size*howmany); \
- \
-    int min_fast_dim; \
-    min_fast_dim = \
-        (fi->sizes[2] > fo->sizes[2]) ? \
-         fo->sizes[2] : fi->sizes[2]; \
- \
     /* clean up destination, in case we're padding with zeros
-       (even if only for one dimension) */ \
-    std::fill_n((R*)ao, fo->local_size*2, 0.0); \
- \
-    int64_t ii0, ii1; \
-    int64_t oi0, oi1; \
-    int64_t delta1, delta0; \
-    int irank, orank; \
-    delta0 = (fo->sizes[0] - fi->sizes[0]); \
-    delta1 = (fo->sizes[1] - fi->sizes[1]); \
-    for (ii0=0; ii0 < fi->sizes[0]; ii0++) \
-    { \
-        if (ii0 <= fi->sizes[0]/2) \
-        { \
-            oi0 = ii0; \
-            if (oi0 > fo->sizes[0]/2) \
-                continue; \
-        } \
-        else \
-        { \
-            oi0 = ii0 + delta0; \
-            if ((oi0 < 0) || ((fo->sizes[0] - oi0) >= fo->sizes[0]/2)) \
-                continue; \
-        } \
-        irank = fi->rank[ii0]; \
-        orank = fo->rank[oi0]; \
-        if ((irank == orank) && \
-            (irank == fi->myrank)) \
-        { \
-            std::copy( \
-                    (R*)(ai + (ii0 - fi->starts[0]    )*fi->slice_size), \
-                    (R*)(ai + (ii0 - fi->starts[0] + 1)*fi->slice_size), \
-                    (R*)buffer); \
-        } \
-        else \
-        { \
-            if (fi->myrank == irank) \
-            { \
-                MPI_Send( \
-                        (void*)(ai + (ii0-fi->starts[0])*fi->slice_size), \
-                        fi->slice_size, \
-                        MPI_CNUM, \
-                        orank, \
-                        ii0, \
-                        fi->comm); \
-            } \
-            if (fi->myrank == orank) \
-            { \
-                MPI_Recv( \
-                        (void*)(buffer), \
-                        fi->slice_size, \
-                        MPI_CNUM, \
-                        irank, \
-                        ii0, \
-                        fi->comm, \
-                        MPI_STATUS_IGNORE); \
-            } \
-        } \
-        if (fi->myrank == orank) \
-        { \
-            for (ii1 = 0; ii1 < fi->sizes[1]; ii1++) \
-            { \
-                if (ii1 <= fi->sizes[1]/2) \
-                { \
-                    oi1 = ii1; \
-                    if (oi1 > fo->sizes[1]/2) \
-                        continue; \
-                } \
-                else \
-                { \
-                    oi1 = ii1 + delta1; \
-                    if ((oi1 < 0) || ((fo->sizes[1] - oi1) >= fo->sizes[1]/2)) \
-                        continue; \
-                } \
-                std::copy( \
-                        (R*)(buffer + (ii1*fi->sizes[2]*howmany)), \
-                        (R*)(buffer + (ii1*fi->sizes[2] + min_fast_dim)*howmany), \
-                        (R*)(ao + \
-                                 ((oi0 - fo->starts[0])*fo->sizes[1] + \
-                                  oi1)*fo->sizes[2]*howmany)); \
-            } \
-        } \
-    } \
-    fftw_free(buffer); \
-    MPI_Barrier(fi->comm); \
- \
-    DEBUG_MSG("exiting copy_complex_array\n"); \
-    return EXIT_SUCCESS; \
-} \
- \
-template <> \
-int get_descriptors_3D<R>( \
-        int n0, int n1, int n2, \
-        field_descriptor<R> **fr, \
-        field_descriptor<R> **fc) \
-{ \
-    int ntmp[3]; \
-    ntmp[0] = n0; \
-    ntmp[1] = n1; \
-    ntmp[2] = n2; \
-    *fr = new field_descriptor<R>(3, ntmp, MPI_RNUM, MPI_COMM_WORLD); \
-    ntmp[0] = n0; \
-    ntmp[1] = n1; \
-    ntmp[2] = n2/2+1; \
-    *fc = new field_descriptor<R>(3, ntmp, MPI_CNUM, MPI_COMM_WORLD); \
-    return EXIT_SUCCESS; \
-} \
- \
-template \
-int clip_zero_padding<R>( \
-        field_descriptor<R> *f, \
-        R *a, \
-        int howmany); \
-
-
-
-TOOLS_IMPLEMENTATION(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-TOOLS_IMPLEMENTATION(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
+       (even if only for one dimension) */
+    std::fill_n((rnumber*)ao, fo->local_size*2, 0.0);
+
+    int64_t ii0, ii1;
+    int64_t oi0, oi1;
+    int64_t delta1, delta0;
+    int irank, orank;
+    delta0 = (fo->sizes[0] - fi->sizes[0]);
+    delta1 = (fo->sizes[1] - fi->sizes[1]);
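+    /* map each slow-dimension index of the input onto the output:
+     * modes in the lower half (ii0 <= n/2) keep their index, modes in
+     * the upper half are shifted by the size difference so that
+     * negative wavenumbers remain negative wavenumbers. */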
+    for (ii0=0; ii0 < fi->sizes[0]; ii0++)
+    {
+        if (ii0 <= fi->sizes[0]/2)
+        {
+            oi0 = ii0;
+            if (oi0 > fo->sizes[0]/2)
+                continue;
+        }
+        else
+        {
+            oi0 = ii0 + delta0;
+            if ((oi0 < 0) || ((fo->sizes[0] - oi0) >= fo->sizes[0]/2))
+                continue;
+        }
+        irank = fi->rank[ii0];
+        orank = fo->rank[oi0];
+        if ((irank == orank) &&
+            (irank == fi->myrank))
+        {
+            std::copy(
+                    (rnumber*)(ai + (ii0 - fi->starts[0]    )*fi->slice_size),
+                    (rnumber*)(ai + (ii0 - fi->starts[0] + 1)*fi->slice_size),
+                    (rnumber*)buffer);
+        }
+        else
+        {
+            if (fi->myrank == irank)
+            {
+                MPI_Send(
+                        (void*)(ai + (ii0-fi->starts[0])*fi->slice_size),
+                        fi->slice_size,
+                        mpi_real_type<rnumber>::complex(),
+                        orank,
+                        ii0,
+                        fi->comm);
+            }
+            if (fi->myrank == orank)
+            {
+                MPI_Recv(
+                        (void*)(buffer),
+                        fi->slice_size,
+                        mpi_real_type<rnumber>::complex(),
+                        irank,
+                        ii0,
+                        fi->comm,
+                        MPI_STATUS_IGNORE);
+            }
+        }
+        if (fi->myrank == orank)
+        {
+            for (ii1 = 0; ii1 < fi->sizes[1]; ii1++)
+            {
+                if (ii1 <= fi->sizes[1]/2)
+                {
+                    oi1 = ii1;
+                    if (oi1 > fo->sizes[1]/2)
+                        continue;
+                }
+                else
+                {
+                    oi1 = ii1 + delta1;
+                    if ((oi1 < 0) || ((fo->sizes[1] - oi1) >= fo->sizes[1]/2))
+                        continue;
+                }
+                std::copy(
+                        (rnumber*)(buffer + (ii1*fi->sizes[2]*howmany)),
+                        (rnumber*)(buffer + (ii1*fi->sizes[2] + min_fast_dim)*howmany),
+                        (rnumber*)(ao +
+                                ((oi0 - fo->starts[0])*fo->sizes[1] +
+                                 oi1)*fo->sizes[2]*howmany));
+            }
+        }
+    }
+    fftw_interface<rnumber>::free(buffer);
+    MPI_Barrier(fi->comm);
+
+    DEBUG_MSG("exiting copy_complex_array\n");
+    return EXIT_SUCCESS;
+}
+
+template
+int copy_complex_array<float>(
+        field_descriptor<float> *fi,
+        float (*ai)[2],
+        field_descriptor<float> *fo,
+        float (*ao)[2],
+        int howmany);
+
+template
+int copy_complex_array<double>(
+        field_descriptor<double> *fi,
+        double (*ai)[2],
+        field_descriptor<double> *fo,
+        double (*ao)[2],
+        int howmany);
+
+
+template <class rnumber>
+int get_descriptors_3D(
+        int n0, int n1, int n2,
+        field_descriptor<rnumber> **fr,
+        field_descriptor<rnumber> **fc)
+{
+    int ntmp[3];
+    ntmp[0] = n0;
+    ntmp[1] = n1;
+    ntmp[2] = n2;
+    *fr = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), MPI_COMM_WORLD);
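+    /* the complex descriptor only stores n2/2+1 entries along the
+     * fastest dimension, since the r2c transform of real data is
+     * Hermitian-symmetric. */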
+    ntmp[0] = n0;
+    ntmp[1] = n1;
+    ntmp[2] = n2/2+1;
+    *fc = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::complex(), MPI_COMM_WORLD);
+    return EXIT_SUCCESS;
+}
+
+template
+int get_descriptors_3D<float>(
+        int n0, int n1, int n2,
+        field_descriptor<float> **fr,
+        field_descriptor<float> **fc);
+
+template
+int get_descriptors_3D<double>(
+        int n0, int n1, int n2,
+        field_descriptor<double> **fr,
+        field_descriptor<double> **fc);
 
diff --git a/bfps/cpp/field.cpp b/bfps/cpp/field.cpp
index ad1e77f107113952bd84a5a3f72f2a5d64064f9a..768e723da343cb344fb1e9583e777a621a10e864 100644
--- a/bfps/cpp/field.cpp
+++ b/bfps/cpp/field.cpp
@@ -23,87 +23,16 @@
 **********************************************************************/
 
 
+#include <sys/stat.h>
+#include <cmath>
 #include <cstdlib>
 #include <algorithm>
 #include <cassert>
 #include "field.hpp"
+#include "scope_timer.hpp"
+#include "shared_array.hpp"
 
-template <field_components fc>
-field_layout<fc>::field_layout(
-        const hsize_t *SIZES,
-        const hsize_t *SUBSIZES,
-        const hsize_t *STARTS,
-        const MPI_Comm COMM_TO_USE)
-{
-    this->comm = COMM_TO_USE;
-    MPI_Comm_rank(this->comm, &this->myrank);
-    MPI_Comm_size(this->comm, &this->nprocs);
-
-    std::copy(SIZES, SIZES + 3, this->sizes);
-    std::copy(SUBSIZES, SUBSIZES + 3, this->subsizes);
-    std::copy(STARTS, STARTS + 3, this->starts);
-    if (fc == THREE || fc == THREExTHREE)
-    {
-        this->sizes[3] = 3;
-        this->subsizes[3] = 3;
-        this->starts[3] = 0;
-    }
-    if (fc == THREExTHREE)
-    {
-        this->sizes[4] = 3;
-        this->subsizes[4] = 3;
-        this->starts[4] = 0;
-    }
-    this->local_size = 1;
-    this->full_size = 1;
-    for (unsigned int i=0; i<ndim(fc); i++)
-    {
-        this->local_size *= this->subsizes[i];
-        this->full_size *= this->sizes[i];
-    }
 
-    /*field will at most be distributed in 2D*/
-    this->rank.resize(2);
-    this->all_start.resize(2);
-    this->all_size.resize(2);
-    for (int i=0; i<2; i++)
-    {
-        this->rank[i].resize(this->sizes[i]);
-        std::vector<int> local_rank;
-        local_rank.resize(this->sizes[i], 0);
-        for (unsigned int ii=this->starts[i]; ii<this->starts[i]+this->subsizes[i]; ii++)
-            local_rank[ii] = this->myrank;
-        MPI_Allreduce(
-                &local_rank.front(),
-                &this->rank[i].front(),
-                this->sizes[i],
-                MPI_INT,
-                MPI_SUM,
-                this->comm);
-        this->all_start[i].resize(this->nprocs);
-        std::vector<int> local_start;
-        local_start.resize(this->nprocs, 0);
-        local_start[this->myrank] = this->starts[i];
-        MPI_Allreduce(
-                &local_start.front(),
-                &this->all_start[i].front(),
-                this->nprocs,
-                MPI_INT,
-                MPI_SUM,
-                this->comm);
-        this->all_size[i].resize(this->nprocs);
-        std::vector<int> local_subsize;
-        local_subsize.resize(this->nprocs, 0);
-        local_subsize[this->myrank] = this->subsizes[i];
-        MPI_Allreduce(
-                &local_subsize.front(),
-                &this->all_size[i].front(),
-                this->nprocs,
-                MPI_INT,
-                MPI_SUM,
-                this->comm);
-    }
-}
 
 template <typename rnumber,
           field_backend be,
@@ -115,6 +44,7 @@ field<rnumber, be, fc>::field(
                 const MPI_Comm COMM_TO_USE,
                 const unsigned FFTW_PLAN_RIGOR)
 {
+    TIMEZONE("field::field");
     this->comm = COMM_TO_USE;
     MPI_Comm_rank(this->comm, &this->myrank);
     MPI_Comm_size(this->comm, &this->nprocs);
@@ -164,47 +94,27 @@ field<rnumber, be, fc>::field(
             starts[0] = local_0_start; starts[1] = 0; starts[2] = 0;
             this->rmemlayout = new field_layout<fc>(
                     sizes, subsizes, starts, this->comm);
-            sizes[0] = nz; sizes[1] = ny; sizes[2] = nx/2+1;
-            subsizes[0] = local_n1; subsizes[1] = ny; subsizes[2] = nx/2+1;
+            sizes[0] = ny; sizes[1] = nz; sizes[2] = nx/2+1;
+            subsizes[0] = local_n1; subsizes[1] = nz; subsizes[2] = nx/2+1;
             starts[0] = local_1_start; starts[1] = 0; starts[2] = 0;
             this->clayout = new field_layout<fc>(
                     sizes, subsizes, starts, this->comm);
-            this->data = (rnumber*)fftw_malloc(
-                    sizeof(rnumber)*this->rmemlayout->local_size);
-            if(typeid(rnumber) == typeid(float))
-            {
-                this->c2r_plan = new fftwf_plan;
-                this->r2c_plan = new fftwf_plan;
-                *((fftwf_plan*)this->c2r_plan) = fftwf_mpi_plan_many_dft_c2r(
-                        3, nfftw, ncomp(fc),
-                        FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
-                        (fftwf_complex*)this->data, (float*)this->data,
-                        this->comm,
-                        this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
-                *((fftwf_plan*)this->r2c_plan) = fftwf_mpi_plan_many_dft_r2c(
-                        3, nfftw, ncomp(fc),
-                        FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
-                        (float*)this->data, (fftwf_complex*)this->data,
-                        this->comm,
-                        this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
-            }
-            if (typeid(rnumber) == typeid(double))
-            {
-                this->c2r_plan = new fftw_plan;
-                this->r2c_plan = new fftw_plan;
-                *((fftw_plan*)this->c2r_plan) = fftw_mpi_plan_many_dft_c2r(
-                        3, nfftw, ncomp(fc),
-                        FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
-                        (fftw_complex*)this->data, (double*)this->data,
-                        this->comm,
-                        this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
-                *((fftw_plan*)this->r2c_plan) = fftw_mpi_plan_many_dft_r2c(
-                        3, nfftw, ncomp(fc),
-                        FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
-                        (double*)this->data, (fftw_complex*)this->data,
-                        this->comm,
-                        this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
-            }
+            this->data = fftw_interface<rnumber>::alloc_real(
+                    this->rmemlayout->local_size);
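+            /* in-place transform: one buffer holds the real field (with
+             * FFTW's padding along the fastest dimension) and its DFT;
+             * TRANSPOSED_IN/OUT means the complex data is stored with
+             * the y index slowest, matching clayout above. */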
+            this->c2r_plan = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                    3, nfftw, ncomp(fc),
+                    FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                    (typename fftw_interface<rnumber>::complex*)this->data,
+                    this->data,
+                    this->comm,
+                    this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+            this->r2c_plan = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                    3, nfftw, ncomp(fc),
+                    FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                    this->data,
+                    (typename fftw_interface<rnumber>::complex*)this->data,
+                    this->comm,
+                    this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
             break;
     }
 }
@@ -223,21 +133,9 @@ field<rnumber, be, fc>::~field()
             delete this->rlayout;
             delete this->rmemlayout;
             delete this->clayout;
-            fftw_free(this->data);
-            if (typeid(rnumber) == typeid(float))
-            {
-                fftwf_destroy_plan(*(fftwf_plan*)this->c2r_plan);
-                delete (fftwf_plan*)this->c2r_plan;
-                fftwf_destroy_plan(*(fftwf_plan*)this->r2c_plan);
-                delete (fftwf_plan*)this->r2c_plan;
-            }
-            else if (typeid(rnumber) == typeid(double))
-            {
-                fftw_destroy_plan(*(fftw_plan*)this->c2r_plan);
-                delete (fftw_plan*)this->c2r_plan;
-                fftw_destroy_plan(*(fftw_plan*)this->r2c_plan);
-                delete (fftw_plan*)this->r2c_plan;
-            }
+            fftw_interface<rnumber>::free(this->data);
+            fftw_interface<rnumber>::destroy_plan(this->c2r_plan);
+            fftw_interface<rnumber>::destroy_plan(this->r2c_plan);
             break;
     }
 }
@@ -247,10 +145,8 @@ template <typename rnumber,
           field_components fc>
 void field<rnumber, be, fc>::ift()
 {
-    if (typeid(rnumber) == typeid(float))
-        fftwf_execute(*((fftwf_plan*)this->c2r_plan));
-    else if (typeid(rnumber) == typeid(double))
-        fftw_execute(*((fftw_plan*)this->c2r_plan));
+    TIMEZONE("field::ift");
+    fftw_interface<rnumber>::execute(this->c2r_plan);
     this->real_space_representation = true;
 }
 
@@ -259,10 +155,8 @@ template <typename rnumber,
           field_components fc>
 void field<rnumber, be, fc>::dft()
 {
-    if (typeid(rnumber) == typeid(float))
-        fftwf_execute(*((fftwf_plan*)this->r2c_plan));
-    else if (typeid(rnumber) == typeid(double))
-        fftw_execute(*((fftw_plan*)this->r2c_plan));
+    TIMEZONE("field::dft");
+    fftw_interface<rnumber>::execute(this->r2c_plan);
     this->real_space_representation = false;
 }
 
@@ -271,59 +165,340 @@ template <typename rnumber,
           field_components fc>
 int field<rnumber, be, fc>::io(
         const std::string fname,
-        const std::string dset_name,
+        const std::string field_name,
+        const int iteration,
+        const bool read)
+{
+    /* file dataset has same dimensions as field */
+    TIMEZONE("field::io");
+    hid_t file_id, dset_id, plist_id;
+    dset_id = H5I_BADID;
+    std::string representation = std::string(
+            this->real_space_representation ?
+                "real" : "complex");
+    std::string dset_name = (
+            "/" + field_name +
+            "/" + representation +
+            "/" + std::to_string(iteration));
+
+    /* open/create file */
+    plist_id = H5Pcreate(H5P_FILE_ACCESS);
+    H5Pset_fapl_mpio(plist_id, this->comm, MPI_INFO_NULL);
+    bool file_exists = false;
+    {
+        struct stat file_buffer;
+        file_exists = (stat(fname.c_str(), &file_buffer) == 0);
+    }
+    if (read)
+    {
+        assert(file_exists);
+        file_id = H5Fopen(fname.c_str(), H5F_ACC_RDONLY, plist_id);
+    }
+    else
+    {
+        if (file_exists)
+            file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id);
+        else
+            file_id = H5Fcreate(fname.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, plist_id);
+    }
+    assert(file_id >= 0);
+    H5Pclose(plist_id);
+
+    /* check what kind of representation is being used */
+    if (read)
+    {
+        dset_id = H5Dopen(
+                file_id,
+                dset_name.c_str(),
+                H5P_DEFAULT);
+        assert(dset_id >= 0);
+        hid_t dset_type = H5Dget_type(dset_id);
+        assert(dset_type >= 0);
+        bool io_for_real = (
+                H5Tequal(dset_type, H5T_IEEE_F32BE) ||
+                H5Tequal(dset_type, H5T_IEEE_F32LE) ||
+                H5Tequal(dset_type, H5T_INTEL_F32) ||
+                H5Tequal(dset_type, H5T_NATIVE_FLOAT) ||
+                H5Tequal(dset_type, H5T_IEEE_F64BE) ||
+                H5Tequal(dset_type, H5T_IEEE_F64LE) ||
+                H5Tequal(dset_type, H5T_INTEL_F64) ||
+                H5Tequal(dset_type, H5T_NATIVE_DOUBLE));
+        H5Tclose(dset_type);
+        assert(this->real_space_representation == io_for_real);
+    }
+
+    /* generic space initialization */
+    hid_t fspace, mspace;
+    hsize_t count[ndim(fc)], offset[ndim(fc)], dims[ndim(fc)];
+    hsize_t memoffset[ndim(fc)], memshape[ndim(fc)];
+
+    if (this->real_space_representation)
+    {
+        for (unsigned int i=0; i<ndim(fc); i++)
+        {
+            count[i] = this->rlayout->subsizes[i];
+            offset[i] = this->rlayout->starts[i];
+            dims[i] = this->rlayout->sizes[i];
+            memshape[i] = this->rmemlayout->subsizes[i];
+            memoffset[i] = 0;
+        }
+    }
+    else
+    {
+        for (unsigned int i=0; i<ndim(fc); i++)
+        {
+            count [i] = this->clayout->subsizes[i];
+            offset[i] = this->clayout->starts[i];
+            dims  [i] = this->clayout->sizes[i];
+            memshape [i] = count[i];
+            memoffset[i] = 0;
+        }
+    }
+    mspace = H5Screate_simple(ndim(fc), memshape, NULL);
+    H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL);
+
+    /* open/create data set */
+    if (read)
+        fspace = H5Dget_space(dset_id);
+    else
+    {
+        if (!H5Lexists(file_id, field_name.c_str(), H5P_DEFAULT))
+        {
+            hid_t gid_tmp = H5Gcreate(
+                    file_id, field_name.c_str(),
+                    H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+            H5Gclose(gid_tmp);
+        }
+
+        if (!H5Lexists(file_id, (field_name + "/" + representation).c_str(), H5P_DEFAULT))
+        {
+            hid_t gid_tmp = H5Gcreate(
+                    file_id, ("/" + field_name + "/" + representation).c_str(),
+                    H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+            H5Gclose(gid_tmp);
+        }
+        if (H5Lexists(file_id, dset_name.c_str(), H5P_DEFAULT))
+        {
+            dset_id = H5Dopen(file_id, dset_name.c_str(), H5P_DEFAULT);
+            fspace = H5Dget_space(dset_id);
+        }
+        else
+        {
+            fspace = H5Screate_simple(
+                    ndim(fc),
+                    dims,
+                    NULL);
+            /* chunking needs to go in here */
+            dset_id = H5Dcreate(
+                    file_id,
+                    dset_name.c_str(),
+                    (this->real_space_representation ? this->rnumber_H5T : this->cnumber_H5T),
+                    fspace,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT);
+        }
+    }
+    /* both dset_id and fspace should now have sane values */
+
+    /* check file space */
+    int ndims_fspace = H5Sget_simple_extent_dims(fspace, dims, NULL);
+    assert(((unsigned int)(ndims_fspace)) == ndim(fc));
+    if (this->real_space_representation)
+    {
+        for (unsigned int i=0; i<ndim(fc); i++)
+        {
+            offset[i] = this->rlayout->starts[i];
+            assert(dims[i] == this->rlayout->sizes[i]);
+        }
+        H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL);
+        if (read)
+        {
+            std::fill_n(this->data, this->rmemlayout->local_size, 0);
+            H5Dread(dset_id, this->rnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
+        }
+        else
+        {
+            assert(this->real_space_representation);
+            H5Dwrite(dset_id, this->rnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
+        }
+        H5Sclose(mspace);
+    }
+    else
+    {
+        for (unsigned int i=0; i<ndim(fc); i++)
+        {
+            offset[i] = this->clayout->starts[i];
+            assert(dims[i] == this->clayout->sizes[i]);
+        }
+        H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL);
+        if (read)
+        {
+            std::fill_n(this->data, this->clayout->local_size*2, 0);
+            H5Dread(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
+            this->symmetrize();
+        }
+        else
+        {
+            assert(!this->real_space_representation);
+            H5Dwrite(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
+        }
+        H5Sclose(mspace);
+    }
+
+    H5Sclose(fspace);
+    /* close data set */
+    H5Dclose(dset_id);
+    /* close file */
+    H5Fclose(file_id);
+    return EXIT_SUCCESS;
+}
+
+template <typename rnumber,
+          field_backend be,
+          field_components fc>
+int field<rnumber, be, fc>::io_database(
+        const std::string fname,
+        const std::string field_name,
         const int toffset,
         const bool read)
 {
+    /* the file dataset has a time dimension as well */
+    TIMEZONE("field::io_database");
     hid_t file_id, dset_id, plist_id;
-    hid_t dset_type;
-    bool io_for_real = false;
+    dset_id = H5I_BADID;
+    std::string representation = std::string(
+            this->real_space_representation ?
+                "real" : "complex");
+    std::string dset_name = (
+            "/" + field_name +
+            "/" + representation);
 
-    /* open file */
+    /* open/create file */
     plist_id = H5Pcreate(H5P_FILE_ACCESS);
     H5Pset_fapl_mpio(plist_id, this->comm, MPI_INFO_NULL);
+    bool file_exists = false;
+    {
+        struct stat file_buffer;
+        file_exists = (stat(fname.c_str(), &file_buffer) == 0);
+    }
     if (read)
+    {
+        assert(file_exists);
         file_id = H5Fopen(fname.c_str(), H5F_ACC_RDONLY, plist_id);
+    }
     else
-        file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id);
+    {
+        if (file_exists)
+            file_id = H5Fopen(fname.c_str(), H5F_ACC_RDWR, plist_id);
+        else
+            file_id = H5Fcreate(fname.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, plist_id);
+    }
     H5Pclose(plist_id);
 
-    /* open data set */
-    dset_id = H5Dopen(file_id, dset_name.c_str(), H5P_DEFAULT);
-    dset_type = H5Dget_type(dset_id);
-    io_for_real = (
-            H5Tequal(dset_type, H5T_IEEE_F32BE) ||
-            H5Tequal(dset_type, H5T_IEEE_F32LE) ||
-            H5Tequal(dset_type, H5T_INTEL_F32) ||
-            H5Tequal(dset_type, H5T_NATIVE_FLOAT) ||
-            H5Tequal(dset_type, H5T_IEEE_F64BE) ||
-            H5Tequal(dset_type, H5T_IEEE_F64LE) ||
-            H5Tequal(dset_type, H5T_INTEL_F64) ||
-            H5Tequal(dset_type, H5T_NATIVE_DOUBLE));
+    /* check what kind of representation is being used */
+    if (read)
+    {
+        dset_id = H5Dopen(
+                file_id,
+                dset_name.c_str(),
+                H5P_DEFAULT);
+        hid_t dset_type = H5Dget_type(dset_id);
+        bool io_for_real = (
+                H5Tequal(dset_type, H5T_IEEE_F32BE) ||
+                H5Tequal(dset_type, H5T_IEEE_F32LE) ||
+                H5Tequal(dset_type, H5T_INTEL_F32) ||
+                H5Tequal(dset_type, H5T_NATIVE_FLOAT) ||
+                H5Tequal(dset_type, H5T_IEEE_F64BE) ||
+                H5Tequal(dset_type, H5T_IEEE_F64LE) ||
+                H5Tequal(dset_type, H5T_INTEL_F64) ||
+                H5Tequal(dset_type, H5T_NATIVE_DOUBLE));
+        H5Tclose(dset_type);
+        assert(this->real_space_representation == io_for_real);
+    }
 
     /* generic space initialization */
     hid_t fspace, mspace;
-    fspace = H5Dget_space(dset_id);
     hsize_t count[ndim(fc)+1], offset[ndim(fc)+1], dims[ndim(fc)+1];
     hsize_t memoffset[ndim(fc)+1], memshape[ndim(fc)+1];
-    H5Sget_simple_extent_dims(fspace, dims, NULL);
+
+    const int dim_counter_offset = 1;
     count[0] = 1;
-    offset[0] = toffset;
     memshape[0] = 1;
     memoffset[0] = 0;
-    if (io_for_real)
+    if (this->real_space_representation)
     {
         for (unsigned int i=0; i<ndim(fc); i++)
         {
-            count[i+1] = this->rlayout->subsizes[i];
-            offset[i+1] = this->rlayout->starts[i];
-            assert(dims[i+1] == this->rlayout->sizes[i]);
-            memshape[i+1] = this->rmemlayout->subsizes[i];
-            memoffset[i+1] = 0;
+            count[i+dim_counter_offset] = this->rlayout->subsizes[i];
+            offset[i+dim_counter_offset] = this->rlayout->starts[i];
+            dims[i+dim_counter_offset] = this->rlayout->sizes[i];
+            memshape[i+dim_counter_offset] = this->rmemlayout->subsizes[i];
+            memoffset[i+dim_counter_offset] = 0;
         }
-        mspace = H5Screate_simple(ndim(fc)+1, memshape, NULL);
-        H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL);
+        mspace = H5Screate_simple(dim_counter_offset + ndim(fc), memshape, NULL);
+        H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL);
+    }
+    else
+    {
+        for (unsigned int i=0; i<ndim(fc); i++)
+        {
+            count[i+dim_counter_offset] = this->clayout->subsizes[i];
+            offset[i+dim_counter_offset] = this->clayout->starts[i];
+            dims[i+dim_counter_offset] = this->clayout->sizes[i];
+            memshape[i+dim_counter_offset] = count[i+dim_counter_offset];
+            memoffset[i+dim_counter_offset] = 0;
+        }
+        mspace = H5Screate_simple(dim_counter_offset + ndim(fc), memshape, NULL);
         H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL);
+    }
+
+    /* open/create data set */
+    if (read)
+        fspace = H5Dget_space(dset_id);
+    else
+    {
+        if (!H5Lexists(file_id, field_name.c_str(), H5P_DEFAULT))
+        {
+            hid_t gid_tmp = H5Gcreate(
+                    file_id, field_name.c_str(),
+                    H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+            H5Gclose(gid_tmp);
+        }
+        if (H5Lexists(file_id, dset_name.c_str(), H5P_DEFAULT))
+        {
+            dset_id = H5Dopen(file_id, dset_name.c_str(), H5P_DEFAULT);
+            fspace = H5Dget_space(dset_id);
+        }
+        else
+        {
+            /* the new dataset needs the leading time dimension too,
+             * otherwise the extent check below fails */
+            dims[0] = toffset + 1;
+            fspace = H5Screate_simple(
+                    ndim(fc) + 1,
+                    dims,
+                    NULL);
+            /* chunking needs to go in here */
+            dset_id = H5Dcreate(
+                    file_id,
+                    dset_name.c_str(),
+                    (this->real_space_representation ? this->rnumber_H5T : this->cnumber_H5T),
+                    fspace,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT);
+        }
+    }
+    /* both dset_id and fspace should now have sane values */
+
+    /* check file space */
+    int ndims_fspace = H5Sget_simple_extent_dims(fspace, dims, NULL);
+    assert(ndims_fspace == int(ndim(fc) + 1));
+    offset[0] = toffset;
+    if (this->real_space_representation)
+    {
+        for (unsigned int i=0; i<ndim(fc); i++)
+        {
+            offset[i+dim_counter_offset] = this->rlayout->starts[i];
+            assert(dims[i+dim_counter_offset] == this->rlayout->sizes[i]);
+        }
+        H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL);
         if (read)
         {
             std::fill_n(this->data, this->rmemlayout->local_size, 0);
@@ -332,13 +507,8 @@ int field<rnumber, be, fc>::io(
         }
         else
         {
+            assert(this->real_space_representation);
             H5Dwrite(dset_id, this->rnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
-            if (!this->real_space_representation)
-                /* in principle we could do an inverse Fourier transform in here,
-                 * however that would be unsafe since we wouldn't know whether we'd need to
-                 * normalize or not.
-                 * */
-                DEBUG_MSG("I just wrote complex field into real space dataset. It's probably nonsense.\n");
         }
         H5Sclose(mspace);
     }
@@ -346,30 +516,24 @@ int field<rnumber, be, fc>::io(
     {
         for (unsigned int i=0; i<ndim(fc); i++)
         {
-            count[i+1] = this->clayout->subsizes[i];
-            offset[i+1] = this->clayout->starts[i];
-            assert(dims[i+1] == this->clayout->sizes[i]);
-            memshape[i+1] = count[i+1];
-            memoffset[i+1] = 0;
+            offset[i+dim_counter_offset] = this->clayout->starts[i];
+            assert(dims[i+dim_counter_offset] == this->clayout->sizes[i]);
         }
-        mspace = H5Screate_simple(ndim(fc)+1, memshape, NULL);
         H5Sselect_hyperslab(fspace, H5S_SELECT_SET, offset, NULL, count, NULL);
-        H5Sselect_hyperslab(mspace, H5S_SELECT_SET, memoffset, NULL, count, NULL);
         if (read)
         {
             H5Dread(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
             this->real_space_representation = false;
+            this->symmetrize();
         }
         else
         {
+            assert(!this->real_space_representation);
             H5Dwrite(dset_id, this->cnumber_H5T, mspace, fspace, H5P_DEFAULT, this->data);
-            if (this->real_space_representation)
-                DEBUG_MSG("I just wrote real space field into complex dataset. It's probably nonsense.\n");
         }
         H5Sclose(mspace);
     }
 
-    H5Tclose(dset_type);
     H5Sclose(fspace);
     /* close data set */
     H5Dclose(dset_id);
@@ -378,17 +542,127 @@ int field<rnumber, be, fc>::io(
     return EXIT_SUCCESS;
 }
 
+
 template <typename rnumber,
           field_backend be,
           field_components fc>
-void field<rnumber, be, fc>::compute_rspace_stats(
+int field<rnumber, be, fc>::write_0slice(
+        const hid_t group,
+        const std::string field_name,
+        const int iteration)
+{
+    TIMEZONE("field::write_0slice");
+    assert(this->real_space_representation);
+    assert(fc == THREE);
+    if (this->myrank == 0)
+    {
+        hid_t dset, wspace, mspace;
+        int ndims;
+        hsize_t count[4], offset[4], dims[4];
+        offset[0] = iteration;
+        offset[1] = 0;
+        offset[2] = 0;
+        offset[3] = 0;
+        dset = H5Dopen(
+                group,
+                ("0slices/" + field_name + "/real").c_str(),
+                H5P_DEFAULT);
+        wspace = H5Dget_space(dset);
+        ndims = H5Sget_simple_extent_dims(wspace, dims, NULL);
+        // array in memory has 2 extra x points, because FFTW
+        count[0] = 1;
+        count[1] = this->rmemlayout->sizes[1];
+        count[2] = this->rmemlayout->sizes[2];
+        count[3] = 3;
+        mspace = H5Screate_simple(ndims, count, NULL);
+        // array in file should not have the extra 2 points
+        count[1] = this->rlayout->sizes[1];
+        count[2] = this->rlayout->sizes[2];
+        // select right slice in file
+        H5Sselect_hyperslab(
+            wspace,
+            H5S_SELECT_SET,
+            offset,
+            NULL,
+            count,
+            NULL);
+        offset[0] = 0;
+        // select proper regions of memory
+        H5Sselect_hyperslab(
+            mspace,
+            H5S_SELECT_SET,
+            offset,
+            NULL,
+            count,
+            NULL);
+        H5Dwrite(
+            dset,
+            this->rnumber_H5T,
+            mspace,
+            wspace,
+            H5P_DEFAULT,
+            this->data);
+        H5Dclose(dset);
+        H5Sclose(mspace);
+        H5Sclose(wspace);
+    }
+    return EXIT_SUCCESS;
+}
+
+
+template <typename rnumber,
+          field_backend be,
+          field_components fc>
+void field<rnumber, be, fc>::compute_rspace_xincrement_stats(
+                const int xcells,
                 const hid_t group,
                 const std::string dset_name,
                 const hsize_t toffset,
                 const std::vector<double> max_estimate)
 {
+    TIMEZONE("field::compute_rspace_xincrement_stats");
     assert(this->real_space_representation);
     assert(fc == ONE || fc == THREE);
+    field<rnumber, be, fc> *tmp_field = new field<rnumber, be, fc>(
+            this->rlayout->sizes[2],
+            this->rlayout->sizes[1],
+            this->rlayout->sizes[0],
+            this->rlayout->comm);
+    tmp_field->real_space_representation = true;
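+    /* fill tmp_field with the x increment
+     * du(x) = u(x + xcells*dx) - u(x),
+     * wrapping around periodically in the x direction */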
+    this->RLOOP(
+            [&](ptrdiff_t rindex,
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex){
+            hsize_t rrindex = (xindex + xcells)%this->rlayout->sizes[2] + (
+                    zindex * this->rlayout->subsizes[1] + yindex)*(
+                        this->rmemlayout->subsizes[2]);
+            for (unsigned int component=0; component < ncomp(fc); component++)
+                tmp_field->data[rindex*ncomp(fc) + component] =
+                    this->data[rrindex*ncomp(fc) + component] -
+                    this->data[rindex*ncomp(fc) + component];
+            });
+    tmp_field->compute_rspace_stats(
+            group,
+            dset_name,
+            toffset,
+            max_estimate);
+    delete tmp_field;
+}
+
+
+
+template <typename rnumber,
+          field_backend be,
+          field_components fc>
+void field<rnumber, be, fc>::compute_rspace_stats(
+                const hid_t group,
+                const std::string dset_name,
+                const hsize_t toffset,
+                const std::vector<double> max_estimate)
+{
+    TIMEZONE("field::compute_rspace_stats");
+    assert(this->real_space_representation);
     const unsigned int nmoments = 10;
     int nvals, nbins;
     if (this->myrank == 0)
@@ -427,25 +701,41 @@ void field<rnumber, be, fc>::compute_rspace_stats(
         H5Sclose(wspace);
         H5Dclose(dset);
     }
-    MPI_Bcast(&nvals, 1, MPI_INT, 0, this->comm);
-    MPI_Bcast(&nbins, 1, MPI_INT, 0, this->comm);
+    {
+        TIMEZONE("MPI_Bcast");
+        MPI_Bcast(&nvals, 1, MPI_INT, 0, this->comm);
+        MPI_Bcast(&nbins, 1, MPI_INT, 0, this->comm);
+    }
     assert(nvals == int(max_estimate.size()));
-    double *moments = new double[nmoments*nvals];
-    double *local_moments = new double[nmoments*nvals];
-    double *val_tmp = new double[nvals];
+
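+    /* each thread gets its own copy of the accumulators; the lambda
+     * passed to shared_array initializes every copy, and the copies
+     * are merged after the loop below. */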
+    shared_array<double> local_moments_threaded(nmoments*nvals, [&](double* local_moments){
+        std::fill_n(local_moments, nmoments*nvals, 0);
+        if (nvals == 4) local_moments[3] = max_estimate[3];
+    });
+
+    shared_array<double> val_tmp_threaded(nvals,[&](double *val_tmp){
+        std::fill_n(val_tmp, nvals, 0);
+    });
+
+    shared_array<ptrdiff_t> local_hist_threaded(nbins*nvals,[&](ptrdiff_t* local_hist){
+        std::fill_n(local_hist, nbins*nvals, 0);
+    });
+
     double *binsize = new double[nvals];
-    double *pow_tmp = new double[nvals];
-    ptrdiff_t *hist = new ptrdiff_t[nbins*nvals];
-    ptrdiff_t *local_hist = new ptrdiff_t[nbins*nvals];
-    int bin;
     for (int i=0; i<nvals; i++)
         binsize[i] = 2*max_estimate[i] / nbins;
-    std::fill_n(local_hist, nbins*nvals, 0);
-    std::fill_n(local_moments, nmoments*nvals, 0);
-    if (nvals == 4) local_moments[3] = max_estimate[3];
-    FIELD_RLOOP(
-            this,
-            std::fill_n(pow_tmp, nvals, 1.0);
+
+    {
+        TIMEZONE("field::RLOOP");
+        this->RLOOP(
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+            double *local_moments = local_moments_threaded.getMine();
+            double *val_tmp = val_tmp_threaded.getMine();
+            ptrdiff_t *local_hist = local_hist_threaded.getMine();
+
             if (nvals == int(4)) val_tmp[3] = 0.0;
             for (unsigned int i=0; i<ncomp(fc); i++)
             {
@@ -459,9 +749,10 @@ void field<rnumber, be, fc>::compute_rspace_stats(
                     local_moments[0*nvals+3] = val_tmp[3];
                 if (val_tmp[3] > local_moments[9*nvals+3])
                     local_moments[9*nvals+3] = val_tmp[3];
-                bin = int(floor(val_tmp[3]*2/binsize[3]));
-                if (bin >= 0 && bin < nbins)
+                int bin = int(floor(val_tmp[3]*2/binsize[3]));
+                if (bin >= 0 && bin < nbins){
                     local_hist[bin*nvals+3]++;
+                }
             }
             for (unsigned int i=0; i<ncomp(fc); i++)
             {
@@ -469,44 +760,70 @@ void field<rnumber, be, fc>::compute_rspace_stats(
                     local_moments[0*nvals+i] = val_tmp[i];
                 if (val_tmp[i] > local_moments[(nmoments-1)*nvals+i])
                     local_moments[(nmoments-1)*nvals+i] = val_tmp[i];
-                bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i]));
+                int bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i]));
                 if (bin >= 0 && bin < nbins)
                     local_hist[bin*nvals+i]++;
             }
-            for (int n=1; n < int(nmoments)-1; n++)
-                for (int i=0; i<nvals; i++)
-                    local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]);
-            );
-    MPI_Allreduce(
-            (void*)local_moments,
-            (void*)moments,
-            nvals,
-            MPI_DOUBLE, MPI_MIN, this->comm);
-    MPI_Allreduce(
-            (void*)(local_moments + nvals),
-            (void*)(moments+nvals),
-            (nmoments-2)*nvals,
-            MPI_DOUBLE, MPI_SUM, this->comm);
-    MPI_Allreduce(
-            (void*)(local_moments + (nmoments-1)*nvals),
-            (void*)(moments+(nmoments-1)*nvals),
-            nvals,
-            MPI_DOUBLE, MPI_MAX, this->comm);
-    MPI_Allreduce(
-            (void*)local_hist,
-            (void*)hist,
-            nbins*nvals,
-            MPI_INT64_T, MPI_SUM, this->comm);
+            for (int i=0; i<nvals; i++)
+            {
+                /* accumulate val_tmp[i]^n into the n-th moment */
+                double pow_tmp = val_tmp[i];
+                for (int n=1; n < int(nmoments)-1; n++)
+                {
+                    local_moments[n*nvals + i] += pow_tmp;
+                    pow_tmp *= val_tmp[i];
+                }
+            }
+                });
+
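+        /* combine the per-thread copies: slots holding minima are
+         * merged with min, slots holding maxima with max, and the
+         * running sums are simply added */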
+        TIMEZONE("FIELD_RLOOP::Merge");
+        local_moments_threaded.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double {
+            if(nvals == int(4) && idx == 0*nvals+3){
+                return std::min(v1, v2);
+            }
+            if(nvals == int(4) && idx == 9*nvals+3){
+                return std::max(v1, v2);
+            }
+            if(idx < int(ncomp(fc))){
+                return std::min(v1, v2);
+            }
+            if(int(nmoments-1)*nvals <= idx && idx < int(int(nmoments-1)*nvals+ncomp(fc))){
+                return std::max(v1, v2);
+            }
+            return v1 + v2;
+        });
+
+        local_hist_threaded.mergeParallel();
+    }
+    ptrdiff_t *hist = new ptrdiff_t[nbins*nvals];
+    double *moments = new double[nmoments*nvals];
+    {
+        TIMEZONE("MPI_Allreduce");
+        MPI_Allreduce(
+                (void*)local_moments_threaded.getMasterData(),
+                (void*)moments,
+                nvals,
+                MPI_DOUBLE, MPI_MIN, this->comm);
+        MPI_Allreduce(
+                (void*)(local_moments_threaded.getMasterData() + nvals),
+                (void*)(moments+nvals),
+                (nmoments-2)*nvals,
+                MPI_DOUBLE, MPI_SUM, this->comm);
+        MPI_Allreduce(
+                (void*)(local_moments_threaded.getMasterData() + (nmoments-1)*nvals),
+                (void*)(moments+(nmoments-1)*nvals),
+                nvals,
+                MPI_DOUBLE, MPI_MAX, this->comm);
+        MPI_Allreduce(
+                (void*)local_hist_threaded.getMasterData(),
+                (void*)hist,
+                nbins*nvals,
+                MPI_INT64_T, MPI_SUM, this->comm);
+    }
     for (int n=1; n < int(nmoments)-1; n++)
         for (int i=0; i<nvals; i++)
             moments[n*nvals + i] /= this->npoints;
-    delete[] local_moments;
-    delete[] local_hist;
-    delete[] val_tmp;
+
     delete[] binsize;
-    delete[] pow_tmp;
     if (this->myrank == 0)
     {
+        TIMEZONE("root-work");
         hid_t dset, wspace, mspace;
         hsize_t count[ndim(fc)-1], offset[ndim(fc)-1], dims[ndim(fc)-1];
         dset = H5Dopen(group, ("moments/" + dset_name).c_str(), H5P_DEFAULT);
@@ -543,6 +860,11 @@ void field<rnumber, be, fc>::compute_rspace_stats(
         H5Sclose(wspace);
         H5Sclose(mspace);
         H5Dclose(dset);
+        if (H5Lexists(group, "0slices", H5P_DEFAULT))
+            this->write_0slice(
+                    group,
+                    dset_name,
+                    toffset);
     }
     delete[] moments;
     delete[] hist;
@@ -557,6 +879,86 @@ void field<rnumber, be, fc>::normalize()
             this->data[tmp_index] /= this->npoints;
 }
 
+template <typename rnumber,
+          field_backend be,
+          field_components fc>
+void field<rnumber, be, fc>::symmetrize()
+{
+    TIMEZONE("field::symmetrize");
+    assert(!this->real_space_representation);
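+    /* in r2c storage only the kx = 0 modes contain both a mode and its
+     * conjugate partner, so Hermitian symmetry u(-k) = conj(u(k)) is
+     * enforced there: first along kz on the ky = 0 slab, then pairwise
+     * between the +ky and -ky slabs. */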
+    ptrdiff_t ii, cc;
+    typename fftw_interface<rnumber>::complex *data = this->get_cdata();
+    MPI_Status *mpistatus = new MPI_Status;
+    if (this->myrank == this->clayout->rank[0][0])
+    {
+        for (cc = 0; cc < ncomp(fc); cc++)
+            data[cc][1] = 0.0;
+        for (ii = 1; ii < ptrdiff_t(this->clayout->sizes[1]/2); ii++)
+            for (cc = 0; cc < ncomp(fc); cc++) {
+                ( *(data + cc + ncomp(fc)*(this->clayout->sizes[1] - ii)*this->clayout->sizes[2]))[0] =
+                 (*(data + cc + ncomp(fc)*(                          ii)*this->clayout->sizes[2]))[0];
+                ( *(data + cc + ncomp(fc)*(this->clayout->sizes[1] - ii)*this->clayout->sizes[2]))[1] =
+                -(*(data + cc + ncomp(fc)*(                          ii)*this->clayout->sizes[2]))[1];
+            }
+    }
+    typename fftw_interface<rnumber>::complex *buffer;
+    buffer = fftw_interface<rnumber>::alloc_complex(ncomp(fc)*this->clayout->sizes[1]);
+    ptrdiff_t yy;
+    /*ptrdiff_t tindex;*/
+    int ranksrc, rankdst;
+    for (yy = 1; yy < ptrdiff_t(this->clayout->sizes[0]/2); yy++) {
+        ranksrc = this->clayout->rank[0][yy];
+        rankdst = this->clayout->rank[0][this->clayout->sizes[0] - yy];
+        if (this->clayout->myrank == ranksrc)
+            for (ii = 0; ii < ptrdiff_t(this->clayout->sizes[1]); ii++)
+                for (cc = 0; cc < ncomp(fc); cc++)
+                    for (int imag_comp=0; imag_comp<2; imag_comp++)
+                        (*(buffer + ncomp(fc)*ii+cc))[imag_comp] =
+                            (*(data + ncomp(fc)*((yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[imag_comp];
+        if (ranksrc != rankdst)
+        {
+            if (this->clayout->myrank == ranksrc)
+                MPI_Send((void*)buffer,
+                         ncomp(fc)*this->clayout->sizes[1], mpi_real_type<rnumber>::complex(), rankdst, yy,
+                        this->clayout->comm);
+            if (this->clayout->myrank == rankdst)
+                MPI_Recv((void*)buffer,
+                         ncomp(fc)*this->clayout->sizes[1], mpi_real_type<rnumber>::complex(), ranksrc, yy,
+                        this->clayout->comm, mpistatus);
+        }
+        if (this->clayout->myrank == rankdst)
+        {
+            for (ii = 1; ii < ptrdiff_t(this->clayout->sizes[1]); ii++)
+                for (cc = 0; cc < ncomp(fc); cc++)
+                {
+                    (*(data + ncomp(fc)*((this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[0] =
+                            (*(buffer + ncomp(fc)*(this->clayout->sizes[1]-ii)+cc))[0];
+                    (*(data + ncomp(fc)*((this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1] + ii)*this->clayout->sizes[2] + cc))[1] =
+                            -(*(buffer + ncomp(fc)*(this->clayout->sizes[1]-ii)+cc))[1];
+                }
+            for (cc = 0; cc < ncomp(fc); cc++)
+            {
+                (*((data + cc + ncomp(fc)*(this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1]*this->clayout->sizes[2])))[0] =  (*(buffer + cc))[0];
+                (*((data + cc + ncomp(fc)*(this->clayout->sizes[0] - yy - this->clayout->starts[0])*this->clayout->sizes[1]*this->clayout->sizes[2])))[1] = -(*(buffer + cc))[1];
+            }
+        }
+    }
+    fftw_interface<rnumber>::free(buffer);
+    delete mpistatus;
+    /* put asymmetric data to 0 */
+    /*if (this->clayout->myrank == this->clayout->rank[0][this->clayout->sizes[0]/2])
+    {
+        tindex = ncomp(fc)*(this->clayout->sizes[0]/2 - this->clayout->starts[0])*this->clayout->sizes[1]*this->clayout->sizes[2];
+        for (ii = 0; ii < this->clayout->sizes[1]; ii++)
+        {
+            std::fill_n((rnumber*)(data + tindex), ncomp(fc)*2*this->clayout->sizes[2], 0.0);
+            tindex += ncomp(fc)*this->clayout->sizes[2];
+        }
+    }
+    tindex = ncomp(fc)*();
+    std::fill_n((rnumber*)(data + tindex), ncomp(fc)*2, 0.0);*/
+}
+
 template <typename rnumber,
           field_backend be,
           field_components fc>
@@ -568,6 +970,7 @@ void field<rnumber, be, fc>::compute_stats(
         const hsize_t toffset,
         const double max_estimate)
 {
+    TIMEZONE("field::compute_stats");
     std::vector<double> max_estimate_vector;
     bool did_rspace = false;
     switch(fc)
@@ -585,6 +988,7 @@ void field<rnumber, be, fc>::compute_stats(
     }
     if (this->real_space_representation)
     {
+        TIMEZONE("field::compute_stats::compute_rspace_stats");
         this->compute_rspace_stats(
                 group,
                 dset_name,
@@ -593,14 +997,15 @@ void field<rnumber, be, fc>::compute_stats(
         did_rspace = true;
         this->dft();
         // normalize
+        TIMEZONE("field::normalize");
         for (hsize_t tmp_index=0; tmp_index<this->rmemlayout->local_size; tmp_index++)
             this->data[tmp_index] /= this->npoints;
     }
     // what follows gave me a headache until I found this link:
     // http://stackoverflow.com/questions/8256636/expected-primary-expression-error-on-template-method-using
     kk->template cospectrum<rnumber, fc>(
-            (cnumber*)this->data,
-            (cnumber*)this->data,
+            (typename fftw_interface<rnumber>::complex*)this->data,
+            (typename fftw_interface<rnumber>::complex*)this->data,
             group,
             dset_name + "_" + dset_name,
             toffset);
@@ -616,218 +1021,62 @@ void field<rnumber, be, fc>::compute_stats(
     }
 }
 
-template <field_backend be,
-          kspace_dealias_type dt>
-template <field_components fc>
-kspace<be, dt>::kspace(
-        const field_layout<fc> *source_layout,
-        const double DKX,
-        const double DKY,
-        const double DKZ)
-{
-    /* get layout */
-    this->layout = new field_layout<ONE>(
-            source_layout->sizes,
-            source_layout->subsizes,
-            source_layout->starts,
-            source_layout->comm);
-
-    /* store dk values */
-    this->dkx = DKX;
-    this->dky = DKY;
-    this->dkz = DKZ;
-
-    /* compute kx, ky, kz and compute kM values */
-    switch(be)
-    {
-        case FFTW:
-            this->kx.resize(this->layout->sizes[2]);
-            this->ky.resize(this->layout->subsizes[0]);
-            this->kz.resize(this->layout->sizes[1]);
-            int i, ii;
-            for (i = 0; i<int(this->layout->sizes[2]); i++)
-                this->kx[i] = i*this->dkx;
-            for (i = 0; i<int(this->layout->subsizes[0]); i++)
-            {
-                ii = i + this->layout->starts[0];
-                if (ii <= int(this->layout->sizes[1]/2))
-                    this->ky[i] = this->dky*ii;
-                else
-                    this->ky[i] = this->dky*(ii - int(this->layout->sizes[1]));
-            }
-            for (i = 0; i<int(this->layout->sizes[1]); i++)
-            {
-                if (i <= int(this->layout->sizes[0]/2))
-                    this->kz[i] = this->dkz*i;
-                else
-                    this->kz[i] = this->dkz*(i - int(this->layout->sizes[0]));
-            }
-            switch(dt)
-            {
-                case TWO_THIRDS:
-                    this->kMx = this->dkx*(int(2*(int(this->layout->sizes[2])-1)/3)-1);
-                    this->kMy = this->dky*(int(this->layout->sizes[0] / 3)-1);
-                    this->kMz = this->dkz*(int(this->layout->sizes[1] / 3)-1);
-                    break;
-                case SMOOTH:
-                    this->kMx = this->dkx*(int(this->layout->sizes[2])-2);
-                    this->kMy = this->dky*(int(this->layout->sizes[0] / 2)-1);
-                    this->kMz = this->dkz*(int(this->layout->sizes[1] / 2)-1);
-                    break;
-            }
-            break;
-    }
-
-    /* get global kM and dk */
-    this->kM = this->kMx;
-    if (this->kM < this->kMy) this->kM = this->kMy;
-    if (this->kM < this->kMz) this->kM = this->kMz;
-    this->kM2 = this->kM * this->kM;
-    this->dk = this->dkx;
-    if (this->dk > this->dky) this->dk = this->dky;
-    if (this->dk > this->dkz) this->dk = this->dkz;
-    this->dk2 = this->dk*this->dk;
-
-    /* spectra stuff */
-    this->nshells = int(this->kM / this->dk) + 2;
-    this->kshell.resize(this->nshells, 0);
-    this->nshell.resize(this->nshells, 0);
-    std::vector<double> kshell_local;
-    kshell_local.resize(this->nshells, 0);
-    std::vector<int64_t> nshell_local;
-    nshell_local.resize(this->nshells, 0);
-    double knorm;
-    KSPACE_CLOOP_K2_NXMODES(
-            this,
-            if (k2 < this->kM2)
-            {
-                knorm = sqrt(k2);
-                nshell_local[int(knorm/this->dk)] += nxmodes;
-                kshell_local[int(knorm/this->dk)] += nxmodes*knorm;
-            }
-            if (dt == TWO_THIRDS)
-                this->dealias_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));
-            );
-    MPI_Allreduce(
-            &nshell_local.front(),
-            &this->nshell.front(),
-            this->nshells,
-            MPI_INT64_T, MPI_SUM, this->layout->comm);
-    MPI_Allreduce(
-            &kshell_local.front(),
-            &this->kshell.front(),
-            this->nshells,
-            MPI_DOUBLE, MPI_SUM, this->layout->comm);
-    for (int n=0; n<this->nshells; n++)
-        this->kshell[n] /= this->nshell[n];
-}
-
-template <field_backend be,
-          kspace_dealias_type dt>
-kspace<be, dt>::~kspace()
-{
-    delete this->layout;
-}
-
-template <field_backend be,
-          kspace_dealias_type dt>
-template <typename rnumber,
-          field_components fc>
-void kspace<be, dt>::low_pass(rnumber *__restrict__ a, const double kmax)
-{
-    const double km2 = kmax*kmax;
-    KSPACE_CLOOP_K2(
-            this,
-            if (k2 >= km2)
-                std::fill_n(a + 2*ncomp(fc)*cindex, 2*ncomp(fc), 0);
-            );
-}
-
-template <field_backend be,
-          kspace_dealias_type dt>
 template <typename rnumber,
-          field_components fc>
-void kspace<be, dt>::dealias(rnumber *__restrict__ a)
-{
-    switch(be)
-    {
-        case TWO_THIRDS:
-            this->low_pass<rnumber, fc>(a, this->kM);
-            break;
-        case SMOOTH:
-            KSPACE_CLOOP_K2(
-                    this,
-                    double tval = this->dealias_filter[int(round(k2 / this->dk2))];
-                    for (int tcounter=0; tcounter<2*ncomp(fc); tcounter++)
-                        a[2*ncomp(fc)*cindex + tcounter] *= tval;
-                    );
-            break;
-    }
-}
-
-template <field_backend be,
+          field_backend be,
+          field_components fc1,
+          field_components fc2,
           kspace_dealias_type dt>
-template <typename rnumber,
-          field_components fc>
-void kspace<be, dt>::cospectrum(
-        const rnumber(* __restrict a)[2],
-        const rnumber(* __restrict b)[2],
-        const hid_t group,
-        const std::string dset_name,
-        const hsize_t toffset)
+void compute_gradient(
+        kspace<be, dt> *kk,
+        field<rnumber, be, fc1> *src,
+        field<rnumber, be, fc2> *dst)
 {
-    std::vector<double> spec, spec_local;
-    spec.resize(this->nshells*ncomp(fc)*ncomp(fc), 0);
-    spec_local.resize(this->nshells*ncomp(fc)*ncomp(fc), 0);
-    KSPACE_CLOOP_K2_NXMODES(
-            this,
-            if (k2 <= this->kM2)
+    TIMEZONE("compute_gradient");
+    assert(!src->real_space_representation);
+    assert((fc1 == ONE && fc2 == THREE) ||
+           (fc1 == THREE && fc2 == THREExTHREE));
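+    /* differentiation in Fourier space: d/dx_j becomes multiplication
+     * by i*k_j, i.e. (re, im) -> (-k_j*im, k_j*re) for each component */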
+    kk->CLOOP_K2(
+            [&](ptrdiff_t cindex,
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex,
+                double k2){
+            if (k2 < kk->kM2) switch(fc1)
             {
-                int tmp_int = int(sqrt(k2) / this->dk)*ncomp(fc)*ncomp(fc);
-                for (hsize_t i=0; i<ncomp(fc); i++)
-                for (hsize_t j=0; j<ncomp(fc); j++)
-                    spec_local[tmp_int + i*ncomp(fc)+j] += nxmodes * (
-                    (a[ncomp(fc)*cindex + i][0] * b[ncomp(fc)*cindex + j][0]) +
-                    (a[ncomp(fc)*cindex + i][1] * b[ncomp(fc)*cindex + j][1]));
+                case ONE:
+                    dst->cval(cindex, 0, 0) = -kk->kx[xindex]*src->cval(cindex, 1);
+                    dst->cval(cindex, 0, 1) =  kk->kx[xindex]*src->cval(cindex, 0);
+                    dst->cval(cindex, 1, 0) = -kk->ky[yindex]*src->cval(cindex, 1);
+                    dst->cval(cindex, 1, 1) =  kk->ky[yindex]*src->cval(cindex, 0);
+                    dst->cval(cindex, 2, 0) = -kk->kz[zindex]*src->cval(cindex, 1);
+                    dst->cval(cindex, 2, 1) =  kk->kz[zindex]*src->cval(cindex, 0);
+                    break;
+                case THREE:
+                    for (unsigned int field_component = 0;
+                         field_component < ncomp(fc1);
+                         field_component++)
+                    {
+                        dst->cval(cindex, 0, field_component, 0) = -kk->kx[xindex]*src->cval(cindex, field_component, 1);
+                        dst->cval(cindex, 0, field_component, 1) =  kk->kx[xindex]*src->cval(cindex, field_component, 0);
+                        dst->cval(cindex, 1, field_component, 0) = -kk->ky[yindex]*src->cval(cindex, field_component, 1);
+                        dst->cval(cindex, 1, field_component, 1) =  kk->ky[yindex]*src->cval(cindex, field_component, 0);
+                        dst->cval(cindex, 2, field_component, 0) = -kk->kz[zindex]*src->cval(cindex, field_component, 1);
+                        dst->cval(cindex, 2, field_component, 1) =  kk->kz[zindex]*src->cval(cindex, field_component, 0);
+                    }
             }
-            );
-    MPI_Allreduce(
-            &spec_local.front(),
-            &spec.front(),
-            spec.size(),
-            MPI_DOUBLE, MPI_SUM, this->layout->comm);
-    if (this->layout->myrank == 0)
-    {
-        hid_t dset, wspace, mspace;
-        hsize_t count[(ndim(fc)-2)*2], offset[(ndim(fc)-2)*2], dims[(ndim(fc)-2)*2];
-        dset = H5Dopen(group, ("spectra/" + dset_name).c_str(), H5P_DEFAULT);
-        wspace = H5Dget_space(dset);
-        H5Sget_simple_extent_dims(wspace, dims, NULL);
-        switch (fc)
-        {
-            case THREExTHREE:
-                offset[4] = 0;
-                offset[5] = 0;
-                count[4] = ncomp(fc);
-                count[5] = ncomp(fc);
-            case THREE:
-                offset[2] = 0;
-                offset[3] = 0;
-                count[2] = ncomp(fc);
-                count[3] = ncomp(fc);
-            default:
-                offset[0] = toffset;
-                offset[1] = 0;
-                count[0] = 1;
-                count[1] = this->nshells;
-        }
-        mspace = H5Screate_simple((ndim(fc)-2)*2, count, NULL);
-        H5Sselect_hyperslab(wspace, H5S_SELECT_SET, offset, NULL, count, NULL);
-        H5Dwrite(dset, H5T_NATIVE_DOUBLE, mspace, wspace, H5P_DEFAULT, &spec.front());
-        H5Sclose(wspace);
-        H5Sclose(mspace);
-        H5Dclose(dset);
-    }
+            });
 }
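(Note: the loop above is the spectral derivative identity written out for interleaved real/imaginary storage. For a Fourier coefficient \hat{u}(\mathbf{k}) = a + i b, differentiation along direction j multiplies by i k_j:

    i k_j (a + i b) = -k_j b + i k_j a,

which is why every real part of dst is assigned -k_j times the imaginary part of src, and every imaginary part +k_j times the real part.)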
 
 template class field<float, FFTW, ONE>;
@@ -837,49 +1086,6 @@ template class field<double, FFTW, ONE>;
 template class field<double, FFTW, THREE>;
 template class field<double, FFTW, THREExTHREE>;
 
-template class kspace<FFTW, TWO_THIRDS>;
-template class kspace<FFTW, SMOOTH>;
-
-template kspace<FFTW, TWO_THIRDS>::kspace<>(
-        const field_layout<ONE> *,
-        const double, const double, const double);
-template kspace<FFTW, TWO_THIRDS>::kspace<>(
-        const field_layout<THREE> *,
-        const double, const double, const double);
-template kspace<FFTW, TWO_THIRDS>::kspace<>(
-        const field_layout<THREExTHREE> *,
-        const double, const double, const double);
-
-template kspace<FFTW, SMOOTH>::kspace<>(
-        const field_layout<ONE> *,
-        const double, const double, const double);
-template kspace<FFTW, SMOOTH>::kspace<>(
-        const field_layout<THREE> *,
-        const double, const double, const double);
-template kspace<FFTW, SMOOTH>::kspace<>(
-        const field_layout<THREExTHREE> *,
-        const double, const double, const double);
-
-template void kspace<FFTW, SMOOTH>::low_pass<float, ONE>(
-   float *__restrict__ a,
-   const double kmax);
-template void kspace<FFTW, SMOOTH>::low_pass<float, THREE>(
-   float *__restrict__ a,
-   const double kmax);
-template void kspace<FFTW, SMOOTH>::low_pass<float, THREExTHREE>(
-   float *__restrict__ a,
-   const double kmax);
-
-template void kspace<FFTW, SMOOTH>::low_pass<double, ONE>(
-   double *__restrict__ a,
-   const double kmax);
-template void kspace<FFTW, SMOOTH>::low_pass<double, THREE>(
-   double *__restrict__ a,
-   const double kmax);
-template void kspace<FFTW, SMOOTH>::low_pass<double, THREExTHREE>(
-        double *__restrict__ a,
-        const double kmax);
-
 template void field<float, FFTW, ONE>::compute_stats<TWO_THIRDS>(
         kspace<FFTW, TWO_THIRDS> *,
         const hid_t, const std::string, const hsize_t, const double);
@@ -920,3 +1126,20 @@ template void field<double, FFTW, THREExTHREE>::compute_stats<SMOOTH>(
         kspace<FFTW, SMOOTH> *,
         const hid_t, const std::string, const hsize_t, const double);
 
+template void compute_gradient<float, FFTW, THREE, THREExTHREE, SMOOTH>(
+        kspace<FFTW, SMOOTH> *,
+        field<float, FFTW, THREE> *,
+        field<float, FFTW, THREExTHREE> *);
+template void compute_gradient<double, FFTW, THREE, THREExTHREE, SMOOTH>(
+        kspace<FFTW, SMOOTH> *,
+        field<double, FFTW, THREE> *,
+        field<double, FFTW, THREExTHREE> *);
+
+template void compute_gradient<float, FFTW, ONE, THREE, SMOOTH>(
+        kspace<FFTW, SMOOTH> *,
+        field<float, FFTW, ONE> *,
+        field<float, FFTW, THREE> *);
+template void compute_gradient<double, FFTW, ONE, THREE, SMOOTH>(
+        kspace<FFTW, SMOOTH> *,
+        field<double, FFTW, ONE> *,
+        field<double, FFTW, THREE> *);
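
(For orientation, a minimal usage sketch for the gradient instantiations above; this is not part of the patch, `nx`, `ny`, `nz`, `dkx`, `dky`, `dkz` are assumed to be defined, and MPI/FFTW are assumed initialized:

    // hedged sketch: compute the velocity gradient tensor of a vector field
    field<float, FFTW, THREE> *vel = new field<float, FFTW, THREE>(
            nx, ny, nz, MPI_COMM_WORLD);
    field<float, FFTW, THREExTHREE> *grad = new field<float, FFTW, THREExTHREE>(
            nx, ny, nz, MPI_COMM_WORLD);
    kspace<FFTW, SMOOTH> *kk = new kspace<FFTW, SMOOTH>(
            vel->clayout, dkx, dky, dkz);
    // ... fill vel with data, then move to the Fourier representation,
    // since compute_gradient asserts !src->real_space_representation
    vel->dft();
    compute_gradient<float, FFTW, THREE, THREExTHREE, SMOOTH>(kk, vel, grad);
    delete kk;
    delete grad;
    delete vel;
)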
diff --git a/bfps/cpp/field.hpp b/bfps/cpp/field.hpp
index 6ebd4090e38795b2209fffcb3b6d7aab2642a8f2..360d37e668130fe1d0e0c415fa98d34fc6b13de3 100644
--- a/bfps/cpp/field.hpp
+++ b/bfps/cpp/field.hpp
@@ -24,110 +24,17 @@
 
 
 
-#include <mpi.h>
 #include <hdf5.h>
-#include <fftw3-mpi.h>
 #include <unordered_map>
 #include <vector>
 #include <string>
-#include "base.hpp"
+#include "kspace.hpp"
+#include "omputils.hpp"
 
-#ifndef FIELD
+#ifndef FIELD_HPP
 
-#define FIELD
+#define FIELD_HPP
 
-enum field_backend {FFTW};
-enum field_components {ONE, THREE, THREExTHREE};
-enum kspace_dealias_type {TWO_THIRDS, SMOOTH};
-
-constexpr unsigned int ncomp(
-        field_components fc)
-    /* return actual number of field components for each enum value */
-{
-    return ((fc == THREE) ? 3 : (
-            (fc == THREExTHREE) ? 9 : 1));
-}
-
-constexpr unsigned int ndim(
-        field_components fc)
-    /* return actual number of field dimensions for each enum value */
-{
-    return ((fc == THREE) ? 4 : (
-            (fc == THREExTHREE) ? 5 : 3));
-}
-
-template <field_components fc>
-class field_layout
-{
-    public:
-        /* description */
-        hsize_t sizes[ndim(fc)];
-        hsize_t subsizes[ndim(fc)];
-        hsize_t starts[ndim(fc)];
-        hsize_t local_size, full_size;
-
-        int myrank, nprocs;
-        MPI_Comm comm;
-
-        std::vector<std::vector<int>> rank;
-        std::vector<std::vector<int>> all_start;
-        std::vector<std::vector<int>> all_size;
-
-        /* methods */
-        field_layout(
-                const hsize_t *SIZES,
-                const hsize_t *SUBSIZES,
-                const hsize_t *STARTS,
-                const MPI_Comm COMM_TO_USE);
-        ~field_layout(){}
-};
-
-template <field_backend be,
-          kspace_dealias_type dt>
-class kspace
-{
-    public:
-        /* relevant field layout */
-        field_layout<ONE> *layout;
-
-        /* physical parameters */
-        double dkx, dky, dkz, dk, dk2;
-
-        /* mode and dealiasing information */
-        double kMx, kMy, kMz, kM, kM2;
-        double kMspec, kMspec2;
-        std::vector<double> kx, ky, kz;
-        std::unordered_map<int, double> dealias_filter;
-        std::vector<double> kshell;
-        std::vector<int64_t> nshell;
-        int nshells;
-
-        /* methods */
-        template <field_components fc>
-        kspace(
-                const field_layout<fc> *source_layout,
-                const double DKX = 1.0,
-                const double DKY = 1.0,
-                const double DKZ = 1.0);
-        ~kspace();
-
-        template <typename rnumber,
-                  field_components fc>
-        void low_pass(rnumber *__restrict__ a, const double kmax);
-
-        template <typename rnumber,
-                  field_components fc>
-        void dealias(rnumber *__restrict__ a);
-
-        template <typename rnumber,
-                  field_components fc>
-        void cospectrum(
-                const rnumber(* __restrict__ a)[2],
-                const rnumber(* __restrict__ b)[2],
-                const hid_t group,
-                const std::string dset_name,
-                const hsize_t toffset);
-};
 
 template <typename rnumber,
           field_backend be,
@@ -136,10 +43,9 @@ class field
 {
     private:
         /* data arrays */
-        rnumber *data;
-        typedef rnumber cnumber[2];
-        hsize_t npoints;
+        rnumber *__restrict__ data;
     public:
+        hsize_t npoints;
         bool real_space_representation;
         /* basic MPI information */
         int myrank, nprocs;
@@ -153,8 +59,8 @@ class field
         field_layout<fc> *clayout, *rlayout, *rmemlayout;
 
         /* FFT plans */
-        void *c2r_plan;
-        void *r2c_plan;
+        typename fftw_interface<rnumber>::plan c2r_plan;
+        typename fftw_interface<rnumber>::plan r2c_plan;
         unsigned fftw_plan_rigor;
 
         /* HDF5 data types for arrays */
@@ -166,34 +72,100 @@ class field
                 const int ny,
                 const int nz,
                 const MPI_Comm COMM_TO_USE,
-                const unsigned FFTW_PLAN_RIGOR = FFTW_ESTIMATE);
+                const unsigned FFTW_PLAN_RIGOR = DEFAULT_FFTW_FLAG);
         ~field();
 
         int io(
                 const std::string fname,
-                const std::string dset_name,
+                const std::string field_name,
+                const int iteration,
+                const bool read = true);
+        int io_database(
+                const std::string fname,
+                const std::string field_name,
                 const int toffset,
                 const bool read = true);
 
+        int write_0slice(
+                const hid_t group,
+                const std::string field_name,
+                const int iteration);
+
+        /* essential FFT stuff */
         void dft();
         void ift();
         void normalize();
+        void symmetrize();
+
+        /* stats */
+        void compute_rspace_xincrement_stats(
+                const int xcells,
+                const hid_t group,
+                const std::string dset_name,
+                const hsize_t toffset,
+                const std::vector<double> max_estimate);
 
         void compute_rspace_stats(
                 const hid_t group,
                 const std::string dset_name,
                 const hsize_t toffset,
                 const std::vector<double> max_estimate);
-        inline rnumber *get_rdata()
+
+        /* access data */
+        inline rnumber *__restrict__ get_rdata()
+        {
+            return this->data;
+        }
+
+        inline const rnumber *__restrict__ get_rdata() const
         {
             return this->data;
         }
-        inline cnumber *get_cdata()
+
+        inline typename fftw_interface<rnumber>::complex *__restrict__ get_cdata()
+        {
+            return (typename fftw_interface<rnumber>::complex*__restrict__)this->data;
+        }
+
+        inline rnumber &rval(ptrdiff_t rindex, unsigned int component = 0)
+        {
+            assert(fc == ONE || fc == THREE);
+            assert(component < ncomp(fc));
+            return *(this->data + rindex*ncomp(fc) + component);
+        }
+
+        inline rnumber &rval(ptrdiff_t rindex, int comp1, int comp0)
+        {
+            assert(fc == THREExTHREE);
+            assert(comp1 >= 0 && comp1 < 3);
+            assert(comp0 >= 0 && comp0 < 3);
+            return *(this->data + ((rindex*3 + comp1)*3 + comp0));
+        }
+
+        inline rnumber &cval(ptrdiff_t cindex, int imag)
+        {
+            assert(fc == ONE);
+            assert(imag == 0 || imag == 1);
+            return *(this->data + cindex*2 + imag);
+        }
+
+        inline rnumber &cval(ptrdiff_t cindex, int component, int imag)
+        {
+            assert(fc == THREE);
+            assert(imag == 0 || imag == 1);
+            return *(this->data + (cindex*ncomp(fc) + component)*2 + imag);
+        }
+
+        inline rnumber &cval(ptrdiff_t cindex, int comp1, int comp0, int imag)
         {
-            return (cnumber*)this->data;
+            assert(fc == THREExTHREE);
+            assert(comp1 >= 0 && comp1 < 3);
+            assert(comp0 >= 0 && comp0 < 3);
+            assert(imag == 0 || imag == 1);
+            return *(this->data + ((cindex*3 + comp1)*3+comp0)*2 + imag);
         }
 
-        inline field<rnumber, be, fc>& operator=(const cnumber *__restrict__ source)
+        inline field<rnumber, be, fc>& operator=(const typename fftw_interface<rnumber>::complex *__restrict__ source)
         {
             std::copy((rnumber*)source,
                       (rnumber*)(source + this->clayout->local_size),
@@ -210,6 +182,15 @@ class field
             this->real_space_representation = true;
             return *this;
         }
+
+        inline field<rnumber, be, fc>& operator=(const rnumber value)
+        {
+            std::fill_n(this->data,
+                        this->rmemlayout->local_size,
+                        value);
+            return *this;
+        }
+
         template <kspace_dealias_type dt>
         void compute_stats(
                 kspace<be, dt> *kk,
@@ -217,74 +198,61 @@ class field
                 const std::string dset_name,
                 const hsize_t toffset,
                 const double max_estimate);
+        inline void impose_zero_mode()
+        {
+            if (this->clayout->myrank == this->clayout->rank[0][0] &&
+                this->real_space_representation == false)
+            {
+                std::fill_n(this->data, 2*ncomp(fc), 0.0);
+            }
+        }
+        template <class func_type>
+        void RLOOP(func_type expression)
+        {
+            switch(be)
+            {
+                case FFTW:
+                    #pragma omp parallel
+                    {
+                        const hsize_t start = OmpUtils::ForIntervalStart(this->rlayout->subsizes[1]);
+                        const hsize_t end = OmpUtils::ForIntervalEnd(this->rlayout->subsizes[1]);
+
+                        for (hsize_t zindex = 0; zindex < this->rlayout->subsizes[0]; zindex++)
+                        for (hsize_t yindex = start; yindex < end; yindex++)
+                        {
+                            ptrdiff_t rindex = (
+                                    zindex * this->rlayout->subsizes[1] + yindex)*(
+                                        this->rmemlayout->subsizes[2]);
+                            for (hsize_t xindex = 0; xindex < this->rlayout->subsizes[2]; xindex++)
+                            {
+                                expression(rindex, xindex, yindex, zindex);
+                                rindex++;
+                            }
+                        }
+                    }
+                    break;
+            }
+        }
+        ptrdiff_t get_cindex(
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex)
+        {
+            return ((yindex*this->clayout->subsizes[1] +
+                     zindex)*this->clayout->subsizes[2] +
+                    xindex);
+        }
 };
 
-/* real space loop */
-#define FIELD_RLOOP(obj, expression) \
- \
-{ \
-    switch (be) \
-    { \
-        case FFTW: \
-            for (hsize_t zindex = 0; zindex < obj->rlayout->subsizes[0]; zindex++) \
-            for (hsize_t yindex = 0; yindex < obj->rlayout->subsizes[1]; yindex++) \
-            { \
-                ptrdiff_t rindex = ( \
-                        zindex * obj->rlayout->subsizes[1] + yindex)*( \
-                            obj->rmemlayout->subsizes[2]); \
-            for (hsize_t xindex = 0; xindex < obj->rlayout->subsizes[2]; xindex++) \
-                { \
-                    expression; \
-                    rindex++; \
-                } \
-            } \
-            break; \
-    } \
-}
-
-#define KSPACE_CLOOP_K2(obj, expression) \
- \
-{ \
-    double k2; \
-    ptrdiff_t cindex = 0; \
-    for (hsize_t yindex = 0; yindex < obj->layout->subsizes[0]; yindex++) \
-    for (hsize_t zindex = 0; zindex < obj->layout->subsizes[1]; zindex++) \
-    for (hsize_t xindex = 0; xindex < obj->layout->subsizes[2]; xindex++) \
-        { \
-            k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-                  obj->ky[yindex]*obj->ky[yindex] + \
-                  obj->kz[zindex]*obj->kz[zindex]); \
-            expression; \
-            cindex++; \
-        } \
-}
-
-#define KSPACE_CLOOP_K2_NXMODES(obj, expression) \
- \
-{ \
-    double k2; \
-    ptrdiff_t cindex = 0; \
-    for (hsize_t yindex = 0; yindex < obj->layout->subsizes[0]; yindex++) \
-    for (hsize_t zindex = 0; zindex < obj->layout->subsizes[1]; zindex++) \
-    { \
-        int nxmodes = 1; \
-        hsize_t xindex = 0; \
-        k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-              obj->ky[yindex]*obj->ky[yindex] + \
-              obj->kz[zindex]*obj->kz[zindex]); \
-        expression; \
-        cindex++; \
-        nxmodes = 2; \
-    for (xindex = 1; xindex < obj->layout->subsizes[2]; xindex++) \
-        { \
-            k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-                  obj->ky[yindex]*obj->ky[yindex] + \
-                  obj->kz[zindex]*obj->kz[zindex]); \
-            expression; \
-            cindex++; \
-        } \
-    } \
-}
-
-#endif//FIELD
+template <typename rnumber,
+          field_backend be,
+          field_components fc1,
+          field_components fc2,
+          kspace_dealias_type dt>
+void compute_gradient(
+        kspace<be, dt> *kk,
+        field<rnumber, be, fc1> *source,
+        field<rnumber, be, fc2> *destination);
+
+#endif//FIELD_HPP
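
(The old FIELD_RLOOP macro is gone; real-space loops are now written as lambdas handed to the RLOOP member defined above, which splits the y range over OpenMP threads. A minimal call-site sketch, assuming `scal` points to an allocated field<float, FFTW, ONE> holding a real-space representation:

    scal->RLOOP(
            [&](ptrdiff_t rindex,
                hsize_t xindex,
                hsize_t yindex,
                hsize_t zindex){
            // rindex already includes the FFTW padding from rmemlayout
            scal->rval(rindex) = 0;
            });
)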
 
diff --git a/bfps/cpp/field_descriptor.cpp b/bfps/cpp/field_descriptor.cpp
index b5025835903a37ea5384cb4102c716f527aabfe5..20c634262dbb45ad4c2bb5a1b5640b6df23d4d2c 100644
--- a/bfps/cpp/field_descriptor.cpp
+++ b/bfps/cpp/field_descriptor.cpp
@@ -31,476 +31,470 @@
 #include <iostream>
 #include "base.hpp"
 #include "field_descriptor.hpp"
-
+#include "fftw_interface.hpp"
+#include "scope_timer.hpp"
 
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
-#define CLASS_IMPLEMENTATION(FFTW, R, MPI_RNUM, MPI_CNUM) \
-    \
-template<> \
-field_descriptor<R>::field_descriptor( \
-        int ndims, \
-        int *n, \
-        MPI_Datatype element_type, \
-        MPI_Comm COMM_TO_USE) \
-{ \
-    DEBUG_MSG("entered field_descriptor::field_descriptor\n"); \
-    this->comm = COMM_TO_USE; \
-    MPI_Comm_rank(this->comm, &this->myrank); \
-    MPI_Comm_size(this->comm, &this->nprocs); \
-    this->ndims = ndims; \
-    this->sizes    = new int[ndims]; \
-    this->subsizes = new int[ndims]; \
-    this->starts   = new int[ndims]; \
-    int tsizes   [ndims]; \
-    int tsubsizes[ndims]; \
-    int tstarts  [ndims]; \
-    ptrdiff_t *nfftw = new ptrdiff_t[ndims]; \
-    ptrdiff_t local_n0, local_0_start; \
-    for (int i = 0; i < this->ndims; i++) \
-        nfftw[i] = n[i]; \
-    this->local_size = fftw_mpi_local_size_many( \
-            this->ndims, \
-            nfftw, \
-            1, \
-            FFTW_MPI_DEFAULT_BLOCK, \
-            this->comm, \
-            &local_n0, \
-            &local_0_start); \
-    this->sizes[0] = n[0]; \
-    this->subsizes[0] = (int)local_n0; \
-    this->starts[0] = (int)local_0_start; \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "first subsizes[0] = %d %d %d\n", \
-            this->subsizes[0], \
-            tsubsizes[0], \
-            (int)local_n0); \
-    tsizes[0] = n[0]; \
-    tsubsizes[0] = (int)local_n0; \
-    tstarts[0] = (int)local_0_start; \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "second subsizes[0] = %d %d %d\n", \
-            this->subsizes[0], \
-            tsubsizes[0], \
-            (int)local_n0); \
-    this->mpi_dtype = element_type; \
-    this->slice_size = 1; \
-    this->full_size = this->sizes[0]; \
-    for (int i = 1; i < this->ndims; i++) \
-    { \
-        this->sizes[i] = n[i]; \
-        this->subsizes[i] = n[i]; \
-        this->starts[i] = 0; \
-        this->slice_size *= this->subsizes[i]; \
-        this->full_size *= this->sizes[i]; \
-        tsizes[i] = this->sizes[i]; \
-        tsubsizes[i] = this->subsizes[i]; \
-        tstarts[i] = this->starts[i]; \
-    } \
-    tsizes[ndims-1] *= sizeof(R); \
-    tsubsizes[ndims-1] *= sizeof(R); \
-    tstarts[ndims-1] *= sizeof(R); \
-    if (this->mpi_dtype == MPI_CNUM) \
-    { \
-        tsizes[ndims-1] *= 2; \
-        tsubsizes[ndims-1] *= 2; \
-        tstarts[ndims-1] *= 2; \
-    } \
-    int local_zero_array[this->nprocs], zero_array[this->nprocs]; \
-    for (int i=0; i<this->nprocs; i++) \
-        local_zero_array[i] = 0; \
-    local_zero_array[this->myrank] = (this->subsizes[0] == 0) ? 1 : 0; \
-    MPI_Allreduce( \
-            local_zero_array, \
-            zero_array, \
-            this->nprocs, \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    int no_of_excluded_ranks = 0; \
-    for (int i = 0; i<this->nprocs; i++) \
-        no_of_excluded_ranks += zero_array[i]; \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "subsizes[0] = %d %d\n", \
-            this->subsizes[0], \
-            tsubsizes[0]); \
-    if (no_of_excluded_ranks == 0) \
-    { \
-        this->io_comm = this->comm; \
-        this->io_nprocs = this->nprocs; \
-        this->io_myrank = this->myrank; \
-    } \
-    else \
-    { \
-        int excluded_rank[no_of_excluded_ranks]; \
-        for (int i=0, j=0; i<this->nprocs; i++) \
-            if (zero_array[i]) \
-            { \
-                excluded_rank[j] = i; \
-                j++; \
-            } \
-        MPI_Group tgroup0, tgroup; \
-        MPI_Comm_group(this->comm, &tgroup0); \
-        MPI_Group_excl(tgroup0, no_of_excluded_ranks, excluded_rank, &tgroup); \
-        MPI_Comm_create(this->comm, tgroup, &this->io_comm); \
-        MPI_Group_free(&tgroup0); \
-        MPI_Group_free(&tgroup); \
-        if (this->subsizes[0] > 0) \
-        { \
-            MPI_Comm_rank(this->io_comm, &this->io_myrank); \
-            MPI_Comm_size(this->io_comm, &this->io_nprocs); \
-        } \
-        else \
-        { \
-            this->io_myrank = MPI_PROC_NULL; \
-            this->io_nprocs = -1; \
-        } \
-    } \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "inside field_descriptor constructor, about to call " \
-            "MPI_Type_create_subarray " \
-            "%d %d %d\n", \
-            this->sizes[0], \
-            this->subsizes[0], \
-            this->starts[0]); \
-    for (int i=0; i<this->ndims; i++) \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "tsizes " \
-            "%d %d %d\n", \
-            tsizes[i], \
-            tsubsizes[i], \
-            tstarts[i]); \
-    if (this->subsizes[0] > 0) \
-    { \
-        DEBUG_MSG("creating subarray\n"); \
-        MPI_Type_create_subarray( \
-                ndims, \
-                tsizes, \
-                tsubsizes, \
-                tstarts, \
-                MPI_ORDER_C, \
-                MPI_UNSIGNED_CHAR, \
-                &this->mpi_array_dtype); \
-        MPI_Type_commit(&this->mpi_array_dtype); \
-    } \
-    this->rank = new int[this->sizes[0]]; \
-    int *local_rank = new int[this->sizes[0]]; \
-    std::fill_n(local_rank, this->sizes[0], 0); \
-    for (int i = 0; i < this->sizes[0]; i++) \
-        if (i >= this->starts[0] && i < this->starts[0] + this->subsizes[0]) \
-            local_rank[i] = this->myrank; \
-    MPI_Allreduce( \
-            local_rank, \
-            this->rank, \
-            this->sizes[0], \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    delete[] local_rank; \
-    this->all_start0 = new int[this->nprocs]; \
-    int *local_start0 = new int[this->nprocs]; \
-    std::fill_n(local_start0, this->nprocs, 0); \
-    for (int i = 0; i < this->nprocs; i++) \
-        if (this->myrank == i) \
-            local_start0[i] = this->starts[0]; \
-    MPI_Allreduce( \
-            local_start0, \
-            this->all_start0, \
-            this->nprocs, \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    delete[] local_start0; \
-    this->all_size0  = new int[this->nprocs]; \
-    int *local_size0 = new int[this->nprocs]; \
-    std::fill_n(local_size0, this->nprocs, 0); \
-    for (int i = 0; i < this->nprocs; i++) \
-        if (this->myrank == i) \
-            local_size0[i] = this->subsizes[0]; \
-    MPI_Allreduce( \
-            local_size0, \
-            this->all_size0, \
-            this->nprocs, \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    delete[] local_size0; \
-    DEBUG_MSG("exiting field_descriptor constructor\n"); \
-} \
- \
-template <> \
-int field_descriptor<R>::read( \
-        const char *fname, \
-        void *buffer) \
-{ \
-    DEBUG_MSG("entered field_descriptor::read\n"); \
-    char representation[] = "native"; \
-    if (this->subsizes[0] > 0) \
-    { \
-        MPI_Info info; \
-        MPI_Info_create(&info); \
-        MPI_File f; \
-        ptrdiff_t read_size = this->local_size*sizeof(R); \
-        DEBUG_MSG("read size is %ld\n", read_size); \
-        char ffname[200]; \
-        if (this->mpi_dtype == MPI_CNUM) \
-            read_size *= 2; \
-        DEBUG_MSG("read size is %ld\n", read_size); \
-        sprintf(ffname, "%s", fname); \
- \
-        MPI_File_open( \
-                this->io_comm, \
-                ffname, \
-                MPI_MODE_RDONLY, \
-                info, \
-                &f); \
-        DEBUG_MSG("opened file\n"); \
-        MPI_File_set_view( \
-                f, \
-                0, \
-                MPI_UNSIGNED_CHAR, \
-                this->mpi_array_dtype, \
-                representation, \
-                info); \
-        DEBUG_MSG("view is set\n"); \
-        MPI_File_read_all( \
-                f, \
-                buffer, \
-                read_size, \
-                MPI_UNSIGNED_CHAR, \
-                MPI_STATUS_IGNORE); \
-        DEBUG_MSG("info is read\n"); \
-        MPI_File_close(&f); \
-    } \
-    DEBUG_MSG("finished with field_descriptor::read\n"); \
-    return EXIT_SUCCESS; \
-} \
- \
-template <> \
-int field_descriptor<R>::write( \
-        const char *fname, \
-        void *buffer) \
-{ \
-    char representation[] = "native"; \
-    if (this->subsizes[0] > 0) \
-    { \
-        MPI_Info info; \
-        MPI_Info_create(&info); \
-        MPI_File f; \
-        ptrdiff_t read_size = this->local_size*sizeof(R); \
-        char ffname[200]; \
-        if (this->mpi_dtype == MPI_CNUM) \
-            read_size *= 2; \
-        sprintf(ffname, "%s", fname); \
- \
-        MPI_File_open( \
-                this->io_comm, \
-                ffname, \
-                MPI_MODE_CREATE | MPI_MODE_WRONLY, \
-                info, \
-                &f); \
-        MPI_File_set_view( \
-                f, \
-                0, \
-                MPI_UNSIGNED_CHAR, \
-                this->mpi_array_dtype, \
-                representation, \
-                info); \
-        MPI_File_write_all( \
-                f, \
-                buffer, \
-                read_size, \
-                MPI_UNSIGNED_CHAR, \
-                MPI_STATUS_IGNORE); \
-        MPI_File_close(&f); \
-    } \
- \
-    return EXIT_SUCCESS; \
-} \
- \
-template <> \
-int field_descriptor<R>::transpose( \
-        R *input, \
-        R *output) \
-{ \
-    /* IMPORTANT NOTE: \
-     for 3D transposition, the input data is messed up */ \
-    FFTW(plan) tplan; \
-    if (this->ndims == 3) \
-    { \
-        /* transpose the two local dimensions 1 and 2 */ \
-        R *atmp; \
-        atmp = FFTW(alloc_real)(this->slice_size); \
-        for (int k = 0; k < this->subsizes[0]; k++) \
-        { \
-            /* put transposed slice in atmp */ \
-            for (int j = 0; j < this->sizes[1]; j++) \
-                for (int i = 0; i < this->sizes[2]; i++) \
-                    atmp[i*this->sizes[1] + j] = \
-                        input[(k*this->sizes[1] + j)*this->sizes[2] + i]; \
-            /* copy back transposed slice */ \
-            std::copy( \
-                    atmp, \
-                    atmp + this->slice_size, \
-                    input + k*this->slice_size); \
-        } \
-        FFTW(free)(atmp); \
-    } \
-    tplan = FFTW(mpi_plan_transpose)( \
-            this->sizes[0], this->slice_size, \
-            input, output, \
-            this->comm, \
-            FFTW_ESTIMATE); \
-    FFTW(execute)(tplan); \
-    FFTW(destroy_plan)(tplan); \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-int field_descriptor<R>::transpose( \
-        FFTW(complex) *input, \
-        FFTW(complex) *output) \
-{ \
-    switch (this->ndims) \
-    { \
-        case 2: \
-            /* do a global transpose over the 2 dimensions */ \
-            if (output == NULL) \
-            { \
-                std::cerr << "bad arguments for transpose.\n" << std::endl; \
-                return EXIT_FAILURE; \
-            } \
-            FFTW(plan) tplan; \
-            tplan = FFTW(mpi_plan_many_transpose)( \
-                    this->sizes[0], this->sizes[1], 2, \
-                    FFTW_MPI_DEFAULT_BLOCK, \
-                    FFTW_MPI_DEFAULT_BLOCK, \
-                    (R*)input, (R*)output, \
-                    this->comm, \
-                    FFTW_ESTIMATE); \
-            FFTW(execute)(tplan); \
-            FFTW(destroy_plan)(tplan); \
-            break; \
-        case 3: \
-            /* transpose the two local dimensions 1 and 2 */ \
-            FFTW(complex) *atmp; \
-            atmp = FFTW(alloc_complex)(this->slice_size); \
-            for (int k = 0; k < this->subsizes[0]; k++) \
-            { \
-                /* put transposed slice in atmp */ \
-                for (int j = 0; j < this->sizes[1]; j++) \
-                    for (int i = 0; i < this->sizes[2]; i++) \
-                    { \
-                        atmp[i*this->sizes[1] + j][0] = \
-                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][0]; \
-                        atmp[i*this->sizes[1] + j][1] = \
-                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][1]; \
-                    } \
-                /* copy back transposed slice */ \
-                std::copy( \
-                        (R*)(atmp), \
-                        (R*)(atmp + this->slice_size), \
-                        (R*)(input + k*this->slice_size)); \
-            } \
-            FFTW(free)(atmp); \
-            break; \
-        default: \
-            return EXIT_FAILURE; \
-            break; \
-    } \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-int field_descriptor<R>::interleave( \
-        R *a, \
-        int dim) \
-{ \
-/* the following is copied from \
- * http://agentzlerich.blogspot.com/2010/01/using-fftw-for-in-place-matrix.html \
- * */ \
-    FFTW(iodim) howmany_dims[2]; \
-    howmany_dims[0].n  = dim; \
-    howmany_dims[0].is = this->local_size; \
-    howmany_dims[0].os = 1; \
-    howmany_dims[1].n  = this->local_size; \
-    howmany_dims[1].is = 1; \
-    howmany_dims[1].os = dim; \
-    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); \
- \
-    FFTW(plan) tmp = FFTW(plan_guru_r2r)( \
-            /*rank*/0, \
-            /*dims*/NULL, \
-            howmany_rank, \
-            howmany_dims, \
-            a, \
-            a, \
-            /*kind*/NULL, \
-            FFTW_ESTIMATE); \
-    FFTW(execute)(tmp); \
-    FFTW(destroy_plan)(tmp); \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-int field_descriptor<R>::interleave( \
-        FFTW(complex) *a, \
-        int dim) \
-{ \
-    FFTW(iodim) howmany_dims[2]; \
-    howmany_dims[0].n  = dim; \
-    howmany_dims[0].is = this->local_size; \
-    howmany_dims[0].os = 1; \
-    howmany_dims[1].n  = this->local_size; \
-    howmany_dims[1].is = 1; \
-    howmany_dims[1].os = dim; \
-    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); \
- \
-    FFTW(plan) tmp = FFTW(plan_guru_dft)( \
-            /*rank*/0, \
-            /*dims*/NULL, \
-            howmany_rank, \
-            howmany_dims, \
-            a, \
-            a, \
-            +1, \
-            FFTW_ESTIMATE); \
-    FFTW(execute)(tmp); \
-    FFTW(destroy_plan)(tmp); \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-field_descriptor<R>* field_descriptor<R>::get_transpose() \
-{ \
-    int n[this->ndims]; \
-    for (int i=0; i<this->ndims; i++) \
-        n[i] = this->sizes[this->ndims - i - 1]; \
-    return new field_descriptor<R>(this->ndims, n, this->mpi_dtype, this->comm); \
-} \
 
-/*****************************************************************************/
+template <class rnumber>
+field_descriptor<rnumber>::field_descriptor(
+        int ndims,
+        int *n,
+        MPI_Datatype element_type,
+        MPI_Comm COMM_TO_USE)
+{
+    TIMEZONE("field_descriptor");
+    DEBUG_MSG("entered field_descriptor::field_descriptor\n");
+    this->comm = COMM_TO_USE;
+    MPI_Comm_rank(this->comm, &this->myrank);
+    MPI_Comm_size(this->comm, &this->nprocs);
+    this->ndims = ndims;
+    this->sizes    = new int[ndims];
+    this->subsizes = new int[ndims];
+    this->starts   = new int[ndims];
+    int tsizes   [ndims];
+    int tsubsizes[ndims];
+    int tstarts  [ndims];
+    std::vector<ptrdiff_t> nfftw;
+    nfftw.resize(ndims);
+    ptrdiff_t local_n0, local_0_start;
+    for (int i = 0; i < this->ndims; i++)
+        nfftw[i] = n[i];
+    this->local_size = fftw_mpi_local_size_many(
+                this->ndims,
+                &nfftw.front(),
+                1,
+                FFTW_MPI_DEFAULT_BLOCK,
+                this->comm,
+                &local_n0,
+                &local_0_start);
+    this->sizes[0] = n[0];
+    this->subsizes[0] = (int)local_n0;
+    this->starts[0] = (int)local_0_start;
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "first subsizes[0] = %d %d %d\n",
+                this->subsizes[0],
+                tsubsizes[0],
+                (int)local_n0);
+    tsizes[0] = n[0];
+    tsubsizes[0] = (int)local_n0;
+    tstarts[0] = (int)local_0_start;
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "second subsizes[0] = %d %d %d\n",
+                this->subsizes[0],
+                tsubsizes[0],
+                (int)local_n0);
+    this->mpi_dtype = element_type;
+    this->slice_size = 1;
+    this->full_size = this->sizes[0];
+    for (int i = 1; i < this->ndims; i++)
+    {
+        this->sizes[i] = n[i];
+        this->subsizes[i] = n[i];
+        this->starts[i] = 0;
+        this->slice_size *= this->subsizes[i];
+        this->full_size *= this->sizes[i];
+        tsizes[i] = this->sizes[i];
+        tsubsizes[i] = this->subsizes[i];
+        tstarts[i] = this->starts[i];
+    }
+    tsizes[ndims-1] *= sizeof(rnumber);
+    tsubsizes[ndims-1] *= sizeof(rnumber);
+    tstarts[ndims-1] *= sizeof(rnumber);
+    if (this->mpi_dtype == mpi_real_type<rnumber>::complex())
+    {
+        tsizes[ndims-1] *= 2;
+        tsubsizes[ndims-1] *= 2;
+        tstarts[ndims-1] *= 2;
+    }
+    int local_zero_array[this->nprocs], zero_array[this->nprocs];
+    for (int i=0; i<this->nprocs; i++)
+        local_zero_array[i] = 0;
+    local_zero_array[this->myrank] = (this->subsizes[0] == 0) ? 1 : 0;
+    MPI_Allreduce(
+                local_zero_array,
+                zero_array,
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    int no_of_excluded_ranks = 0;
+    for (int i = 0; i<this->nprocs; i++)
+        no_of_excluded_ranks += zero_array[i];
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "subsizes[0] = %d %d\n",
+                this->subsizes[0],
+                tsubsizes[0]);
+    if (no_of_excluded_ranks == 0)
+    {
+        this->io_comm = this->comm;
+        this->io_nprocs = this->nprocs;
+        this->io_myrank = this->myrank;
+    }
+    else
+    {
+        int excluded_rank[no_of_excluded_ranks];
+        for (int i=0, j=0; i<this->nprocs; i++)
+            if (zero_array[i])
+            {
+                excluded_rank[j] = i;
+                j++;
+            }
+        MPI_Group tgroup0, tgroup;
+        MPI_Comm_group(this->comm, &tgroup0);
+        MPI_Group_excl(tgroup0, no_of_excluded_ranks, excluded_rank, &tgroup);
+        MPI_Comm_create(this->comm, tgroup, &this->io_comm);
+        MPI_Group_free(&tgroup0);
+        MPI_Group_free(&tgroup);
+        if (this->subsizes[0] > 0)
+        {
+            MPI_Comm_rank(this->io_comm, &this->io_myrank);
+            MPI_Comm_size(this->io_comm, &this->io_nprocs);
+        }
+        else
+        {
+            this->io_myrank = MPI_PROC_NULL;
+            this->io_nprocs = -1;
+        }
+    }
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "inside field_descriptor constructor, about to call "
+                "MPI_Type_create_subarray "
+                "%d %d %d\n",
+                this->sizes[0],
+                this->subsizes[0],
+                this->starts[0]);
+    for (int i=0; i<this->ndims; i++)
+        DEBUG_MSG_WAIT(
+                    this->comm,
+                    "tsizes "
+                    "%d %d %d\n",
+                    tsizes[i],
+                    tsubsizes[i],
+                    tstarts[i]);
+    if (this->subsizes[0] > 0)
+    {
+        DEBUG_MSG("creating subarray\n");
+        MPI_Type_create_subarray(
+                    ndims,
+                    tsizes,
+                    tsubsizes,
+                    tstarts,
+                    MPI_ORDER_C,
+                    MPI_UNSIGNED_CHAR,
+                    &this->mpi_array_dtype);
+        MPI_Type_commit(&this->mpi_array_dtype);
+    }
+    this->rank = new int[this->sizes[0]];
+    int *local_rank = new int[this->sizes[0]];
+    std::fill_n(local_rank, this->sizes[0], 0);
+    for (int i = 0; i < this->sizes[0]; i++)
+        if (i >= this->starts[0] && i < this->starts[0] + this->subsizes[0])
+            local_rank[i] = this->myrank;
+    MPI_Allreduce(
+                local_rank,
+                this->rank,
+                this->sizes[0],
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    delete[] local_rank;
+    this->all_start0 = new int[this->nprocs];
+    int *local_start0 = new int[this->nprocs];
+    std::fill_n(local_start0, this->nprocs, 0);
+    for (int i = 0; i < this->nprocs; i++)
+        if (this->myrank == i)
+            local_start0[i] = this->starts[0];
+    MPI_Allreduce(
+                local_start0,
+                this->all_start0,
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    delete[] local_start0;
+    this->all_size0  = new int[this->nprocs];
+    int *local_size0 = new int[this->nprocs];
+    std::fill_n(local_size0, this->nprocs, 0);
+    for (int i = 0; i < this->nprocs; i++)
+        if (this->myrank == i)
+            local_size0[i] = this->subsizes[0];
+    MPI_Allreduce(
+                local_size0,
+                this->all_size0,
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    delete[] local_size0;
+    DEBUG_MSG("exiting field_descriptor constructor\n");
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::read(
+        const char *fname,
+        void *buffer)
+{
+    TIMEZONE("field_descriptor::read");
+    DEBUG_MSG("entered field_descriptor::read\n");
+    char representation[] = "native";
+    if (this->subsizes[0] > 0)
+    {
+        MPI_Info info;
+        MPI_Info_create(&info);
+        MPI_File f;
+        ptrdiff_t read_size = this->local_size*sizeof(rnumber);
+        DEBUG_MSG("read size is %ld\n", read_size);
+        char ffname[200];
+        if (this->mpi_dtype == mpi_real_type<rnumber>::complex())
+            read_size *= 2;
+        DEBUG_MSG("read size is %ld\n", read_size);
+        sprintf(ffname, "%s", fname);
+
+        MPI_File_open(
+                    this->io_comm,
+                    ffname,
+                    MPI_MODE_RDONLY,
+                    info,
+                    &f);
+        DEBUG_MSG("opened file\n");
+        MPI_File_set_view(
+                    f,
+                    0,
+                    MPI_UNSIGNED_CHAR,
+                    this->mpi_array_dtype,
+                    representation,
+                    info);
+        DEBUG_MSG("view is set\n");
+        MPI_File_read_all(
+                    f,
+                    buffer,
+                    read_size,
+                    MPI_UNSIGNED_CHAR,
+                    MPI_STATUS_IGNORE);
+        DEBUG_MSG("info is read\n");
+        MPI_File_close(&f);
+    }
+    DEBUG_MSG("finished with field_descriptor::read\n");
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::write(
+        const char *fname,
+        void *buffer)
+{
+    TIMEZONE("field_descriptor::write");
+    char representation[] = "native";
+    if (this->subsizes[0] > 0)
+    {
+        MPI_Info info;
+        MPI_Info_create(&info);
+        MPI_File f;
+        ptrdiff_t read_size = this->local_size*sizeof(rnumber);
+        char ffname[200];
+        if (this->mpi_dtype == mpi_real_type<rnumber>::complex())
+            read_size *= 2;
+        sprintf(ffname, "%s", fname);
+
+        MPI_File_open(
+                    this->io_comm,
+                    ffname,
+                    MPI_MODE_CREATE | MPI_MODE_WRONLY,
+                    info,
+                    &f);
+        MPI_File_set_view(
+                    f,
+                    0,
+                    MPI_UNSIGNED_CHAR,
+                    this->mpi_array_dtype,
+                    representation,
+                    info);
+        MPI_File_write_all(
+                    f,
+                    buffer,
+                    read_size,
+                    MPI_UNSIGNED_CHAR,
+                    MPI_STATUS_IGNORE);
+        MPI_File_close(&f);
+    }
+
+    return EXIT_SUCCESS;
+}
 
+template <class rnumber>
+int field_descriptor<rnumber>::transpose(
+        rnumber *input,
+        rnumber *output)
+{
+    TIMEZONE("field_descriptor::transpose");
+    /* IMPORTANT NOTE:
+     for 3D transposition, the input data is messed up */
+    typename fftw_interface<rnumber>::plan tplan;
+    if (this->ndims == 3)
+    {
+        /* transpose the two local dimensions 1 and 2 */
+        rnumber *atmp;
+        atmp = fftw_interface<rnumber>::alloc_real(this->slice_size);
+        for (int k = 0; k < this->subsizes[0]; k++)
+        {
+            /* put transposed slice in atmp */
+            for (int j = 0; j < this->sizes[1]; j++)
+                for (int i = 0; i < this->sizes[2]; i++)
+                    atmp[i*this->sizes[1] + j] =
+                            input[(k*this->sizes[1] + j)*this->sizes[2] + i];
+            /* copy back transposed slice */
+            std::copy(
+                        atmp,
+                        atmp + this->slice_size,
+                        input + k*this->slice_size);
+        }
+        fftw_interface<rnumber>::free(atmp);
+    }
+    tplan = fftw_interface<rnumber>::mpi_plan_transpose(
+                this->sizes[0], this->slice_size,
+                input, output,
+                this->comm,
+                DEFAULT_FFTW_FLAG);
+    fftw_interface<rnumber>::execute(tplan);
+    fftw_interface<rnumber>::destroy_plan(tplan);
+    return EXIT_SUCCESS;
+}
 
+template <class rnumber>
+int field_descriptor<rnumber>::transpose(
+        typename fftw_interface<rnumber>::complex *input,
+        typename fftw_interface<rnumber>::complex *output)
+{
+    TIMEZONE("field_descriptor::transpose2");
+    switch (this->ndims)
+    {
+    case 2:
+        /* do a global transpose over the 2 dimensions */
+        if (output == NULL)
+        {
+            std::cerr << "bad arguments for transpose." << std::endl;
+            return EXIT_FAILURE;
+        }
+        typename fftw_interface<rnumber>::plan tplan;
+        tplan = fftw_interface<rnumber>::mpi_plan_many_transpose(
+                    this->sizes[0], this->sizes[1], 2,
+                    FFTW_MPI_DEFAULT_BLOCK,
+                    FFTW_MPI_DEFAULT_BLOCK,
+                    (rnumber*)input, (rnumber*)output,
+                    this->comm,
+                    DEFAULT_FFTW_FLAG);
+        fftw_interface<rnumber>::execute(tplan);
+        fftw_interface<rnumber>::destroy_plan(tplan);
+        break;
+    case 3:
+        /* transpose the two local dimensions 1 and 2 */
+        typename fftw_interface<rnumber>::complex *atmp;
+        atmp = fftw_interface<rnumber>::alloc_complex(this->slice_size);
+        for (int k = 0; k < this->subsizes[0]; k++)
+        {
+            /* put transposed slice in atmp */
+            for (int j = 0; j < this->sizes[1]; j++)
+                for (int i = 0; i < this->sizes[2]; i++)
+                {
+                    atmp[i*this->sizes[1] + j][0] =
+                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][0];
+                    atmp[i*this->sizes[1] + j][1] =
+                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][1];
+                }
+            /* copy back transposed slice */
+            std::copy(
+                        (rnumber*)(atmp),
+                        (rnumber*)(atmp + this->slice_size),
+                        (rnumber*)(input + k*this->slice_size));
+        }
+        fftw_interface<rnumber>::free(atmp);
+        break;
+    default:
+        return EXIT_FAILURE;
+        break;
+    }
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::interleave(
+        rnumber *a,
+        int dim)
+{
+     TIMEZONE("field_descriptor::interleav");
+    /* the following is copied from
+ * http://agentzlerich.blogspot.com/2010/01/using-fftw-for-in-place-matrix.html
+ * */
+    typename fftw_interface<rnumber>::iodim howmany_dims[2];
+    howmany_dims[0].n  = dim;
+    howmany_dims[0].is = this->local_size;
+    howmany_dims[0].os = 1;
+    howmany_dims[1].n  = this->local_size;
+    howmany_dims[1].is = 1;
+    howmany_dims[1].os = dim;
+    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]);
+
+    typename fftw_interface<rnumber>::plan tmp = fftw_interface<rnumber>::plan_guru_r2r(
+                /*rank*/0,
+                /*dims*/nullptr,
+                howmany_rank,
+                howmany_dims,
+                a,
+                a,
+                /*kind*/nullptr,
+                DEFAULT_FFTW_FLAG);
+    fftw_interface<rnumber>::execute(tmp);
+    fftw_interface<rnumber>::destroy_plan(tmp);
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::interleave(
+        typename fftw_interface<rnumber>::complex *a,
+        int dim)
+{
+     TIMEZONE("field_descriptor::interleave2");
+    typename fftw_interface<rnumber>::iodim howmany_dims[2];
+    howmany_dims[0].n  = dim;
+    howmany_dims[0].is = this->local_size;
+    howmany_dims[0].os = 1;
+    howmany_dims[1].n  = this->local_size;
+    howmany_dims[1].is = 1;
+    howmany_dims[1].os = dim;
+    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]);
+
+    typename fftw_interface<rnumber>::plan tmp = fftw_interface<rnumber>::plan_guru_dft(
+                /*rank*/0,
+                /*dims*/nullptr,
+                howmany_rank,
+                howmany_dims,
+                a,
+                a,
+                +1,
+                DEFAULT_FFTW_FLAG);
+    fftw_interface<rnumber>::execute(tmp);
+    fftw_interface<rnumber>::destroy_plan(tmp);
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+field_descriptor<rnumber>* field_descriptor<rnumber>::get_transpose()
+{
+    TIMEZONE("field_descriptor::get_transpose");
+    int n[this->ndims];
+    for (int i=0; i<this->ndims; i++)
+        n[i] = this->sizes[this->ndims - i - 1];
+    return new field_descriptor<rnumber>(this->ndims, n, this->mpi_dtype, this->comm);
+}
 
 /*****************************************************************************/
-/* now actually use the macro defined above                                  */
-CLASS_IMPLEMENTATION(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-CLASS_IMPLEMENTATION(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
 /*****************************************************************************/
 
 
@@ -511,23 +505,23 @@ template <class rnumber>
 field_descriptor<rnumber>::~field_descriptor()
 {
     DEBUG_MSG_WAIT(
-            MPI_COMM_WORLD,
-            this->io_comm == MPI_COMM_NULL ? "null\n" : "not null\n");
+                MPI_COMM_WORLD,
+                this->io_comm == MPI_COMM_NULL ? "null\n" : "not null\n");
     DEBUG_MSG_WAIT(
-            MPI_COMM_WORLD,
-            "subsizes[0] = %d \n", this->subsizes[0]);
+                MPI_COMM_WORLD,
+                "subsizes[0] = %d \n", this->subsizes[0]);
     if (this->subsizes[0] > 0)
     {
         DEBUG_MSG_WAIT(
-                this->io_comm,
-                "deallocating mpi_array_dtype\n");
+                    this->io_comm,
+                    "deallocating mpi_array_dtype\n");
         MPI_Type_free(&this->mpi_array_dtype);
     }
     if (this->nprocs != this->io_nprocs && this->io_myrank != MPI_PROC_NULL)
     {
         DEBUG_MSG_WAIT(
-                this->io_comm,
-                "freeing io_comm\n");
+                    this->io_comm,
+                    "freeing io_comm\n");
         MPI_Comm_free(&this->io_comm);
     }
     delete[] this->sizes;
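
(With the macro gone, field_descriptor reads as a plain template; a usage sketch, where the file name and grid size are assumptions:

    int sizes[3] = {64, 64, 64};
    field_descriptor<float> *fd = new field_descriptor<float>(
            3, sizes, MPI_FLOAT, MPI_COMM_WORLD);
    float *buffer = fftwf_alloc_real(fd->local_size);
    fd->read("input_field.bin", (void*)buffer);   // collective MPI-IO read
    fftwf_free(buffer);
    delete fd;
)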
diff --git a/bfps/cpp/field_descriptor.hpp b/bfps/cpp/field_descriptor.hpp
index bfcf52ed415ddb90bd77a6c6793974aea6a94734..2fb491bca7c130704fc5de5d22c3393cb196eec7 100644
--- a/bfps/cpp/field_descriptor.hpp
+++ b/bfps/cpp/field_descriptor.hpp
@@ -26,6 +26,7 @@
 
 #include <mpi.h>
 #include <fftw3-mpi.h>
+#include "fftw_interface.hpp"
 
 #ifndef FIELD_DESCRIPTOR
 
@@ -85,14 +86,14 @@ class field_descriptor
                 rnumber *input,
                 rnumber *output);
         int transpose(
-                cnumber *input,
-                cnumber *output = NULL);
+                typename fftw_interface<rnumber>::complex *input,
+                typename fftw_interface<rnumber>::complex *output = NULL);
 
         int interleave(
                 rnumber *input,
                 int dim);
         int interleave(
-                cnumber *input,
+                typename fftw_interface<rnumber>::complex *input,
                 int dim);
 };
 
diff --git a/bfps/cpp/field_layout.cpp b/bfps/cpp/field_layout.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..908904991d5d95b0c89ba679b402d8d5727b8c85
--- /dev/null
+++ b/bfps/cpp/field_layout.cpp
@@ -0,0 +1,111 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+
+#include <cassert>
+#include "field_layout.hpp"
+#include "scope_timer.hpp"
+
+template <field_components fc>
+field_layout<fc>::field_layout(
+        const hsize_t *SIZES,
+        const hsize_t *SUBSIZES,
+        const hsize_t *STARTS,
+        const MPI_Comm COMM_TO_USE)
+{
+    TIMEZONE("field_layout::field_layout");
+    this->comm = COMM_TO_USE;
+    MPI_Comm_rank(this->comm, &this->myrank);
+    MPI_Comm_size(this->comm, &this->nprocs);
+
+    std::copy(SIZES, SIZES + 3, this->sizes);
+    std::copy(SUBSIZES, SUBSIZES + 3, this->subsizes);
+    std::copy(STARTS, STARTS + 3, this->starts);
+    if (fc == THREE || fc == THREExTHREE)
+    {
+        this->sizes[3] = 3;
+        this->subsizes[3] = 3;
+        this->starts[3] = 0;
+    }
+    if (fc == THREExTHREE)
+    {
+        this->sizes[4] = 3;
+        this->subsizes[4] = 3;
+        this->starts[4] = 0;
+    }
+    this->local_size = 1;
+    this->full_size = 1;
+    for (unsigned int i=0; i<ndim(fc); i++)
+    {
+        this->local_size *= this->subsizes[i];
+        this->full_size *= this->sizes[i];
+    }
+
+    /*field will at most be distributed in 2D*/
+    this->rank.resize(2);
+    this->all_start.resize(2);
+    this->all_size.resize(2);
+    for (int i=0; i<2; i++)
+    {
+        this->rank[i].resize(this->sizes[i]);
+        std::vector<int> local_rank;
+        local_rank.resize(this->sizes[i], 0);
+        for (unsigned int ii=this->starts[i]; ii<this->starts[i]+this->subsizes[i]; ii++)
+            local_rank[ii] = this->myrank;
+        MPI_Allreduce(
+                &local_rank.front(),
+                &this->rank[i].front(),
+                this->sizes[i],
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+        this->all_start[i].resize(this->nprocs);
+        std::vector<int> local_start;
+        local_start.resize(this->nprocs, 0);
+        local_start[this->myrank] = this->starts[i];
+        MPI_Allreduce(
+                &local_start.front(),
+                &this->all_start[i].front(),
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+        this->all_size[i].resize(this->nprocs);
+        std::vector<int> local_subsize;
+        local_subsize.resize(this->nprocs, 0);
+        local_subsize[this->myrank] = this->subsizes[i];
+        MPI_Allreduce(
+                &local_subsize.front(),
+                &this->all_size[i].front(),
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    }
+}
+
+template class field_layout<ONE>;
+template class field_layout<THREE>;
+template class field_layout<THREExTHREE>;
+
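(A construction sketch for the new class, assuming an FFTW-style slab decomposition with `local_n0` slices starting at `local_0_start` on the current rank:

    hsize_t sizes[3]    = {nz, ny, nx};
    hsize_t subsizes[3] = {local_n0, ny, nx};
    hsize_t starts[3]   = {local_0_start, 0, 0};
    field_layout<THREE> *layout = new field_layout<THREE>(
            sizes, subsizes, starts, MPI_COMM_WORLD);
    // the trailing component axis (sizes[3] = 3 here) is filled in by the constructor
)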
diff --git a/bfps/cpp/field_layout.hpp b/bfps/cpp/field_layout.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..770119c2dcb05017d495b62559f050646872dc84
--- /dev/null
+++ b/bfps/cpp/field_layout.hpp
@@ -0,0 +1,79 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+
+
+#include <vector>
+#include "base.hpp"
+
+#ifndef FIELD_LAYOUT_HPP
+
+#define FIELD_LAYOUT_HPP
+
+enum field_components {ONE, THREE, THREExTHREE};
+
+constexpr unsigned int ncomp(
+        field_components fc)
+    /* return actual number of field components for each enum value */
+{
+    return ((fc == THREE) ? 3 : (
+            (fc == THREExTHREE) ? 9 : 1));
+}
+
+constexpr unsigned int ndim(
+        field_components fc)
+    /* return actual number of field dimensions for each enum value */
+{
+    return ((fc == THREE) ? 4 : (
+            (fc == THREExTHREE) ? 5 : 3));
+}
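+
+/* illustrative compile-time checks of the two helpers above (not part
+   of the build; they hold because both functions are constexpr):
+
+   static_assert(ncomp(ONE) == 1, "scalar field");
+   static_assert(ncomp(THREExTHREE) == 9, "3x3 tensor field");
+   static_assert(ndim(THREE) == 4, "three spatial dims + one component dim");
+*/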
+
+template <field_components fc>
+class field_layout
+{
+    public:
+        /* description */
+        hsize_t sizes[ndim(fc)];
+        hsize_t subsizes[ndim(fc)];
+        hsize_t starts[ndim(fc)];
+        hsize_t local_size, full_size;
+
+        int myrank, nprocs;
+        MPI_Comm comm;
+
+        std::vector<std::vector<int>> rank;
+        std::vector<std::vector<int>> all_start;
+        std::vector<std::vector<int>> all_size;
+
+        /* methods */
+        field_layout(
+                const hsize_t *SIZES,
+                const hsize_t *SUBSIZES,
+                const hsize_t *STARTS,
+                const MPI_Comm COMM_TO_USE);
+        ~field_layout(){}
+};
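+
+/* usage sketch (hypothetical sizes; assumes an even slab decomposition
+   of the slowest axis, with `myrank` and `nprocs` taken from the
+   communicator):
+
+   hsize_t sizes[3]    = {64, 64, 64};
+   hsize_t subsizes[3] = {64/nprocs, 64, 64};
+   hsize_t starts[3]   = {myrank*(64/nprocs), 0, 0};
+   field_layout<ONE> layout(sizes, subsizes, starts, MPI_COMM_WORLD);
+   // layout.local_size / layout.full_size hold element counts, and
+   // layout.rank[0][i] gives the rank that owns slice i.
+*/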
+
+#endif//FIELD_LAYOUT_HPP
+
diff --git a/bfps/cpp/fluid_solver.cpp b/bfps/cpp/fluid_solver.cpp
index a634117bc43075db475be47256f1579b39bc1193..319186103797f8135d4d3e2244ed5e3a8f271b00 100644
--- a/bfps/cpp/fluid_solver.cpp
+++ b/bfps/cpp/fluid_solver.cpp
@@ -31,7 +31,8 @@
 #include <cstring>
 #include "fluid_solver.hpp"
 #include "fftw_tools.hpp"
-
+#include "scope_timer.hpp"
+#include "shared_array.hpp"
 
 
 template <class rnumber>
@@ -48,911 +49,1003 @@ void fluid_solver<rnumber>::impose_zero_modes()
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
-#define FLUID_SOLVER_DEFINITIONS(FFTW, R, MPI_RNUM, MPI_CNUM) \
- \
-template<> \
-fluid_solver<R>::fluid_solver( \
-        const char *NAME, \
-        int nx, \
-        int ny, \
-        int nz, \
-        double DKX, \
-        double DKY, \
-        double DKZ, \
-        int DEALIAS_TYPE, \
-        unsigned FFTW_PLAN_RIGOR) : fluid_solver_base<R>( \
-                NAME, \
-                nx , ny , nz, \
-                DKX, DKY, DKZ, \
-                DEALIAS_TYPE, \
-                FFTW_PLAN_RIGOR) \
-{ \
-    this->cvorticity = FFTW(alloc_complex)(this->cd->local_size);\
-    this->cvelocity  = FFTW(alloc_complex)(this->cd->local_size);\
-    this->rvorticity = FFTW(alloc_real)(this->cd->local_size*2);\
-    /*this->rvelocity  = (R*)(this->cvelocity);*/\
-    this->rvelocity  = FFTW(alloc_real)(this->cd->local_size*2);\
- \
-    this->ru = this->rvelocity;\
-    this->cu = this->cvelocity;\
- \
-    this->rv[0] = this->rvorticity;\
-    this->rv[3] = this->rvorticity;\
-    this->cv[0] = this->cvorticity;\
-    this->cv[3] = this->cvorticity;\
- \
-    this->cv[1] = FFTW(alloc_complex)(this->cd->local_size);\
-    this->cv[2] = this->cv[1];\
-    this->rv[1] = FFTW(alloc_real)(this->cd->local_size*2);\
-    this->rv[2] = this->rv[1];\
- \
-    this->c2r_vorticity = new FFTW(plan);\
-    this->r2c_vorticity = new FFTW(plan);\
-    this->c2r_velocity  = new FFTW(plan);\
-    this->r2c_velocity  = new FFTW(plan);\
- \
-    ptrdiff_t sizes[] = {nz, \
-                         ny, \
-                         nx};\
- \
-    *(FFTW(plan)*)this->c2r_vorticity = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cvorticity, this->rvorticity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->r2c_vorticity = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rvorticity, this->cvorticity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    *(FFTW(plan)*)this->c2r_velocity = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cvelocity, this->rvelocity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->r2c_velocity = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rvelocity, this->cvelocity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    this->uc2r = this->c2r_velocity;\
-    this->ur2c = this->r2c_velocity;\
-    this->vc2r[0] = this->c2r_vorticity;\
-    this->vr2c[0] = this->r2c_vorticity;\
- \
-    this->vc2r[1] = new FFTW(plan);\
-    this->vr2c[1] = new FFTW(plan);\
-    this->vc2r[2] = new FFTW(plan);\
-    this->vr2c[2] = new FFTW(plan);\
- \
-    *(FFTW(plan)*)(this->vc2r[1]) = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cv[1], this->rv[1], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->vc2r[2] = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cv[2], this->rv[2], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->vr2c[1] = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rv[1], this->cv[1], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    *(FFTW(plan)*)this->vr2c[2] = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rv[2], this->cv[2], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    /* ``physical'' parameters etc, initialized here just in case */ \
- \
-    this->nu = 0.1; \
-    this->fmode = 1; \
-    this->famplitude = 1.0; \
-    this->fk0  = 0; \
-    this->fk1 = 3.0; \
-    /* initialization of fields must be done AFTER planning */ \
-    std::fill_n((R*)this->cvorticity, this->cd->local_size*2, 0.0); \
-    std::fill_n((R*)this->cvelocity, this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rvelocity, this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); \
-    std::fill_n((R*)this->cv[1], this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rv[1], this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rv[2], this->cd->local_size*2, 0.0); \
-} \
- \
-template<> \
-fluid_solver<R>::~fluid_solver() \
-{ \
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_vorticity);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_vorticity);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_velocity );\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_velocity );\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[1]);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[1]);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[2]);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[2]);\
- \
-    delete (FFTW(plan)*)this->c2r_vorticity;\
-    delete (FFTW(plan)*)this->r2c_vorticity;\
-    delete (FFTW(plan)*)this->c2r_velocity ;\
-    delete (FFTW(plan)*)this->r2c_velocity ;\
-    delete (FFTW(plan)*)this->vc2r[1];\
-    delete (FFTW(plan)*)this->vr2c[1];\
-    delete (FFTW(plan)*)this->vc2r[2];\
-    delete (FFTW(plan)*)this->vr2c[2];\
- \
-    FFTW(free)(this->cv[1]);\
-    FFTW(free)(this->rv[1]);\
-    FFTW(free)(this->cvorticity);\
-    FFTW(free)(this->rvorticity);\
-    FFTW(free)(this->cvelocity);\
-    FFTW(free)(this->rvelocity);\
-} \
- \
-template<> \
-void fluid_solver<R>::compute_vorticity() \
-{ \
-    ptrdiff_t tindex; \
-    CLOOP_K2( \
-            this, \
-            tindex = 3*cindex; \
-            if (k2 <= this->kM2) \
-            { \
-                this->cvorticity[tindex+0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); \
-                this->cvorticity[tindex+1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); \
-                this->cvorticity[tindex+2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); \
-                this->cvorticity[tindex+0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); \
-                this->cvorticity[tindex+1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); \
-                this->cvorticity[tindex+2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); \
-            } \
-            else \
-                std::fill_n((R*)(this->cvorticity+tindex), 6, 0.0); \
-            ); \
-    this->symmetrize(this->cvorticity, 3); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_velocity(FFTW(complex) *vorticity) \
-{ \
-    ptrdiff_t tindex; \
-    CLOOP_K2( \
-            this, \
-            tindex = 3*cindex; \
-            if (k2 <= this->kM2 && k2 > 0) \
-            { \
-                this->cu[tindex+0][0] = -(this->ky[yindex]*vorticity[tindex+2][1] - this->kz[zindex]*vorticity[tindex+1][1]) / k2; \
-                this->cu[tindex+1][0] = -(this->kz[zindex]*vorticity[tindex+0][1] - this->kx[xindex]*vorticity[tindex+2][1]) / k2; \
-                this->cu[tindex+2][0] = -(this->kx[xindex]*vorticity[tindex+1][1] - this->ky[yindex]*vorticity[tindex+0][1]) / k2; \
-                this->cu[tindex+0][1] =  (this->ky[yindex]*vorticity[tindex+2][0] - this->kz[zindex]*vorticity[tindex+1][0]) / k2; \
-                this->cu[tindex+1][1] =  (this->kz[zindex]*vorticity[tindex+0][0] - this->kx[xindex]*vorticity[tindex+2][0]) / k2; \
-                this->cu[tindex+2][1] =  (this->kx[xindex]*vorticity[tindex+1][0] - this->ky[yindex]*vorticity[tindex+0][0]) / k2; \
-            } \
-            else \
-                std::fill_n((R*)(this->cu+tindex), 6, 0.0); \
-            ); \
-    /*this->symmetrize(this->cu, 3);*/ \
-} \
- \
-template<> \
-void fluid_solver<R>::ift_velocity() \
-{ \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::ift_vorticity() \
-{ \
-    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_vorticity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::dft_velocity() \
-{ \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::dft_vorticity() \
-{ \
-    std::fill_n((R*)this->cvorticity, this->cd->local_size*2, 0.0); \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::add_forcing(\
-        FFTW(complex) *acc_field, FFTW(complex) *vort_field, R factor) \
-{ \
-    if (strcmp(this->forcing_type, "none") == 0) \
-        return; \
-    if (strcmp(this->forcing_type, "Kolmogorov") == 0) \
-    { \
-        ptrdiff_t cindex; \
-        if (this->cd->myrank == this->cd->rank[this->fmode]) \
-        { \
-            cindex = ((this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; \
-            acc_field[cindex+2][0] -= this->famplitude*factor/2; \
-        } \
-        if (this->cd->myrank == this->cd->rank[this->cd->sizes[0] - this->fmode]) \
-        { \
-            cindex = ((this->cd->sizes[0] - this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; \
-            acc_field[cindex+2][0] -= this->famplitude*factor/2; \
-        } \
-        return; \
-    } \
-    if (strcmp(this->forcing_type, "linear") == 0) \
-    { \
-        double knorm; \
-        CLOOP( \
-                this, \
-                knorm = sqrt(this->kx[xindex]*this->kx[xindex] + \
-                             this->ky[yindex]*this->ky[yindex] + \
-                             this->kz[zindex]*this->kz[zindex]); \
-                if ((this->fk0 <= knorm) && \
-                    (this->fk1 >= knorm)) \
-                    for (int c=0; c<3; c++) \
-                    for (int i=0; i<2; i++) \
-                        acc_field[cindex*3+c][i] += this->famplitude*vort_field[cindex*3+c][i]*factor; \
-             ); \
-        return; \
-    } \
-} \
- \
-template<> \
-void fluid_solver<R>::omega_nonlin( \
-        int src) \
-{ \
-    assert(src >= 0 && src < 3); \
-    this->compute_velocity(this->cv[src]); \
-    /* get fields from Fourier space to real space */ \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity ));  \
-    FFTW(execute)(*((FFTW(plan)*)this->vc2r[src]));      \
-    /* compute cross product $u \times \omega$, and normalize */ \
-    R tmp[3][2]; \
-    ptrdiff_t tindex; \
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                tmp[cc][0] = (this->ru[tindex+(cc+1)%3]*this->rv[src][tindex+(cc+2)%3] - \
-                              this->ru[tindex+(cc+2)%3]*this->rv[src][tindex+(cc+1)%3]); \
-            for (int cc=0; cc<3; cc++) \
-                this->ru[(3*rindex)+cc] = tmp[cc][0] / this->normalization_factor; \
-            ); \
-    /* go back to Fourier space */ \
-    this->clean_up_real_space(this->ru, 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \
-    this->dealias(this->cu, 3); \
-    /* $\imath k \times Fourier(u \times \omega)$ */ \
-    CLOOP( \
-            this, \
-            tindex = 3*cindex; \
-            { \
-                tmp[0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); \
-                tmp[1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); \
-                tmp[2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); \
-                tmp[0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); \
-                tmp[1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); \
-                tmp[2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); \
-            } \
-            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                this->cu[tindex+cc][i] = tmp[cc][i]; \
-            ); \
-    this->add_forcing(this->cu, this->cv[src], 1.0); \
-    this->force_divfree(this->cu); \
-} \
- \
-template<> \
-void fluid_solver<R>::step(double dt) \
-{ \
-    double factor0, factor1; \
-    std::fill_n((R*)this->cv[1], this->cd->local_size*2, 0.0); \
-    this->omega_nonlin(0); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                factor0 = exp(-this->nu * k2 * dt); \
-                for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                    this->cv[1][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i] + \
-                                                   dt*this->cu[3*cindex+cc][i])*factor0; \
-            } \
-            ); \
- \
-    this->omega_nonlin(1); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                factor0 = exp(-this->nu * k2 * dt/2); \
-                factor1 = exp( this->nu * k2 * dt/2); \
-                for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                    this->cv[2][3*cindex+cc][i] = (3*this->cv[0][3*cindex+cc][i]*factor0 + \
-                                                   (this->cv[1][3*cindex+cc][i] + \
-                                                    dt*this->cu[3*cindex+cc][i])*factor1)*0.25; \
-            } \
-            ); \
- \
-    this->omega_nonlin(2); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                factor0 = exp(-this->nu * k2 * dt * 0.5); \
-                for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                    this->cv[3][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i]*factor0 + \
-                                                   2*(this->cv[2][3*cindex+cc][i] + \
-                                                      dt*this->cu[3*cindex+cc][i]))*factor0/3; \
-            } \
-            ); \
- \
-    this->force_divfree(this->cvorticity); \
-    this->symmetrize(this->cvorticity, 3); \
-    this->iteration++; \
-} \
- \
-template<> \
-int fluid_solver<R>::read(char field, char representation) \
-{ \
-    char fname[512]; \
-    int read_result; \
-    if (field == 'v') \
-    { \
-        if (representation == 'c') \
-        { \
-            this->fill_up_filename("cvorticity", fname); \
-            read_result = this->cd->read(fname, (void*)this->cvorticity); \
-            if (read_result != EXIT_SUCCESS) \
-                return read_result; \
-        } \
-        if (representation == 'r') \
-        { \
-            read_result = this->read_base("rvorticity", this->rvorticity); \
-            if (read_result != EXIT_SUCCESS) \
-                return read_result; \
-            else \
-                FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity )); \
-        } \
-        this->low_pass_Fourier(this->cvorticity, 3, this->kM); \
-        this->force_divfree(this->cvorticity); \
-        this->symmetrize(this->cvorticity, 3); \
-        return EXIT_SUCCESS; \
-    } \
-    if ((field == 'u') && (representation == 'c')) \
-    { \
-        read_result = this->read_base("cvelocity", this->cvelocity); \
-        this->low_pass_Fourier(this->cvelocity, 3, this->kM); \
-        this->force_divfree(this->cvorticity); \
-        this->symmetrize(this->cvorticity, 3); \
-        return read_result; \
-    } \
-    if ((field == 'u') && (representation == 'r')) \
-        return this->read_base("rvelocity", this->rvelocity); \
-    return EXIT_FAILURE; \
-} \
- \
-template<> \
-int fluid_solver<R>::write(char field, char representation) \
-{ \
-    char fname[512]; \
-    if ((field == 'v') && (representation == 'c')) \
-    { \
-        this->fill_up_filename("cvorticity", fname); \
-        return this->cd->write(fname, (void*)this->cvorticity); \
-    } \
-    if ((field == 'v') && (representation == 'r')) \
-    { \
-        FFTW(execute)(*((FFTW(plan)*)this->c2r_vorticity )); \
-        clip_zero_padding<R>(this->rd, this->rvorticity, 3); \
-        this->fill_up_filename("rvorticity", fname); \
-        return this->rd->write(fname, this->rvorticity); \
-    } \
-    this->compute_velocity(this->cvorticity); \
-    if ((field == 'u') && (representation == 'c')) \
-    { \
-        this->fill_up_filename("cvelocity", fname); \
-        return this->cd->write(fname, this->cvelocity); \
-    } \
-    if ((field == 'u') && (representation == 'r')) \
-    { \
-        this->ift_velocity(); \
-        clip_zero_padding<R>(this->rd, this->rvelocity, 3); \
-        this->fill_up_filename("rvelocity", fname); \
-        return this->rd->write(fname, this->rvelocity); \
-    } \
-    return EXIT_FAILURE; \
-} \
- \
-template<> \
-int fluid_solver<R>::write_rTrS2() \
-{ \
-    char fname[512]; \
-    this->fill_up_filename("rTrS2", fname); \
-    FFTW(complex) *ca; \
-    R *ra; \
-    ca = FFTW(alloc_complex)(this->cd->local_size*3); \
-    ra = (R*)(ca); \
-    this->compute_velocity(this->cvorticity); \
-    this->compute_vector_gradient(ca, this->cvelocity); \
-    for (int cc=0; cc<3; cc++) \
-    { \
-        std::copy( \
-                (R*)(ca + cc*this->cd->local_size), \
-                (R*)(ca + (cc+1)*this->cd->local_size), \
-                (R*)this->cv[1]); \
-        FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \
-        std::copy( \
-                this->rv[1], \
-                this->rv[1] + this->cd->local_size*2, \
-                ra + cc*this->cd->local_size*2); \
-    } \
-    /* velocity gradient is now stored, in real space, in ra */ \
-    R *dx_u, *dy_u, *dz_u; \
-    dx_u = ra; \
-    dy_u = ra + 2*this->cd->local_size; \
-    dz_u = ra + 4*this->cd->local_size; \
-    R *trS2 = FFTW(alloc_real)((this->cd->local_size/3)*2); \
-    double average_local = 0; \
-    RLOOP( \
-            this, \
-            R AxxAxx; \
-            R AyyAyy; \
-            R AzzAzz; \
-            R Sxy; \
-            R Syz; \
-            R Szx; \
-            ptrdiff_t tindex = 3*rindex; \
-            AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; \
-            AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; \
-            AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; \
-            Sxy = dx_u[tindex+1]+dy_u[tindex+0]; \
-            Syz = dy_u[tindex+2]+dz_u[tindex+1]; \
-            Szx = dz_u[tindex+0]+dx_u[tindex+2]; \
-            trS2[rindex] = (AxxAxx + AyyAyy + AzzAzz + \
-                            (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); \
-            average_local += trS2[rindex]; \
-            ); \
-    double average; \
-    MPI_Allreduce( \
-            &average_local, \
-            &average, \
-            1, \
-            MPI_DOUBLE, MPI_SUM, this->cd->comm); \
-    DEBUG_MSG("average TrS2 is %g\n", average); \
-    FFTW(free)(ca); \
-    /* output goes here */ \
-    int ntmp[3]; \
-    ntmp[0] = this->rd->sizes[0]; \
-    ntmp[1] = this->rd->sizes[1]; \
-    ntmp[2] = this->rd->sizes[2]; \
-    field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \
-    clip_zero_padding<R>(scalar_descriptor, trS2, 1); \
-    int return_value = scalar_descriptor->write(fname, trS2); \
-    delete scalar_descriptor; \
-    FFTW(free)(trS2); \
-    return return_value; \
-} \
- \
-template<> \
-int fluid_solver<R>::write_renstrophy() \
-{ \
-    char fname[512]; \
-    this->fill_up_filename("renstrophy", fname); \
-    R *enstrophy = FFTW(alloc_real)((this->cd->local_size/3)*2); \
-    this->ift_vorticity(); \
-    double average_local = 0; \
-    RLOOP( \
-            this, \
-            ptrdiff_t tindex = 3*rindex; \
-            enstrophy[rindex] = ( \
-                this->rvorticity[tindex+0]*this->rvorticity[tindex+0] + \
-                this->rvorticity[tindex+1]*this->rvorticity[tindex+1] + \
-                this->rvorticity[tindex+2]*this->rvorticity[tindex+2] \
-                )/2; \
-            average_local += enstrophy[rindex]; \
-            ); \
-    double average; \
-    MPI_Allreduce( \
-            &average_local, \
-            &average, \
-            1, \
-            MPI_DOUBLE, MPI_SUM, this->cd->comm); \
-    DEBUG_MSG("average enstrophy is %g\n", average); \
-    /* output goes here */ \
-    int ntmp[3]; \
-    ntmp[0] = this->rd->sizes[0]; \
-    ntmp[1] = this->rd->sizes[1]; \
-    ntmp[2] = this->rd->sizes[2]; \
-    field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \
-    clip_zero_padding<R>(scalar_descriptor, enstrophy, 1); \
-    int return_value = scalar_descriptor->write(fname, enstrophy); \
-    delete scalar_descriptor; \
-    FFTW(free)(enstrophy); \
-    return return_value; \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_pressure(FFTW(complex) *pressure) \
-{ \
-    /* assume velocity is already in real space representation */ \
-    ptrdiff_t tindex; \
-    \
-    /* diagonal terms 11 22 33 */\
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc]; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2 && k2 > 0) \
-            { \
-                tindex = 3*cindex; \
-                for (int i=0; i<2; i++) \
-                { \
-                    pressure[cindex][i] = -(this->kx[xindex]*this->kx[xindex]*this->cv[1][tindex+0][i] + \
-                                            this->ky[yindex]*this->ky[yindex]*this->cv[1][tindex+1][i] + \
-                                            this->kz[zindex]*this->kz[zindex]*this->cv[1][tindex+2][i]); \
-                } \
-            } \
-            else \
-                std::fill_n((R*)(pressure+cindex), 2, 0.0); \
-            ); \
-    /* off-diagonal terms 12 23 31 */\
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3]; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2 && k2 > 0) \
-            { \
-                tindex = 3*cindex; \
-                for (int i=0; i<2; i++) \
-                { \
-                    pressure[cindex][i] -= 2*(this->kx[xindex]*this->ky[yindex]*this->cv[1][tindex+0][i] + \
-                                              this->ky[yindex]*this->kz[zindex]*this->cv[1][tindex+1][i] + \
-                                              this->kz[zindex]*this->kx[xindex]*this->cv[1][tindex+2][i]); \
-                    pressure[cindex][i] /= this->normalization_factor*k2; \
-                } \
-            } \
-            ); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_gradient_statistics( \
-        FFTW(complex) *vec, \
-        double *gradu_moments, \
-        double *trS2QR_moments, \
-        ptrdiff_t *gradu_hist, \
-        ptrdiff_t *trS2QR_hist, \
-        ptrdiff_t *QR2D_hist, \
-        double trS2QR_max_estimates[], \
-        double gradu_max_estimates[], \
-        int nbins, \
-        int QR2D_nbins) \
-{ \
-    FFTW(complex) *ca; \
-    R *ra; \
-    ca = FFTW(alloc_complex)(this->cd->local_size*3); \
-    ra = (R*)(ca); \
-    this->compute_vector_gradient(ca, vec); \
-    for (int cc=0; cc<3; cc++) \
-    { \
-        std::copy( \
-                (R*)(ca + cc*this->cd->local_size), \
-                (R*)(ca + (cc+1)*this->cd->local_size), \
-                (R*)this->cv[1]); \
-        FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \
-        std::copy( \
-                this->rv[1], \
-                this->rv[1] + this->cd->local_size*2, \
-                ra + cc*this->cd->local_size*2); \
-    } \
-    /* velocity gradient is now stored, in real space, in ra */ \
-    std::fill_n(this->rv[1], 2*this->cd->local_size, 0.0); \
-    R *dx_u, *dy_u, *dz_u; \
-    dx_u = ra; \
-    dy_u = ra + 2*this->cd->local_size; \
-    dz_u = ra + 4*this->cd->local_size; \
-    double binsize[2]; \
-    double tmp_max_estimate[3]; \
-    tmp_max_estimate[0] = trS2QR_max_estimates[0]; \
-    tmp_max_estimate[1] = trS2QR_max_estimates[1]; \
-    tmp_max_estimate[2] = trS2QR_max_estimates[2]; \
-    binsize[0] = 2*tmp_max_estimate[2] / QR2D_nbins; \
-    binsize[1] = 2*tmp_max_estimate[1] / QR2D_nbins; \
-    ptrdiff_t *local_hist = new ptrdiff_t[QR2D_nbins*QR2D_nbins]; \
-    std::fill_n(local_hist, QR2D_nbins*QR2D_nbins, 0); \
-    RLOOP( \
-            this, \
-            R AxxAxx; \
-            R AyyAyy; \
-            R AzzAzz; \
-            R AxyAyx; \
-            R AyzAzy; \
-            R AzxAxz; \
-            R Sxy; \
-            R Syz; \
-            R Szx; \
-            ptrdiff_t tindex = 3*rindex; \
-            AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; \
-            AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; \
-            AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; \
-            AxyAyx = dx_u[tindex+1]*dy_u[tindex+0]; \
-            AyzAzy = dy_u[tindex+2]*dz_u[tindex+1]; \
-            AzxAxz = dz_u[tindex+0]*dx_u[tindex+2]; \
-            this->rv[1][tindex+1] = - (AxxAxx + AyyAyy + AzzAzz)/2 - AxyAyx - AyzAzy - AzxAxz; \
-            this->rv[1][tindex+2] = - (dx_u[tindex+0]*(AxxAxx/3 + AxyAyx + AzxAxz) + \
-                                       dy_u[tindex+1]*(AyyAyy/3 + AxyAyx + AyzAzy) + \
-                                       dz_u[tindex+2]*(AzzAzz/3 + AzxAxz + AyzAzy) + \
-                                       dx_u[tindex+1]*dy_u[tindex+2]*dz_u[tindex+0] + \
-                                       dx_u[tindex+2]*dy_u[tindex+0]*dz_u[tindex+1]); \
-            int bin0 = int(floor((this->rv[1][tindex+2] + tmp_max_estimate[2]) / binsize[0])); \
-            int bin1 = int(floor((this->rv[1][tindex+1] + tmp_max_estimate[1]) / binsize[1])); \
-            if ((bin0 >= 0 && bin0 < QR2D_nbins) && \
-                (bin1 >= 0 && bin1 < QR2D_nbins)) \
-                local_hist[bin1*QR2D_nbins + bin0]++; \
-            Sxy = dx_u[tindex+1]+dy_u[tindex+0]; \
-            Syz = dy_u[tindex+2]+dz_u[tindex+1]; \
-            Szx = dz_u[tindex+0]+dx_u[tindex+2]; \
-            this->rv[1][tindex] = (AxxAxx + AyyAyy + AzzAzz + \
-                                   (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); \
-            ); \
-    MPI_Allreduce( \
-            local_hist, \
-            QR2D_hist, \
-            QR2D_nbins * QR2D_nbins, \
-            MPI_INT64_T, MPI_SUM, this->cd->comm); \
-    delete[] local_hist; \
-    this->compute_rspace_stats3( \
-            this->rv[1], \
-            trS2QR_moments, \
-            trS2QR_hist, \
-            tmp_max_estimate, \
-            nbins); \
-    double *tmp_moments = new double[10*3]; \
-    ptrdiff_t *tmp_hist = new ptrdiff_t[nbins*3]; \
-    for (int cc=0; cc<3; cc++) \
-    { \
-        tmp_max_estimate[0] = gradu_max_estimates[cc*3 + 0]; \
-        tmp_max_estimate[1] = gradu_max_estimates[cc*3 + 1]; \
-        tmp_max_estimate[2] = gradu_max_estimates[cc*3 + 2]; \
-        this->compute_rspace_stats3( \
-                dx_u, \
-                tmp_moments, \
-                tmp_hist, \
-                tmp_max_estimate, \
-                nbins); \
-        for (int n = 0; n < 10; n++) \
-        for (int i = 0; i < 3 ; i++) \
-        { \
-            gradu_moments[(n*3 + cc)*3 + i] = tmp_moments[n*3 + i]; \
-        } \
-        for (int n = 0; n < nbins; n++) \
-        for (int i = 0; i < 3; i++) \
-        { \
-            gradu_hist[(n*3 + cc)*3 + i] = tmp_hist[n*3 + i]; \
-        } \
-    } \
-    delete[] tmp_moments; \
-    delete[] tmp_hist; \
-    FFTW(free)(ca); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_Lagrangian_acceleration(R (*acceleration)[2]) \
-{ \
-    ptrdiff_t tindex; \
-    FFTW(complex) *pressure; \
-    pressure = FFTW(alloc_complex)(this->cd->local_size/3); \
-    this->compute_velocity(this->cvorticity); \
-    this->ift_velocity(); \
-    this->compute_pressure(pressure); \
-    this->compute_velocity(this->cvorticity); \
-    std::fill_n((R*)this->cv[1], 2*this->cd->local_size, 0.0); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                for (int cc=0; cc<3; cc++) \
-                    for (int i=0; i<2; i++) \
-                        this->cv[1][tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; \
-                if (strcmp(this->forcing_type, "linear") == 0) \
-                { \
-                    double knorm = sqrt(k2); \
-                    if ((this->fk0 <= knorm) && \
-                        (this->fk1 >= knorm)) \
-                        for (int c=0; c<3; c++) \
-                            for (int i=0; i<2; i++) \
-                                this->cv[1][tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; \
-                } \
-                this->cv[1][tindex+0][0] += this->kx[xindex]*pressure[cindex][1]; \
-                this->cv[1][tindex+1][0] += this->ky[yindex]*pressure[cindex][1]; \
-                this->cv[1][tindex+2][0] += this->kz[zindex]*pressure[cindex][1]; \
-                this->cv[1][tindex+0][1] -= this->kx[xindex]*pressure[cindex][0]; \
-                this->cv[1][tindex+1][1] -= this->ky[yindex]*pressure[cindex][0]; \
-                this->cv[1][tindex+2][1] -= this->kz[zindex]*pressure[cindex][0]; \
-            } \
-            ); \
-    std::copy( \
-            (R*)this->cv[1], \
-            (R*)(this->cv[1] + this->cd->local_size), \
-            (R*)acceleration); \
-    FFTW(free)(pressure); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_Eulerian_acceleration(FFTW(complex) *acceleration) \
-{ \
-    std::fill_n((R*)(acceleration), 2*this->cd->local_size, 0.0); \
-    ptrdiff_t tindex; \
-    this->compute_velocity(this->cvorticity); \
-    /* put in linear terms */ \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                for (int cc=0; cc<3; cc++) \
-                    for (int i=0; i<2; i++) \
-                        acceleration[tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; \
-                if (strcmp(this->forcing_type, "linear") == 0) \
-                { \
-                    double knorm = sqrt(k2); \
-                    if ((this->fk0 <= knorm) && \
-                        (this->fk1 >= knorm)) \
-                    { \
-                        for (int c=0; c<3; c++) \
-                            for (int i=0; i<2; i++) \
-                                acceleration[tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; \
-                    } \
-                } \
-            } \
-            ); \
-    this->ift_velocity(); \
-    /* compute uu */ \
-    /* 11 22 33 */ \
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc] / this->normalization_factor; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                acceleration[tindex+0][0] += \
-                         this->kx[xindex]*this->cv[1][tindex+0][1]; \
-                acceleration[tindex+0][1] += \
-                        -this->kx[xindex]*this->cv[1][tindex+0][0]; \
-                acceleration[tindex+1][0] += \
-                         this->ky[yindex]*this->cv[1][tindex+1][1]; \
-                acceleration[tindex+1][1] += \
-                        -this->ky[yindex]*this->cv[1][tindex+1][0]; \
-                acceleration[tindex+2][0] += \
-                         this->kz[zindex]*this->cv[1][tindex+2][1]; \
-                acceleration[tindex+2][1] += \
-                        -this->kz[zindex]*this->cv[1][tindex+2][0]; \
-            } \
-            ); \
-    /* 12 23 31 */ \
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3] / this->normalization_factor; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                acceleration[tindex+0][0] += \
-                        (this->ky[yindex]*this->cv[1][tindex+0][1] + \
-                         this->kz[zindex]*this->cv[1][tindex+2][1]); \
-                acceleration[tindex+0][1] += \
-                      - (this->ky[yindex]*this->cv[1][tindex+0][0] + \
-                         this->kz[zindex]*this->cv[1][tindex+2][0]); \
-                acceleration[tindex+1][0] += \
-                        (this->kz[zindex]*this->cv[1][tindex+1][1] + \
-                         this->kx[xindex]*this->cv[1][tindex+0][1]); \
-                acceleration[tindex+1][1] += \
-                      - (this->kz[zindex]*this->cv[1][tindex+1][0] + \
-                         this->kx[xindex]*this->cv[1][tindex+0][0]); \
-                acceleration[tindex+2][0] += \
-                        (this->kx[xindex]*this->cv[1][tindex+2][1] + \
-                         this->ky[yindex]*this->cv[1][tindex+1][1]); \
-                acceleration[tindex+2][1] += \
-                      - (this->kx[xindex]*this->cv[1][tindex+2][0] + \
-                         this->ky[yindex]*this->cv[1][tindex+1][0]); \
-            } \
-            ); \
-    if (this->cd->myrank == this->cd->rank[0]) \
-        std::fill_n((R*)(acceleration), 6, 0.0); \
-    this->force_divfree(acceleration); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_Lagrangian_acceleration(R *acceleration) \
-{ \
-    this->compute_Lagrangian_acceleration((FFTW(complex)*)acceleration); \
-    FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \
-    std::copy( \
-            this->rv[1], \
-            this->rv[1] + 2*this->cd->local_size, \
-            acceleration); \
-} \
- \
-template<> \
-int fluid_solver<R>::write_rpressure() \
-{ \
-    char fname[512]; \
-    FFTW(complex) *pressure; \
-    pressure = FFTW(alloc_complex)(this->cd->local_size/3); \
-    this->compute_velocity(this->cvorticity); \
-    this->ift_velocity(); \
-    this->compute_pressure(pressure); \
-    this->fill_up_filename("rpressure", fname); \
-    R *rpressure = FFTW(alloc_real)((this->cd->local_size/3)*2); \
-    FFTW(plan) c2r; \
-    c2r = FFTW(mpi_plan_dft_c2r_3d)( \
-            this->rd->sizes[0], this->rd->sizes[1], this->rd->sizes[2], \
-            pressure, rpressure, this->cd->comm, \
-            this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
-    FFTW(execute)(c2r); \
-    /* output goes here */ \
-    int ntmp[3]; \
-    ntmp[0] = this->rd->sizes[0]; \
-    ntmp[1] = this->rd->sizes[1]; \
-    ntmp[2] = this->rd->sizes[2]; \
-    field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \
-    clip_zero_padding<R>(scalar_descriptor, rpressure, 1); \
-    int return_value = scalar_descriptor->write(fname, rpressure); \
-    delete scalar_descriptor; \
-    FFTW(destroy_plan)(c2r); \
-    FFTW(free)(pressure); \
-    FFTW(free)(rpressure); \
-    return return_value; \
-} \
+template <class rnumber>
+fluid_solver<rnumber>::fluid_solver(
+        const char *NAME,
+        int nx,
+        int ny,
+        int nz,
+        double DKX,
+        double DKY,
+        double DKZ,
+        int DEALIAS_TYPE,
+        unsigned FFTW_PLAN_RIGOR) : fluid_solver_base<rnumber>(
+                                        NAME,
+                                        nx , ny , nz,
+                                        DKX, DKY, DKZ,
+                                        DEALIAS_TYPE,
+                                        FFTW_PLAN_RIGOR)
+{
+    TIMEZONE("fluid_solver::fluid_solver");
+    this->cvorticity = fftw_interface<rnumber>::alloc_complex(this->cd->local_size);
+    this->cvelocity  = fftw_interface<rnumber>::alloc_complex(this->cd->local_size);
+    this->rvorticity = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2);
+    /*this->rvelocity  = (rnumber*)(this->cvelocity);*/
+    this->rvelocity  = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2);
+
+    this->ru = this->rvelocity;
+    this->cu = this->cvelocity;
 
-/*****************************************************************************/
+    this->rv[0] = this->rvorticity;
+    this->rv[3] = this->rvorticity;
+    this->cv[0] = this->cvorticity;
+    this->cv[3] = this->cvorticity;
 
+    this->cv[1] = fftw_interface<rnumber>::alloc_complex(this->cd->local_size);
+    this->cv[2] = this->cv[1];
+    this->rv[1] = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2);
+    this->rv[2] = this->rv[1];
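+    /* note the aliasing: cv[0] and cv[3] both point at cvorticity, and
+       cv[2]/rv[2] share storage with cv[1]/rv[1]; the time stepper
+       relies on this to reuse buffers between stages */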
 
+    this->c2r_vorticity = new typename fftw_interface<rnumber>::plan;
+    this->r2c_vorticity = new typename fftw_interface<rnumber>::plan;
+    this->c2r_velocity  = new typename fftw_interface<rnumber>::plan;
+    this->r2c_velocity  = new typename fftw_interface<rnumber>::plan;
+
+    ptrdiff_t sizes[] = {nz,
+                         ny,
+                         nx};
+
+    *this->c2r_vorticity = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cvorticity, this->rvorticity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->r2c_vorticity = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rvorticity, this->cvorticity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    *this->c2r_velocity = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cvelocity, this->rvelocity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->r2c_velocity = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rvelocity, this->cvelocity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    this->uc2r = this->c2r_velocity;
+    this->ur2c = this->r2c_velocity;
+    this->vc2r[0] = this->c2r_vorticity;
+    this->vr2c[0] = this->r2c_vorticity;
+
+    this->vc2r[1] = new typename fftw_interface<rnumber>::plan;
+    this->vr2c[1] = new typename fftw_interface<rnumber>::plan;
+    this->vc2r[2] = new typename fftw_interface<rnumber>::plan;
+    this->vr2c[2] = new typename fftw_interface<rnumber>::plan;
+
+    *(this->vc2r[1]) = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cv[1], this->rv[1],
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->vc2r[2] = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cv[2], this->rv[2],
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->vr2c[1] = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rv[1], this->cv[1],
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    *this->vr2c[2] = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rv[2], this->cv[2],
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    /* ``physical'' parameters etc., initialized here just in case */
+
+    this->nu = 0.1;
+    this->fmode = 1;
+    this->famplitude = 1.0;
+    this->fk0  = 0;
+    this->fk1 = 3.0;
+    /* initialization of fields must be done AFTER planning */
+    std::fill_n((rnumber*)this->cvorticity, this->cd->local_size*2, 0.0);
+    std::fill_n((rnumber*)this->cvelocity, this->cd->local_size*2, 0.0);
+    std::fill_n(this->rvelocity, this->cd->local_size*2, 0.0);
+    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0);
+    std::fill_n((rnumber*)this->cv[1], this->cd->local_size*2, 0.0);
+    std::fill_n(this->rv[1], this->cd->local_size*2, 0.0);
+    std::fill_n(this->rv[2], this->cd->local_size*2, 0.0);
+}
+
+template <class rnumber>
+fluid_solver<rnumber>::~fluid_solver()
+{
+    fftw_interface<rnumber>::destroy_plan(*this->c2r_vorticity);
+    fftw_interface<rnumber>::destroy_plan(*this->r2c_vorticity);
+    fftw_interface<rnumber>::destroy_plan(*this->c2r_velocity );
+    fftw_interface<rnumber>::destroy_plan(*this->r2c_velocity );
+    fftw_interface<rnumber>::destroy_plan(*this->vc2r[1]);
+    fftw_interface<rnumber>::destroy_plan(*this->vr2c[1]);
+    fftw_interface<rnumber>::destroy_plan(*this->vc2r[2]);
+    fftw_interface<rnumber>::destroy_plan(*this->vr2c[2]);
+
+    delete this->c2r_vorticity;
+    delete this->r2c_vorticity;
+    delete this->c2r_velocity ;
+    delete this->r2c_velocity ;
+    delete this->vc2r[1];
+    delete this->vr2c[1];
+    delete this->vc2r[2];
+    delete this->vr2c[2];
+
+    fftw_interface<rnumber>::free(this->cv[1]);
+    fftw_interface<rnumber>::free(this->rv[1]);
+    fftw_interface<rnumber>::free(this->cvorticity);
+    fftw_interface<rnumber>::free(this->rvorticity);
+    fftw_interface<rnumber>::free(this->cvelocity);
+    fftw_interface<rnumber>::free(this->rvelocity);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_vorticity()
+{
+    TIMEZONE("fluid_solver::compute_vorticity");
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        // indexing by cindex (and tindex) is thread safe; each thread writes a disjoint range
+        ptrdiff_t tindex = 3*cindex;
+        if (k2 <= this->kM2)
+        {
+            this->cvorticity[tindex+0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]);
+            this->cvorticity[tindex+1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]);
+            this->cvorticity[tindex+2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]);
+            this->cvorticity[tindex+0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]);
+            this->cvorticity[tindex+1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]);
+            this->cvorticity[tindex+2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]);
+        }
+        else{
+            std::fill_n((rnumber*)(this->cvorticity+tindex), 6, 0.0);
+        }
+    }
+    );
+    this->symmetrize(this->cvorticity, 3);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_velocity(rnumber (*__restrict__ vorticity)[2])
+{
+    TIMEZONE("fluid_solver::compute_velocity");
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        // indexing by cindex (and tindex) is thread safe; each thread writes a disjoint range
+        ptrdiff_t tindex = 3*cindex;
+        if (k2 <= this->kM2 && k2 > 0)
+        {
+            this->cu[tindex+0][0] = -(this->ky[yindex]*vorticity[tindex+2][1] - this->kz[zindex]*vorticity[tindex+1][1]) / k2;
+            this->cu[tindex+1][0] = -(this->kz[zindex]*vorticity[tindex+0][1] - this->kx[xindex]*vorticity[tindex+2][1]) / k2;
+            this->cu[tindex+2][0] = -(this->kx[xindex]*vorticity[tindex+1][1] - this->ky[yindex]*vorticity[tindex+0][1]) / k2;
+            this->cu[tindex+0][1] =  (this->ky[yindex]*vorticity[tindex+2][0] - this->kz[zindex]*vorticity[tindex+1][0]) / k2;
+            this->cu[tindex+1][1] =  (this->kz[zindex]*vorticity[tindex+0][0] - this->kx[xindex]*vorticity[tindex+2][0]) / k2;
+            this->cu[tindex+2][1] =  (this->kx[xindex]*vorticity[tindex+1][0] - this->ky[yindex]*vorticity[tindex+0][0]) / k2;
+        }
+        else
+            std::fill_n((rnumber*)(this->cu+tindex), 6, 0.0);
+    }
+    );
+    /*this->symmetrize(this->cu, 3);*/
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::ift_velocity()
+{
+    TIMEZONE("fluid_solver::ift_velocity");
+    fftw_interface<rnumber>::execute(*(this->c2r_velocity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::ift_vorticity()
+{
+    TIMEZONE("fluid_solver::ift_vorticity");
+    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0);
+    fftw_interface<rnumber>::execute(*(this->c2r_vorticity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::dft_velocity()
+{
+    TIMEZONE("fluid_solver::dft_velocity");
+    fftw_interface<rnumber>::execute(*(this->r2c_velocity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::dft_vorticity()
+{
+    TIMEZONE("fluid_solver::dft_vorticity");
+    std::fill_n((rnumber*)this->cvorticity, this->cd->local_size*2, 0.0);
+    fftw_interface<rnumber>::execute(*(this->r2c_vorticity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::add_forcing(
+        rnumber (*__restrict__ acc_field)[2], rnumber (*__restrict__ vort_field)[2], rnumber factor)
+{
+    TIMEZONE("fluid_solver::add_forcing");
+    if (strcmp(this->forcing_type, "none") == 0)
+        return;
+    if (strcmp(this->forcing_type, "Kolmogorov") == 0)
+    {
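+        /* the forcing acts on the +/- fmode pair of the third field
+           component; both members of the pair are set so that the
+           corresponding real-space contribution stays real */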
+        ptrdiff_t cindex;
+        if (this->cd->myrank == this->cd->rank[this->fmode])
+        {
+            cindex = ((this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3;
+            acc_field[cindex+2][0] -= this->famplitude*factor/2;
+        }
+        if (this->cd->myrank == this->cd->rank[this->cd->sizes[0] - this->fmode])
+        {
+            cindex = ((this->cd->sizes[0] - this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3;
+            acc_field[cindex+2][0] -= this->famplitude*factor/2;
+        }
+        return;
+    }
+    if (strcmp(this->forcing_type, "linear") == 0)
+    {
+        CLOOP(
+                    this,
+                    [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+            // indexing by cindex (and cindex*3+c) is thread safe
+            double knorm = sqrt(this->kx[xindex]*this->kx[xindex] +
+                                this->ky[yindex]*this->ky[yindex] +
+                                this->kz[zindex]*this->kz[zindex]);
+            if ((this->fk0 <= knorm) && (this->fk1 >= knorm))
+                for (int c=0; c<3; c++)
+                    for (int i=0; i<2; i++)
+                        acc_field[cindex*3+c][i] += this->famplitude*vort_field[cindex*3+c][i]*factor;
+        }
+        );
+        return;
+    }
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::omega_nonlin(
+        int src)
+{
+    TIMEZONE("fluid_solver::omega_nonlin");
+    assert(src >= 0 && src < 3);
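+    /* build the nonlinear term curl(u x omega) in this->cu: recover the
+       velocity from cv[src], form the cross product in real space, then
+       take the curl back in Fourier space */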
+    this->compute_velocity(this->cv[src]);
+    /* get fields from Fourier space to real space */
+    {
+        TIMEZONE("fluid_solver::omega_nonlin::fftw");
+        fftw_interface<rnumber>::execute(*(this->c2r_velocity ));
+        fftw_interface<rnumber>::execute(*(this->vc2r[src]));
+    }
+    /* compute cross product $u \times \omega$, and normalize */
+    {
+        TIMEZONE("fluid_solver::omega_nonlin::RLOOP");
+        RLOOP (
+                    this,
+                    [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+            ptrdiff_t tindex = 3*rindex;
+            rnumber tmp[3][2];
+            for (int cc=0; cc<3; cc++)
+                tmp[cc][0] = (this->ru[tindex+(cc+1)%3]*this->rv[src][tindex+(cc+2)%3] -
+                        this->ru[tindex+(cc+2)%3]*this->rv[src][tindex+(cc+1)%3]);
+            // writes indexed by rindex are thread safe: the ranges do not overlap between threads
+            for (int cc=0; cc<3; cc++)
+                this->ru[(3*rindex)+cc] = tmp[cc][0] / this->normalization_factor;
+        }
+        );
+    }
+    /* go back to Fourier space */
+    this->clean_up_real_space(this->ru, 3);
+    {
+        TIMEZONE("fluid_solver::omega_nonlin::fftw-2");
+        fftw_interface<rnumber>::execute(*(this->r2c_velocity ));
+    }
+    this->dealias(this->cu, 3);
+    /* $\imath k \times Fourier(u \times \omega)$ */
+    {
+        TIMEZONE("fluid_solver::omega_nonlin::CLOOP");
+        CLOOP(
+                    this,
+                    [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+            rnumber tmp[3][2];
+            ptrdiff_t tindex = 3*cindex;
+            {
+                tmp[0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]);
+                tmp[1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]);
+                tmp[2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]);
+                tmp[0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]);
+                tmp[1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]);
+                tmp[2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]);
+            }
+            // writes go to tindex = 3*cindex, so the ranges do not overlap between threads
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    this->cu[tindex+cc][i] = tmp[cc][i];
+        }
+        );
+    }
+    {
+        TIMEZONE("fluid_solver::omega_nonlin::add_forcing");
+        this->add_forcing(this->cu, this->cv[src], 1.0);
+    }
+    {
+        TIMEZONE("fluid_solver::omega_nonlin::force_divfree");
+        this->force_divfree(this->cu);
+    }
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::step(double dt)
+{
+    TIMEZONE("fluid_solver::step");
+    std::fill_n((rnumber*)this->cv[1], this->cd->local_size*2, 0.0);
+    this->omega_nonlin(0);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){
+        if (k2 <= this->kM2)
+        {
+            double factor0 = exp(-this->nu * k2 * dt);
+            // cindex indexing is thread safe so there is no overlap between threads
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->cv[1][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i] +
+                    dt*this->cu[3*cindex+cc][i])*factor0;
+        }
+    }
+    );
+
+    this->omega_nonlin(1);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){
+        if (k2 <= this->kM2)
+        {
+            double factor0 = exp(-this->nu * k2 * dt/2);
+            double factor1 = exp( this->nu * k2 * dt/2);
+            // cindex indexing is thread safe so there is no overlap between threads
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->cv[2][3*cindex+cc][i] = (3*this->cv[0][3*cindex+cc][i]*factor0 +
+                    (this->cv[1][3*cindex+cc][i] +
+                    dt*this->cu[3*cindex+cc][i])*factor1)*0.25;
+        }
+    }
+    );
+
+    this->omega_nonlin(2);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){
+        if (k2 <= this->kM2)
+        {
+            double factor0 = exp(-this->nu * k2 * dt * 0.5);
+            // cindex indexing is thread safe so there is no overlap between threads
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->cv[3][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i]*factor0 +
+                    2*(this->cv[2][3*cindex+cc][i] +
+                    dt*this->cu[3*cindex+cc][i]))*factor0/3;
+        }
+    }
+    );
+
+    this->force_divfree(this->cvorticity);
+    this->symmetrize(this->cvorticity, 3);
+    this->iteration++;
+}
+
+template <class rnumber>
+int fluid_solver<rnumber>::read(char field, char representation)
+{
+    TIMEZONE("fluid_solver::read");
+    char fname[512];
+    int read_result;
+    if (field == 'v')
+    {
+        if (representation == 'c')
+        {
+            this->fill_up_filename("cvorticity", fname);
+            read_result = this->cd->read(fname, (void*)this->cvorticity);
+            if (read_result != EXIT_SUCCESS)
+                return read_result;
+        }
+        if (representation == 'r')
+        {
+            read_result = this->read_base("rvorticity", this->rvorticity);
+            if (read_result != EXIT_SUCCESS)
+                return read_result;
+            else
+                fftw_interface<rnumber>::execute(*(this->r2c_vorticity));
+        }
+        this->low_pass_Fourier(this->cvorticity, 3, this->kM);
+        this->force_divfree(this->cvorticity);
+        this->symmetrize(this->cvorticity, 3);
+        return EXIT_SUCCESS;
+    }
+    if ((field == 'u') && (representation == 'c'))
+    {
+        read_result = this->read_base("cvelocity", this->cvelocity);
+        this->low_pass_Fourier(this->cvelocity, 3, this->kM);
+        this->force_divfree(this->cvelocity);
+        this->symmetrize(this->cvelocity, 3);
+        return read_result;
+    }
+    if ((field == 'u') && (representation == 'r'))
+        return this->read_base("rvelocity", this->rvelocity);
+    return EXIT_FAILURE;
+}
+
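+/* Counterpart of read: write the requested field and representation to
+ * disk, computing the velocity from the vorticity (and transforming to
+ * real space) on demand. */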
+template <class rnumber>
+int fluid_solver<rnumber>::write(char field, char representation)
+{
+    TIMEZONE("fluid_solver::write");
+    char fname[512];
+    if ((field == 'v') && (representation == 'c'))
+    {
+        this->fill_up_filename("cvorticity", fname);
+        return this->cd->write(fname, (void*)this->cvorticity);
+    }
+    if ((field == 'v') && (representation == 'r'))
+    {
+        fftw_interface<rnumber>::execute(*(this->c2r_vorticity));
+        clip_zero_padding<rnumber>(this->rd, this->rvorticity, 3);
+        this->fill_up_filename("rvorticity", fname);
+        return this->rd->write(fname, this->rvorticity);
+    }
+    this->compute_velocity(this->cvorticity);
+    if ((field == 'u') && (representation == 'c'))
+    {
+        this->fill_up_filename("cvelocity", fname);
+        return this->cd->write(fname, this->cvelocity);
+    }
+    if ((field == 'u') && (representation == 'r'))
+    {
+        this->ift_velocity();
+        clip_zero_padding<rnumber>(this->rd, this->rvelocity, 3);
+        this->fill_up_filename("rvelocity", fname);
+        return this->rd->write(fname, this->rvelocity);
+    }
+    return EXIT_FAILURE;
+}
+
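+/* Compute tr(S^2) = S_ij S_ij, with S the strain-rate tensor, point by
+ * point in real space from the velocity gradient, and write the result
+ * to disk as a scalar field. */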
+template <class rnumber>
+int fluid_solver<rnumber>::write_rTrS2()
+{
+    TIMEZONE("fluid_solver::write_rTrS2");
+    char fname[512];
+    this->fill_up_filename("rTrS2", fname);
+    typename fftw_interface<rnumber>::complex *ca;
+    rnumber *ra;
+    ca = fftw_interface<rnumber>::alloc_complex(this->cd->local_size*3);
+    ra = (rnumber*)(ca);
+    this->compute_velocity(this->cvorticity);
+    this->compute_vector_gradient(ca, this->cvelocity);
+    for (int cc=0; cc<3; cc++)
+    {
+        std::copy(
+                    (rnumber*)(ca + cc*this->cd->local_size),
+                    (rnumber*)(ca + (cc+1)*this->cd->local_size),
+                    (rnumber*)this->cv[1]);
+        fftw_interface<rnumber>::execute(*(this->vc2r[1]));
+        std::copy(
+                    this->rv[1],
+                    this->rv[1] + this->cd->local_size*2,
+                    ra + cc*this->cd->local_size*2);
+    }
+    /* velocity gradient is now stored, in real space, in ra */
+    rnumber *dx_u, *dy_u, *dz_u;
+    dx_u = ra;
+    dy_u = ra + 2*this->cd->local_size;
+    dz_u = ra + 4*this->cd->local_size;
+    rnumber *trS2 = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2);
+    shared_array<double> average_local(1, [&](double* data){
+        data[0] = 0;
+    });
+
+    RLOOP(
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        rnumber AxxAxx;
+        rnumber AyyAyy;
+        rnumber AzzAzz;
+        rnumber Sxy;
+        rnumber Syz;
+        rnumber Szx;
+        ptrdiff_t tindex = 3*rindex;
+        AxxAxx = dx_u[tindex+0]*dx_u[tindex+0];
+        AyyAyy = dy_u[tindex+1]*dy_u[tindex+1];
+        AzzAzz = dz_u[tindex+2]*dz_u[tindex+2];
+        Sxy = dx_u[tindex+1]+dy_u[tindex+0];
+        Syz = dy_u[tindex+2]+dz_u[tindex+1];
+        Szx = dz_u[tindex+0]+dx_u[tindex+2];
+        // rindex indexing is thread safe: each thread writes to its own rindex, so there is no overlap
+        trS2[rindex] = (AxxAxx + AyyAyy + AzzAzz +
+                        (Sxy*Sxy + Syz*Syz + Szx*Szx)/2);
+        average_local.getMine()[0] += trS2[rindex];
+    }
+    );
+    average_local.mergeParallel();
+    double average;
+    MPI_Allreduce(
+                average_local.getMasterData(),
+                &average,
+                1,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
+    DEBUG_MSG("average TrS2 is %g\n", average);
+    fftw_interface<rnumber>::free(ca);
+    /* output goes here */
+    int ntmp[3];
+    ntmp[0] = this->rd->sizes[0];
+    ntmp[1] = this->rd->sizes[1];
+    ntmp[2] = this->rd->sizes[2];
+    field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm);
+    clip_zero_padding<rnumber>(scalar_descriptor, trS2, 1);
+    int return_value = scalar_descriptor->write(fname, trS2);
+    delete scalar_descriptor;
+    fftw_interface<rnumber>::free(trS2);
+    return return_value;
+}
+
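+/* Compute the pointwise enstrophy |omega|^2 / 2 in real space and write
+ * it to disk as a scalar field. */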
+template <class rnumber>
+int fluid_solver<rnumber>::write_renstrophy()
+{
+    TIMEZONE("fluid_solver::write_renstrophy");
+    char fname[512];
+    this->fill_up_filename("renstrophy", fname);
+    rnumber *enstrophy = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2);
+    this->ift_vorticity();
+    shared_array<double> average_local(1, [&](double* data){
+        data[0] = 0;
+    });
+
+    RLOOP(
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        ptrdiff_t tindex = 3*rindex;
+        // rindex indexing is thread safe so there is no overlap between threads
+        enstrophy[rindex] = (
+                    this->rvorticity[tindex+0]*this->rvorticity[tindex+0] +
+                    this->rvorticity[tindex+1]*this->rvorticity[tindex+1] +
+                    this->rvorticity[tindex+2]*this->rvorticity[tindex+2]
+                    )/2;
+        average_local.getMine()[0] += enstrophy[rindex];
+    }
+    );
+    average_local.mergeParallel();
+    double average;
+    MPI_Allreduce(
+                average_local.getMasterData(),
+                &average,
+                1,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
+    DEBUG_MSG("average enstrophy is %g\n", average);
+    /* output goes here */
+    int ntmp[3];
+    ntmp[0] = this->rd->sizes[0];
+    ntmp[1] = this->rd->sizes[1];
+    ntmp[2] = this->rd->sizes[2];
+    field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm);
+    clip_zero_padding<rnumber>(scalar_descriptor, enstrophy, 1);
+    int return_value = scalar_descriptor->write(fname, enstrophy);
+    delete scalar_descriptor;
+    fftw_interface<rnumber>::free(enstrophy);
+    return return_value;
+}
+
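+/* Solve the pressure Poisson equation in Fourier space,
+ *     p^(k) = -k_i k_j (u_i u_j)^(k) / k^2,
+ * accumulating first the diagonal products u_i u_i and then the
+ * off-diagonal ones (which enter twice by symmetry); the result is
+ * divided by k^2 and by the FFT normalization factor. */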
+template <class rnumber>
+void fluid_solver<rnumber>::compute_pressure(rnumber (*__restrict__ pressure)[2])
+{
+    TIMEZONE("fluid_solver::compute_pressure");
+    /* assume velocity is already in real space representation */
+    /* diagonal terms 11 22 33 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        // rindex indexing is thread safe so there is no overlap between threads
+        ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc];
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    {
+        TIMEZONE("fftw_interface<rnumber>::execute");
+        fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    }
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2 && k2 > 0)
+        {
+            // cindex indexing is thread safe so there is no overlap between threads
+            ptrdiff_t tindex = 3*cindex;
+            for (int i=0; i<2; i++)
+            {
+                pressure[cindex][i] = -(this->kx[xindex]*this->kx[xindex]*this->cv[1][tindex+0][i] +
+                        this->ky[yindex]*this->ky[yindex]*this->cv[1][tindex+1][i] +
+                        this->kz[zindex]*this->kz[zindex]*this->cv[1][tindex+2][i]);
+            }
+        }
+        else
+            std::fill_n((rnumber*)(pressure+cindex), 2, 0.0);
+    }
+    );
+    /* off-diagonal terms 12 23 31 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        // rindex indexing is thread safe so there is no overlap between threads
+        ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3];
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    {
+        TIMEZONE("fftw_interface<rnumber>::execute");
+        fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    }
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2 && k2 > 0)
+        {
+            // cindex indexing is thread safe so there is no overlap between threads
+            ptrdiff_t tindex = 3*cindex;
+            for (int i=0; i<2; i++)
+            {
+                pressure[cindex][i] -= 2*(this->kx[xindex]*this->ky[yindex]*this->cv[1][tindex+0][i] +
+                        this->ky[yindex]*this->kz[zindex]*this->cv[1][tindex+1][i] +
+                        this->kz[zindex]*this->kx[xindex]*this->cv[1][tindex+2][i]);
+                pressure[cindex][i] /= this->normalization_factor*k2;
+            }
+        }
+    }
+    );
+}
+
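+/* One-point statistics of the velocity gradient tensor A = grad u:
+ * moments and histograms of its components and of tr(S^2), plus the
+ * joint histogram of the invariants Q = -tr(A^2)/2 and R = -tr(A^3)/3. */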
+template <class rnumber>
+void fluid_solver<rnumber>::compute_gradient_statistics(
+        rnumber (*__restrict__ vec)[2],
+        double *gradu_moments,
+        double *trS2QR_moments,
+        ptrdiff_t *gradu_hist,
+        ptrdiff_t *trS2QR_hist,
+        ptrdiff_t *QR2D_hist,
+        double trS2QR_max_estimates[],
+        double gradu_max_estimates[],
+        int nbins,
+        int QR2D_nbins)
+{
+    TIMEZONE("fluid_solver::compute_gradient_statistics");
+    typename fftw_interface<rnumber>::complex *ca;
+    rnumber *ra;
+    ca = fftw_interface<rnumber>::alloc_complex(this->cd->local_size*3);
+    ra = (rnumber*)(ca);
+    this->compute_vector_gradient(ca, vec);
+    for (int cc=0; cc<3; cc++)
+    {
+        std::copy(
+                    (rnumber*)(ca + cc*this->cd->local_size),
+                    (rnumber*)(ca + (cc+1)*this->cd->local_size),
+                    (rnumber*)this->cv[1]);
+        fftw_interface<rnumber>::execute(*(this->vc2r[1]));
+        std::copy(
+                    this->rv[1],
+                    this->rv[1] + this->cd->local_size*2,
+                    ra + cc*this->cd->local_size*2);
+    }
+    /* velocity gradient is now stored, in real space, in ra */
+    std::fill_n(this->rv[1], 2*this->cd->local_size, 0.0);
+    rnumber *dx_u, *dy_u, *dz_u;
+    dx_u = ra;
+    dy_u = ra + 2*this->cd->local_size;
+    dz_u = ra + 4*this->cd->local_size;
+    double binsize[2];
+    double tmp_max_estimate[3];
+    tmp_max_estimate[0] = trS2QR_max_estimates[0];
+    tmp_max_estimate[1] = trS2QR_max_estimates[1];
+    tmp_max_estimate[2] = trS2QR_max_estimates[2];
+    binsize[0] = 2*tmp_max_estimate[2] / QR2D_nbins;
+    binsize[1] = 2*tmp_max_estimate[1] / QR2D_nbins;
+    ptrdiff_t *local_hist = new ptrdiff_t[QR2D_nbins*QR2D_nbins];
+    std::fill_n(local_hist, QR2D_nbins*QR2D_nbins, 0);
+    RLOOP(
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        rnumber AxxAxx;
+        rnumber AyyAyy;
+        rnumber AzzAzz;
+        rnumber AxyAyx;
+        rnumber AyzAzy;
+        rnumber AzxAxz;
+        rnumber Sxy;
+        rnumber Syz;
+        rnumber Szx;
+        // rindex indexing is thread safe so there is no overlap between threads;
+        // the three entries at tindex+0..2 are likewise private to this thread
+        ptrdiff_t tindex = 3*rindex;
+        AxxAxx = dx_u[tindex+0]*dx_u[tindex+0];
+        AyyAyy = dy_u[tindex+1]*dy_u[tindex+1];
+        AzzAzz = dz_u[tindex+2]*dz_u[tindex+2];
+        AxyAyx = dx_u[tindex+1]*dy_u[tindex+0];
+        AyzAzy = dy_u[tindex+2]*dz_u[tindex+1];
+        AzxAxz = dz_u[tindex+0]*dx_u[tindex+2];
+        this->rv[1][tindex+1] = - (AxxAxx + AyyAyy + AzzAzz)/2 - AxyAyx - AyzAzy - AzxAxz;
+        this->rv[1][tindex+2] = - (dx_u[tindex+0]*(AxxAxx/3 + AxyAyx + AzxAxz) +
+                dy_u[tindex+1]*(AyyAyy/3 + AxyAyx + AyzAzy) +
+                dz_u[tindex+2]*(AzzAzz/3 + AzxAxz + AyzAzy) +
+                dx_u[tindex+1]*dy_u[tindex+2]*dz_u[tindex+0] +
+                dx_u[tindex+2]*dy_u[tindex+0]*dz_u[tindex+1]);
+        int bin0 = int(floor((this->rv[1][tindex+2] + tmp_max_estimate[2]) / binsize[0]));
+        int bin1 = int(floor((this->rv[1][tindex+1] + tmp_max_estimate[1]) / binsize[1]));
+        if ((bin0 >= 0 && bin0 < QR2D_nbins) &&
+                (bin1 >= 0 && bin1 < QR2D_nbins))
+            local_hist[bin1*QR2D_nbins + bin0]++;
+        Sxy = dx_u[tindex+1]+dy_u[tindex+0];
+        Syz = dy_u[tindex+2]+dz_u[tindex+1];
+        Szx = dz_u[tindex+0]+dx_u[tindex+2];
+        this->rv[1][tindex] = (AxxAxx + AyyAyy + AzzAzz +
+                               (Sxy*Sxy + Syz*Syz + Szx*Szx)/2);
+    }
+    );
+    MPI_Allreduce(
+                local_hist,
+                QR2D_hist,
+                QR2D_nbins * QR2D_nbins,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
+    delete[] local_hist;
+    this->compute_rspace_stats3(
+                this->rv[1],
+                trS2QR_moments,
+                trS2QR_hist,
+                tmp_max_estimate,
+                nbins);
+    double *tmp_moments = new double[10*3];
+    ptrdiff_t *tmp_hist = new ptrdiff_t[nbins*3];
+    for (int cc=0; cc<3; cc++)
+    {
+        tmp_max_estimate[0] = gradu_max_estimates[cc*3 + 0];
+        tmp_max_estimate[1] = gradu_max_estimates[cc*3 + 1];
+        tmp_max_estimate[2] = gradu_max_estimates[cc*3 + 2];
+        this->compute_rspace_stats3(
+                    dx_u + cc*2*this->cd->local_size,
+                    tmp_moments,
+                    tmp_hist,
+                    tmp_max_estimate,
+                    nbins);
+        for (int n = 0; n < 10; n++)
+            for (int i = 0; i < 3 ; i++)
+            {
+                gradu_moments[(n*3 + cc)*3 + i] = tmp_moments[n*3 + i];
+            }
+        for (int n = 0; n < nbins; n++)
+            for (int i = 0; i < 3; i++)
+            {
+                gradu_hist[(n*3 + cc)*3 + i] = tmp_hist[n*3 + i];
+            }
+    }
+    delete[] tmp_moments;
+    delete[] tmp_hist;
+    fftw_interface<rnumber>::free(ca);
+}
+
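+/* Lagrangian (fluid element) acceleration in Fourier space,
+ *     a^ = -nu k^2 u^ + f^ - i k p^,
+ * i.e. viscous, forcing and pressure gradient contributions, with the
+ * pressure obtained from compute_pressure. */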
+template <class rnumber>
+void fluid_solver<rnumber>::compute_Lagrangian_acceleration(rnumber (*acceleration)[2])
+{
+    TIMEZONE("fluid_solver::compute_Lagrangian_acceleration");
+    typename fftw_interface<rnumber>::complex *pressure;
+    pressure = fftw_interface<rnumber>::alloc_complex(this->cd->local_size/3);
+    this->compute_velocity(this->cvorticity);
+    this->ift_velocity();
+    this->compute_pressure(pressure);
+    this->compute_velocity(this->cvorticity);
+    std::fill_n((rnumber*)this->cv[1], 2*this->cd->local_size, 0.0);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            // cindex indexing is thread safe so there is no overlap between threads
+            ptrdiff_t tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    this->cv[1][tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i];
+            if (strcmp(this->forcing_type, "linear") == 0)
+            {
+                double knorm = sqrt(k2);
+                if ((this->fk0 <= knorm) &&
+                        (this->fk1 >= knorm))
+                    for (int c=0; c<3; c++)
+                        for (int i=0; i<2; i++)
+                            this->cv[1][tindex+c][i] += this->famplitude*this->cu[tindex+c][i];
+            }
+            this->cv[1][tindex+0][0] += this->kx[xindex]*pressure[cindex][1];
+            this->cv[1][tindex+1][0] += this->ky[yindex]*pressure[cindex][1];
+            this->cv[1][tindex+2][0] += this->kz[zindex]*pressure[cindex][1];
+            this->cv[1][tindex+0][1] -= this->kx[xindex]*pressure[cindex][0];
+            this->cv[1][tindex+1][1] -= this->ky[yindex]*pressure[cindex][0];
+            this->cv[1][tindex+2][1] -= this->kz[zindex]*pressure[cindex][0];
+        }
+    }
+    );
+    std::copy(
+                (rnumber*)this->cv[1],
+                (rnumber*)(this->cv[1] + this->cd->local_size),
+                (rnumber*)acceleration);
+    fftw_interface<rnumber>::free(pressure);
+}
+
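+/* Eulerian acceleration (du/dt at fixed position): the viscous and
+ * forcing terms are assembled in Fourier space, the advection term is
+ * added as -i k_j (u_i u_j)^ (diagonal products first, off-diagonal
+ * ones after), and the final projection onto divergence-free fields
+ * accounts for the pressure gradient. */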
+template <class rnumber>
+void fluid_solver<rnumber>::compute_Eulerian_acceleration(rnumber (*__restrict__ acceleration)[2])
+{
+    TIMEZONE("fluid_solver::compute_Eulerian_acceleration");
+    std::fill_n((rnumber*)(acceleration), 2*this->cd->local_size, 0.0);
+    this->compute_velocity(this->cvorticity);
+    /* put in linear terms */
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/, double k2){
+        if (k2 <= this->kM2)
+        {
+            // cindex indexing is thread safe so there is no overlap between threads
+            ptrdiff_t tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    acceleration[tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i];
+            if (strcmp(this->forcing_type, "linear") == 0)
+            {
+                double knorm = sqrt(k2);
+                if ((this->fk0 <= knorm) &&
+                        (this->fk1 >= knorm))
+                {
+                    for (int c=0; c<3; c++)
+                        for (int i=0; i<2; i++)
+                            acceleration[tindex+c][i] += this->famplitude*this->cu[tindex+c][i];
+                }
+            }
+        }
+    }
+    );
+    this->ift_velocity();
+    /* compute uu */
+    /* 11 22 33 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        // rindex indexing is thread safe so there is no overlap between threads
+        ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc] / this->normalization_factor;
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            // cindex indexing is thread safe so there is no overlap between threads
+            ptrdiff_t tindex = 3*cindex;
+            acceleration[tindex+0][0] +=
+                    this->kx[xindex]*this->cv[1][tindex+0][1];
+            acceleration[tindex+0][1] +=
+                    -this->kx[xindex]*this->cv[1][tindex+0][0];
+            acceleration[tindex+1][0] +=
+                    this->ky[yindex]*this->cv[1][tindex+1][1];
+            acceleration[tindex+1][1] +=
+                    -this->ky[yindex]*this->cv[1][tindex+1][0];
+            acceleration[tindex+2][0] +=
+                    this->kz[zindex]*this->cv[1][tindex+2][1];
+            acceleration[tindex+2][1] +=
+                    -this->kz[zindex]*this->cv[1][tindex+2][0];
+        }
+    }
+    );
+    /* 12 23 31 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        // rindex indexing is thread safe so there is no overlap between threads
+        ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3] / this->normalization_factor;
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            // cindex indexing is thread safe so there is no overlap between threads
+            ptrdiff_t tindex = 3*cindex;
+            acceleration[tindex+0][0] +=
+                    (this->ky[yindex]*this->cv[1][tindex+0][1] +
+                    this->kz[zindex]*this->cv[1][tindex+2][1]);
+            acceleration[tindex+0][1] +=
+                    - (this->ky[yindex]*this->cv[1][tindex+0][0] +
+                    this->kz[zindex]*this->cv[1][tindex+2][0]);
+            acceleration[tindex+1][0] +=
+                    (this->kz[zindex]*this->cv[1][tindex+1][1] +
+                    this->kx[xindex]*this->cv[1][tindex+0][1]);
+            acceleration[tindex+1][1] +=
+                    - (this->kz[zindex]*this->cv[1][tindex+1][0] +
+                    this->kx[xindex]*this->cv[1][tindex+0][0]);
+            acceleration[tindex+2][0] +=
+                    (this->kx[xindex]*this->cv[1][tindex+2][1] +
+                    this->ky[yindex]*this->cv[1][tindex+1][1]);
+            acceleration[tindex+2][1] +=
+                    - (this->kx[xindex]*this->cv[1][tindex+2][0] +
+                    this->ky[yindex]*this->cv[1][tindex+1][0]);
+        }
+    }
+    );
+    if (this->cd->myrank == this->cd->rank[0])
+        std::fill_n((rnumber*)(acceleration), 6, 0.0);
+    this->force_divfree(acceleration);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_Lagrangian_acceleration(rnumber *__restrict__ acceleration)
+{
+    TIMEZONE("fluid_solver::compute_Lagrangian_acceleration");
+    this->compute_Lagrangian_acceleration((typename fftw_interface<rnumber>::complex*)acceleration);
+    fftw_interface<rnumber>::execute(*(this->vc2r[1]));
+    std::copy(
+                this->rv[1],
+                this->rv[1] + 2*this->cd->local_size,
+                acceleration);
+}
+
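+/* Compute the pressure, transform it to real space with a dedicated
+ * scalar c2r plan, and write it to disk. */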
+template <class rnumber>
+int fluid_solver<rnumber>::write_rpressure()
+{
+    TIMEZONE("fluid_solver::write_rpressure");
+    char fname[512];
+    typename fftw_interface<rnumber>::complex *pressure;
+    pressure = fftw_interface<rnumber>::alloc_complex(this->cd->local_size/3);
+    this->compute_velocity(this->cvorticity);
+    this->ift_velocity();
+    this->compute_pressure(pressure);
+    this->fill_up_filename("rpressure", fname);
+    rnumber *rpressure = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2);
+    typename fftw_interface<rnumber>::plan c2r;
+    c2r = fftw_interface<rnumber>::mpi_plan_dft_c2r_3d(
+                this->rd->sizes[0], this->rd->sizes[1], this->rd->sizes[2],
+            pressure, rpressure, this->cd->comm,
+            this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+    fftw_interface<rnumber>::execute(c2r);
+    /* output goes here */
+    int ntmp[3];
+    ntmp[0] = this->rd->sizes[0];
+    ntmp[1] = this->rd->sizes[1];
+    ntmp[2] = this->rd->sizes[2];
+    field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm);
+    clip_zero_padding<rnumber>(scalar_descriptor, rpressure, 1);
+    int return_value = scalar_descriptor->write(fname, rpressure);
+    delete scalar_descriptor;
+    fftw_interface<rnumber>::destroy_plan(c2r);
+    fftw_interface<rnumber>::free(pressure);
+    fftw_interface<rnumber>::free(rpressure);
+    return return_value;
+}
 
 /*****************************************************************************/
-/* now actually use the macro defined above                                  */
-FLUID_SOLVER_DEFINITIONS(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-FLUID_SOLVER_DEFINITIONS(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
-/*****************************************************************************/
+
 
 
 
diff --git a/bfps/cpp/fluid_solver.hpp b/bfps/cpp/fluid_solver.hpp
index 2b6ec64de12cc133687074c83c71696ffc507509..4cc75cee4385353f64dc9bc9e7d34c6efba9ad48 100644
--- a/bfps/cpp/fluid_solver.hpp
+++ b/bfps/cpp/fluid_solver.hpp
@@ -55,12 +55,12 @@ class fluid_solver:public fluid_solver_base<rnumber>
         typename fluid_solver_base<rnumber>::cnumber *cu, *cv[4];
 
         /* plans */
-        void *c2r_vorticity;
-        void *r2c_vorticity;
-        void *c2r_velocity;
-        void *r2c_velocity;
-        void *uc2r, *ur2c;
-        void *vr2c[3], *vc2r[3];
+        typename fftw_interface<rnumber>::plan *c2r_vorticity;
+        typename fftw_interface<rnumber>::plan *r2c_vorticity;
+        typename fftw_interface<rnumber>::plan *c2r_velocity;
+        typename fftw_interface<rnumber>::plan *r2c_velocity;
+        typename fftw_interface<rnumber>::plan *uc2r, *ur2c;
+        typename fftw_interface<rnumber>::plan *vr2c[3], *vc2r[3];
 
         /* physical parameters */
         double nu;
diff --git a/bfps/cpp/fluid_solver_base.cpp b/bfps/cpp/fluid_solver_base.cpp
index 2f2aeee9a8ae699b7863c90dcffb550bc905390a..1ac50f29c8c5d58a7efb064302055430901ab24a 100644
--- a/bfps/cpp/fluid_solver_base.cpp
+++ b/bfps/cpp/fluid_solver_base.cpp
@@ -32,7 +32,8 @@
 #include "base.hpp"
 #include "fluid_solver_base.hpp"
 #include "fftw_tools.hpp"
-
+#include "scope_timer.hpp"
+#include "shared_array.hpp"
 
 template <class rnumber>
 void fluid_solver_base<rnumber>::fill_up_filename(const char *base_name, char *destination)
@@ -43,6 +44,7 @@ void fluid_solver_base<rnumber>::fill_up_filename(const char *base_name, char *d
 template <class rnumber>
 void fluid_solver_base<rnumber>::clean_up_real_space(rnumber *a, int howmany)
 {
+    TIMEZONE("fluid_solver_base::clean_up_real_space");
     for (ptrdiff_t rindex = 0; rindex < this->cd->local_size*2; rindex += howmany*(this->rd->subsizes[2]+2))
         std::fill_n(a+rindex+this->rd->subsizes[2]*howmany, 2*howmany, 0.0);
 }
@@ -65,65 +67,76 @@ double fluid_solver_base<rnumber>::autocorrel(cnumber *a)
 template <class rnumber>
 void fluid_solver_base<rnumber>::cospectrum(cnumber *a, cnumber *b, double *spec)
 {
-    double *cospec_local = fftw_alloc_real(this->nshells*9);
-    std::fill_n(cospec_local, this->nshells*9, 0);
-    int tmp_int;
+    TIMEZONE("fluid_solver_base::cospectrum");
+    shared_array<double> cospec_local_thread(this->nshells*9,[&](double* cospec_local){
+        std::fill_n(cospec_local, this->nshells*9, 0);
+    });
+
     CLOOP_K2_NXMODES(
-            this,
-            if (k2 <= this->kMspec2)
-            {
-                tmp_int = int(sqrt(k2)/this->dk)*9;
-                for (int i=0; i<3; i++)
-                    for (int j=0; j<3; j++)
-                    {
-                        cospec_local[tmp_int+i*3+j] += nxmodes * (
-                        (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
-                        (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
-                    }
-            }
-            );
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/,
+                ptrdiff_t /*zindex*/, double k2, int nxmodes){
+        if (k2 <= this->kMspec2)
+        {
+            int tmp_int = int(sqrt(k2)/this->dk)*9;
+            double* cospec_local = cospec_local_thread.getMine();
+            for (int i=0; i<3; i++)
+                for (int j=0; j<3; j++)
+                {
+                    cospec_local[tmp_int+i*3+j] += nxmodes * (
+                                (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
+                            (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
+                }
+        }}
+    );
+    cospec_local_thread.mergeParallel();
     MPI_Allreduce(
-            (void*)cospec_local,
-            (void*)spec,
-            this->nshells*9,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
-    fftw_free(cospec_local);
+                cospec_local_thread.getMasterData(),
+                (void*)spec,
+                this->nshells*9,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
 }
 
 template <class rnumber>
 void fluid_solver_base<rnumber>::cospectrum(cnumber *a, cnumber *b, double *spec, const double k2exponent)
 {
-    double *cospec_local = fftw_alloc_real(this->nshells*9);
-    std::fill_n(cospec_local, this->nshells*9, 0);
-    double factor = 1;
-    int tmp_int;
+    TIMEZONE("fluid_solver_base::cospectrum2");
+    shared_array<double> cospec_local_thread(this->nshells*9,[&](double* cospec_local){
+        std::fill_n(cospec_local, this->nshells*9, 0);
+    });
+
     CLOOP_K2_NXMODES(
-            this,
-            if (k2 <= this->kMspec2)
-            {
-                factor = nxmodes*pow(k2, k2exponent);
-                tmp_int = int(sqrt(k2)/this->dk)*9;
-                for (int i=0; i<3; i++)
-                    for (int j=0; j<3; j++)
-                    {
-                        cospec_local[tmp_int+i*3+j] += factor * (
-                        (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
-                        (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
-                    }
-            }
-            );
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/,
+                ptrdiff_t /*zindex*/, double k2, int nxmodes){
+        if (k2 <= this->kMspec2)
+        {
+            double factor = nxmodes*pow(k2, k2exponent);
+            int tmp_int = int(sqrt(k2)/this->dk)*9;
+            double* cospec_local = cospec_local_thread.getMine();
+            for (int i=0; i<3; i++)
+                for (int j=0; j<3; j++)
+                {
+                    cospec_local[tmp_int+i*3+j] += factor * (
+                                (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
+                            (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
+                }
+        }}
+    );
+    cospec_local_thread.mergeParallel();
     MPI_Allreduce(
-            (void*)cospec_local,
-            (void*)spec,
-            this->nshells*9,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                cospec_local_thread.getMasterData(),
+                (void*)spec,
+                this->nshells*9,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     //for (int n=0; n<this->nshells; n++)
     //{
     //    spec[n] *= 12.5663706144*pow(this->kshell[n], 2) / this->nshell[n];
     //    /*is normalization needed?
     //     * spec[n] /= this->normalization_factor*/
     //}
-    fftw_free(cospec_local);
 }
 
 template <class rnumber>
@@ -134,6 +147,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
         const hsize_t toffset,
         const std::vector<double> max_estimate)
 {
+    TIMEZONE("fluid_solver_base::compute_rspace_stats");
     const int nmoments = 10;
     int nvals, nbins;
     if (this->rd->myrank == 0)
@@ -145,6 +159,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
         wspace = H5Dget_space(dset);
         ndims = H5Sget_simple_extent_dims(wspace, dims, NULL);
         assert(ndims == 3);
+        variable_used_only_in_assert(ndims);
         assert(dims[1] == nmoments);
         nvals = dims[2];
         H5Sclose(wspace);
@@ -161,22 +176,29 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
     MPI_Bcast(&nvals, 1, MPI_INT, 0, this->rd->comm);
     MPI_Bcast(&nbins, 1, MPI_INT, 0, this->rd->comm);
     assert(nvals == max_estimate.size());
-    double *moments = new double[nmoments*nvals];
-    double *local_moments = new double[nmoments*nvals];
-    double *val_tmp = new double[nvals];
+    shared_array<double> threaded_local_moments(nmoments*nvals, [&](double* local_moments){
+        std::fill_n(local_moments, nmoments*nvals, 0);
+        if (nvals == 4) local_moments[3] = max_estimate[3];
+    });
+
+    shared_array<double> threaded_val_tmp(nvals);
+
+    shared_array<ptrdiff_t> threaded_local_hist(nbins*nvals, [&](ptrdiff_t* local_hist){
+        std::fill_n(local_hist, nbins*nvals, 0);
+    });
+
+    // Not written by threads
     double *binsize = new double[nvals];
-    double *pow_tmp = new double[nvals];
-    ptrdiff_t *hist = new ptrdiff_t[nbins*nvals];
-    ptrdiff_t *local_hist = new ptrdiff_t[nbins*nvals];
-    int bin;
     for (int i=0; i<nvals; i++)
         binsize[i] = 2*max_estimate[i] / nbins;
-    std::fill_n(local_hist, nbins*nvals, 0);
-    std::fill_n(local_moments, nmoments*nvals, 0);
-    if (nvals == 4) local_moments[3] = max_estimate[3];
+
     RLOOP(
-        this,
-        std::fill_n(pow_tmp, nvals, 1.0);
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        double *val_tmp = threaded_val_tmp.getMine();
+        ptrdiff_t* local_hist = threaded_local_hist.getMine();
+        double *local_moments = threaded_local_moments.getMine();
+
         if (nvals == 4) val_tmp[3] = 0.0;
         for (int i=0; i<3; i++)
         {
@@ -190,7 +212,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
                 local_moments[0*nvals+3] = val_tmp[3];
             if (val_tmp[3] > local_moments[9*nvals+3])
                 local_moments[9*nvals+3] = val_tmp[3];
-            bin = int(floor(val_tmp[3]*2/binsize[3]));
+            int bin = int(floor(val_tmp[3]*2/binsize[3]));
             if (bin >= 0 && bin < nbins)
                 local_hist[bin*nvals+3]++;
         }
@@ -200,42 +222,63 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
                 local_moments[0*nvals+i] = val_tmp[i];
             if (val_tmp[i] > local_moments[(nmoments-1)*nvals+i])
                 local_moments[(nmoments-1)*nvals+i] = val_tmp[i];
-            bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i]));
+            int bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i]));
             if (bin >= 0 && bin < nbins)
                 local_hist[bin*nvals+i]++;
         }
-        for (int n=1; n < nmoments-1; n++)
-            for (int i=0; i<nvals; i++)
-                local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]);
-        );
+        for (int n=1; n < nmoments-1; n++){
+            double pow_tmp = 1.;
+            for (int i=0; i<nvals; i++){
+                local_moments[n*nvals + i] += (pow_tmp = val_tmp[i]*pow_tmp);
+            }
+        }
+    }
+    );
+
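+    /* Merge the per-thread buffers: histogram bins are summed, while the
+     * moment entries that hold minima (first row) and maxima (last row)
+     * must be combined with min/max instead of addition. */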
+    threaded_local_hist.mergeParallel();
+    threaded_local_moments.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double {
+        if (nvals == 4 && idx == 0*nvals+3)
+            return std::min(v1, v2);
+        if (nvals == 4 && idx == 9*nvals+3)
+            return std::max(v1, v2);
+        if (idx < 3)
+            return std::min(v1, v2);
+        if ((nmoments-1)*nvals <= idx && idx < (nmoments-1)*nvals+3)
+            return std::max(v1, v2);
+        return v1 + v2;
+    });
+
+    double *moments = new double[nmoments*nvals];
     MPI_Allreduce(
-            (void*)local_moments,
-            (void*)moments,
-            nvals,
-            MPI_DOUBLE, MPI_MIN, this->cd->comm);
+                threaded_local_moments.getMasterData(),
+                (void*)moments,
+                nvals,
+                MPI_DOUBLE, MPI_MIN, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + nvals),
-            (void*)(moments+nvals),
-            (nmoments-2)*nvals,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                (threaded_local_moments.getMasterData() + nvals),
+                (void*)(moments+nvals),
+                (nmoments-2)*nvals,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + (nmoments-1)*nvals),
-            (void*)(moments+(nmoments-1)*nvals),
-            nvals,
-            MPI_DOUBLE, MPI_MAX, this->cd->comm);
+                (threaded_local_moments.getMasterData() + (nmoments-1)*nvals),
+                (void*)(moments+(nmoments-1)*nvals),
+                nvals,
+                MPI_DOUBLE, MPI_MAX, this->cd->comm);
+    ptrdiff_t *hist = new ptrdiff_t[nbins*nvals];
     MPI_Allreduce(
-            (void*)local_hist,
-            (void*)hist,
-            nbins*nvals,
-            MPI_INT64_T, MPI_SUM, this->cd->comm);
+                threaded_local_hist.getMasterData(),
+                (void*)hist,
+                nbins*nvals,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
     for (int n=1; n < nmoments-1; n++)
         for (int i=0; i<nvals; i++)
             moments[n*nvals + i] /= this->normalization_factor;
-    delete[] local_moments;
-    delete[] local_hist;
-    delete[] val_tmp;
     delete[] binsize;
-    delete[] pow_tmp;
     if (this->rd->myrank == 0)
     {
         hid_t dset, wspace, mspace;
@@ -280,18 +323,28 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
         double max_estimate[],
         const int nbins)
 {
-    double *local_moments = fftw_alloc_real(10*nvals);
-    double val_tmp[nvals], binsize[nvals], pow_tmp[nvals];
-    ptrdiff_t *local_hist = new ptrdiff_t[nbins*nvals];
-    int bin;
+    TIMEZONE("fluid_solver_base::compute_rspace_stats");
+    shared_array<double> threaded_local_moments(10*nvals,[&](double* local_moments){
+        std::fill_n(local_moments, 10*nvals, 0);
+        if (nvals == 4) local_moments[3] = max_estimate[3];
+    });
+
+    shared_array<ptrdiff_t> threaded_local_hist(nbins*nvals, [&](ptrdiff_t* local_hist){
+        std::fill_n(local_hist, nbins*nvals, 0);
+    });
+
+    // Will not be modified by the threads
+    double binsize[nvals];
     for (int i=0; i<nvals; i++)
         binsize[i] = 2*max_estimate[i] / nbins;
-    std::fill_n(local_hist, nbins*nvals, 0);
-    std::fill_n(local_moments, 10*nvals, 0);
-    if (nvals == 4) local_moments[3] = max_estimate[3];
+
     RLOOP(
-        this,
-        std::fill_n(pow_tmp, nvals, 1.0);
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/, ptrdiff_t /*zindex*/){
+        ptrdiff_t *local_hist = threaded_local_hist.getMine();
+        double *local_moments = threaded_local_moments.getMine();
+
+        double val_tmp[nvals];
         if (nvals == 4) val_tmp[3] = 0.0;
         for (int i=0; i<3; i++)
         {
@@ -305,7 +358,7 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
                 local_moments[0*nvals+3] = val_tmp[3];
             if (val_tmp[3] > local_moments[9*nvals+3])
                 local_moments[9*nvals+3] = val_tmp[3];
-            bin = int(floor(val_tmp[3]*2/binsize[3]));
+            int bin = int(floor(val_tmp[3]*2/binsize[3]));
             if (bin >= 0 && bin < nbins)
                 local_hist[bin*nvals+3]++;
         }
@@ -315,44 +368,65 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
                 local_moments[0*nvals+i] = val_tmp[i];
             if (val_tmp[i] > local_moments[9*nvals+i])
                 local_moments[9*nvals+i] = val_tmp[i];
-            bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i]));
+            int bin = int(floor((val_tmp[i] + max_estimate[i]) / binsize[i]));
             if (bin >= 0 && bin < nbins)
                 local_hist[bin*nvals+i]++;
         }
-        for (int n=1; n<9; n++)
-            for (int i=0; i<nvals; i++)
-                local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]);
-        );
+        for (int n=1; n<9; n++){
+            double pow_tmp = 1;
+            for (int i=0; i<nvals; i++){
+                local_moments[n*nvals + i] += (pow_tmp = val_tmp[i]*pow_tmp);
+            }
+        }
+    }
+    );
+
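+    /* As above: min/max-merge the extremal moment rows, sum everything
+     * else, then sum the per-thread histograms. */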
+    threaded_local_moments.mergeParallel([&](const int idx, const double& v1, const double& v2) -> double {
+        if (nvals == 4 && idx == 0*nvals+3)
+            return std::min(v1, v2);
+        if (nvals == 4 && idx == 9*nvals+3)
+            return std::max(v1, v2);
+        if (idx < 3)
+            return std::min(v1, v2);
+        if (9*nvals <= idx && idx < 9*nvals+3)
+            return std::max(v1, v2);
+        return v1 + v2;
+    });
+    threaded_local_hist.mergeParallel();
+
     MPI_Allreduce(
-            (void*)local_moments,
-            (void*)moments,
-            nvals,
-            MPI_DOUBLE, MPI_MIN, this->cd->comm);
+                threaded_local_moments.getMasterData(),
+                (void*)moments,
+                nvals,
+                MPI_DOUBLE, MPI_MIN, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + nvals),
-            (void*)(moments+nvals),
-            8*nvals,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                (threaded_local_moments.getMasterData() + nvals),
+                (void*)(moments+nvals),
+                8*nvals,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + 9*nvals),
-            (void*)(moments+9*nvals),
-            nvals,
-            MPI_DOUBLE, MPI_MAX, this->cd->comm);
+                (threaded_local_moments.getMasterData() + 9*nvals),
+                (void*)(moments+9*nvals),
+                nvals,
+                MPI_DOUBLE, MPI_MAX, this->cd->comm);
     MPI_Allreduce(
-            (void*)local_hist,
-            (void*)hist,
-            nbins*nvals,
-            MPI_INT64_T, MPI_SUM, this->cd->comm);
+                (void*)threaded_local_hist.getMasterData(),
+                (void*)hist,
+                nbins*nvals,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
     for (int n=1; n<9; n++)
         for (int i=0; i<nvals; i++)
             moments[n*nvals + i] /= this->normalization_factor;
-    fftw_free(local_moments);
-    delete[] local_hist;
 }
 
 template <class rnumber>
 void fluid_solver_base<rnumber>::write_spectrum(const char *fname, cnumber *a, const double k2exponent)
 {
+    TIMEZONE("fluid_solver_base::write_spectrum");
     double *spec = fftw_alloc_real(this->nshells);
     this->cospectrum(a, a, spec, k2exponent);
     if (this->cd->myrank == 0)
@@ -371,362 +445,383 @@ void fluid_solver_base<rnumber>::write_spectrum(const char *fname, cnumber *a, c
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
-#define FLUID_SOLVER_BASE_DEFINITIONS(FFTW, R, MPI_RNUM, MPI_CNUM) \
- \
-template<> \
-fluid_solver_base<R>::fluid_solver_base( \
-        const char *NAME, \
-        int nx, \
-        int ny, \
-        int nz, \
-        double DKX, \
-        double DKY, \
-        double DKZ, \
-        int DEALIAS_TYPE, \
-        unsigned FFTW_PLAN_RIGOR) \
-{ \
-    strncpy(this->name, NAME, 256); \
-    this->name[255] = '\0'; \
-    this->iteration = 0; \
-    this->fftw_plan_rigor = FFTW_PLAN_RIGOR; \
- \
-    int ntmp[4]; \
-    ntmp[0] = nz; \
-    ntmp[1] = ny; \
-    ntmp[2] = nx; \
-    ntmp[3] = 3; \
-    this->rd = new field_descriptor<R>( \
-            4, ntmp, MPI_RNUM, MPI_COMM_WORLD);\
-    this->normalization_factor = (this->rd->full_size/3); \
-    ntmp[0] = ny; \
-    ntmp[1] = nz; \
-    ntmp[2] = nx/2 + 1; \
-    ntmp[3] = 3; \
-    this->cd = new field_descriptor<R>( \
-            4, ntmp, MPI_CNUM, this->rd->comm);\
- \
-    this->dkx = DKX; \
-    this->dky = DKY; \
-    this->dkz = DKZ; \
-    this->kx = new double[this->cd->sizes[2]]; \
-    this->ky = new double[this->cd->subsizes[0]]; \
-    this->kz = new double[this->cd->sizes[1]]; \
-    this->dealias_type = DEALIAS_TYPE; \
-    switch(this->dealias_type) \
-    { \
-        /* HL07 smooth filter */ \
-        case 1: \
-            this->kMx = this->dkx*(int(this->rd->sizes[2] / 2)-1); \
-            this->kMy = this->dky*(int(this->rd->sizes[1] / 2)-1); \
-            this->kMz = this->dkz*(int(this->rd->sizes[0] / 2)-1); \
-            break; \
-        default: \
-            this->kMx = this->dkx*(int(this->rd->sizes[2] / 3)-1); \
-            this->kMy = this->dky*(int(this->rd->sizes[1] / 3)-1); \
-            this->kMz = this->dkz*(int(this->rd->sizes[0] / 3)-1); \
-    } \
-    int i, ii; \
-    for (i = 0; i<this->cd->sizes[2]; i++) \
-        this->kx[i] = i*this->dkx; \
-    for (i = 0; i<this->cd->subsizes[0]; i++) \
-    { \
-        ii = i + this->cd->starts[0]; \
-        if (ii <= this->rd->sizes[1]/2) \
-            this->ky[i] = this->dky*ii; \
-        else \
-            this->ky[i] = this->dky*(ii - this->rd->sizes[1]); \
-    } \
-    for (i = 0; i<this->cd->sizes[1]; i++) \
-    { \
-        if (i <= this->rd->sizes[0]/2) \
-            this->kz[i] = this->dkz*i; \
-        else \
-            this->kz[i] = this->dkz*(i - this->rd->sizes[0]); \
-    } \
-    this->kM = this->kMx; \
-    if (this->kM < this->kMy) this->kM = this->kMy; \
-    if (this->kM < this->kMz) this->kM = this->kMz; \
-    this->kM2 = this->kM * this->kM; \
-    this->kMspec = this->kM; \
-    this->kMspec2 = this->kM2; \
-    this->dk = this->dkx; \
-    if (this->dk > this->dky) this->dk = this->dky; \
-    if (this->dk > this->dkz) this->dk = this->dkz; \
-    this->dk2 = this->dk*this->dk; \
-    DEBUG_MSG( \
-            "kM = %g, kM2 = %g, dk = %g, dk2 = %g\n", \
-            this->kM, this->kM2, this->dk, this->dk2); \
-    /* spectra stuff */ \
-    this->nshells = int(this->kMspec / this->dk) + 2; \
-    DEBUG_MSG( \
-            "kMspec = %g, kMspec2 = %g, nshells = %ld\n", \
-            this->kMspec, this->kMspec2, this->nshells); \
-    this->kshell = new double[this->nshells]; \
-    std::fill_n(this->kshell, this->nshells, 0.0); \
-    this->nshell = new int64_t[this->nshells]; \
-    std::fill_n(this->nshell, this->nshells, 0); \
-    double *kshell_local = new double[this->nshells]; \
-    std::fill_n(kshell_local, this->nshells, 0.0); \
-    int64_t *nshell_local = new int64_t[this->nshells]; \
-    std::fill_n(nshell_local, this->nshells, 0.0); \
-    double knorm; \
-    CLOOP_K2_NXMODES( \
-            this, \
-            if (k2 < this->kM2) \
-            { \
-                knorm = sqrt(k2); \
-                nshell_local[int(knorm/this->dk)] += nxmodes; \
-                kshell_local[int(knorm/this->dk)] += nxmodes*knorm; \
-            } \
-            this->Fourier_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.)); \
-            ); \
-    \
-    MPI_Allreduce( \
-            (void*)(nshell_local), \
-            (void*)(this->nshell), \
-            this->nshells, \
-            MPI_INT64_T, MPI_SUM, this->cd->comm); \
-    MPI_Allreduce( \
-            (void*)(kshell_local), \
-            (void*)(this->kshell), \
-            this->nshells, \
-            MPI_DOUBLE, MPI_SUM, this->cd->comm); \
-    for (unsigned int n=0; n<this->nshells; n++) \
-    { \
-        this->kshell[n] /= this->nshell[n]; \
-    } \
-    delete[] nshell_local; \
-    delete[] kshell_local; \
-} \
- \
-template<> \
-fluid_solver_base<R>::~fluid_solver_base() \
-{ \
-    delete[] this->kshell; \
-    delete[] this->nshell; \
- \
-    delete[] this->kx; \
-    delete[] this->ky; \
-    delete[] this->kz; \
- \
-    delete this->cd; \
-    delete this->rd; \
-} \
- \
-template<> \
-void fluid_solver_base<R>::low_pass_Fourier(FFTW(complex) *a, const int howmany, const double kmax) \
-{ \
-    const double km2 = kmax*kmax; \
-    const int howmany2 = 2*howmany; \
-    /*DEBUG_MSG("entered low_pass_Fourier, kmax=%lg km2=%lg howmany2=%d\n", kmax, km2, howmany2);*/ \
-    CLOOP_K2( \
-            this, \
-            /*DEBUG_MSG("kx=%lg ky=%lg kz=%lg k2=%lg\n", \
-                      this->kx[xindex], \
-                      this->ky[yindex], \
-                      this->kz[zindex], \
-                      k2);*/ \
-            if (k2 >= km2) \
-                std::fill_n((R*)(a + howmany*cindex), howmany2, 0.0); \
-            );\
-} \
- \
-template<> \
-void fluid_solver_base<R>::dealias(FFTW(complex) *a, const int howmany) \
-{ \
-    if (this->dealias_type == 0) \
-        { \
-            this->low_pass_Fourier(a, howmany, this->kM); \
-            return; \
-        } \
-    double tval; \
-    CLOOP_K2( \
-            this, \
-            tval = this->Fourier_filter[int(round(k2/this->dk2))]; \
-            for (int tcounter = 0; tcounter < howmany; tcounter++) \
-            for (int i=0; i<2; i++) \
-                a[howmany*cindex+tcounter][i] *= tval; \
-         ); \
-} \
- \
-template<> \
-void fluid_solver_base<R>::force_divfree(FFTW(complex) *a) \
-{ \
-    FFTW(complex) tval; \
-    CLOOP_K2( \
-            this, \
-            if (k2 > 0) \
-            { \
-                tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) + \
-                           this->ky[yindex]*((*(a + cindex*3+1))[0]) + \
-                           this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2; \
-                tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) + \
-                           this->ky[yindex]*((*(a + cindex*3+1))[1]) + \
-                           this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2; \
-                for (int imag_part=0; imag_part<2; imag_part++) \
-                { \
-                    a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex]; \
-                    a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex]; \
-                    a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex]; \
-                } \
-            } \
-            );\
-    if (this->cd->myrank == this->cd->rank[0]) \
-        std::fill_n((R*)(a), 6, 0.0); \
-} \
- \
-template<> \
-void fluid_solver_base<R>::compute_vector_gradient(FFTW(complex) *A, FFTW(complex) *cvec) \
-{ \
-    ptrdiff_t tindex; \
-    std::fill_n((R*)A, 3*2*this->cd->local_size, 0.0); \
-    FFTW(complex) *dx_u, *dy_u, *dz_u; \
-    dx_u = A; \
-    dy_u = A + this->cd->local_size; \
-    dz_u = A + 2*this->cd->local_size; \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                for (int cc=0; cc<3; cc++) \
-                { \
-                    dx_u[tindex + cc][0] = -this->kx[xindex]*cvec[tindex+cc][1]; \
-                    dx_u[tindex + cc][1] =  this->kx[xindex]*cvec[tindex+cc][0]; \
-                    dy_u[tindex + cc][0] = -this->ky[yindex]*cvec[tindex+cc][1]; \
-                    dy_u[tindex + cc][1] =  this->ky[yindex]*cvec[tindex+cc][0]; \
-                    dz_u[tindex + cc][0] = -this->kz[zindex]*cvec[tindex+cc][1]; \
-                    dz_u[tindex + cc][1] =  this->kz[zindex]*cvec[tindex+cc][0]; \
-                } \
-            } \
-            ); \
-} \
- \
-template<> \
-void fluid_solver_base<R>::symmetrize(FFTW(complex) *data, const int howmany) \
-{ \
-    ptrdiff_t ii, cc; \
-    MPI_Status *mpistatus = new MPI_Status; \
-    if (this->cd->myrank == this->cd->rank[0]) \
-    { \
-        for (cc = 0; cc < howmany; cc++) \
-            data[cc][1] = 0.0; \
-        for (ii = 1; ii < this->cd->sizes[1]/2; ii++) \
-            for (cc = 0; cc < howmany; cc++) { \
-                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[0] = \
-                 (*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[0]; \
-                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[1] = \
-                -(*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[1]; \
-                } \
-    } \
-    FFTW(complex) *buffer; \
-    buffer = FFTW(alloc_complex)(howmany*this->cd->sizes[1]); \
-    ptrdiff_t yy; \
-    /*ptrdiff_t tindex;*/ \
-    int ranksrc, rankdst; \
-    for (yy = 1; yy < this->cd->sizes[0]/2; yy++) { \
-        ranksrc = this->cd->rank[yy]; \
-        rankdst = this->cd->rank[this->cd->sizes[0] - yy]; \
-        if (this->cd->myrank == ranksrc) \
-            for (ii = 0; ii < this->cd->sizes[1]; ii++) \
-                for (cc = 0; cc < howmany; cc++) \
-                    for (int imag_comp=0; imag_comp<2; imag_comp++) \
-                    (*(buffer + howmany*ii+cc))[imag_comp] = \
-                        (*(data + howmany*((yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[imag_comp]; \
-        if (ranksrc != rankdst) \
-        { \
-            if (this->cd->myrank == ranksrc) \
-                MPI_Send((void*)buffer, \
-                         howmany*this->cd->sizes[1], MPI_CNUM, rankdst, yy, \
-                         this->cd->comm); \
-            if (this->cd->myrank == rankdst) \
-                MPI_Recv((void*)buffer, \
-                         howmany*this->cd->sizes[1], MPI_CNUM, ranksrc, yy, \
-                         this->cd->comm, mpistatus); \
-        } \
-        if (this->cd->myrank == rankdst) \
-        { \
-            for (ii = 1; ii < this->cd->sizes[1]; ii++) \
-                for (cc = 0; cc < howmany; cc++) \
-                { \
-                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[0] = \
-                        (*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[0]; \
-                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[1] = \
-                       -(*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[1]; \
-                } \
-            for (cc = 0; cc < howmany; cc++) \
-            { \
-                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[0] =  (*(buffer + cc))[0]; \
-                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[1] = -(*(buffer + cc))[1]; \
-            } \
-        } \
-    } \
-    FFTW(free)(buffer); \
-    delete mpistatus; \
-    /* put asymmetric data to 0 */\
-    /*if (this->cd->myrank == this->cd->rank[this->cd->sizes[0]/2]) \
-    { \
-        tindex = howmany*(this->cd->sizes[0]/2 - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2]; \
-        for (ii = 0; ii < this->cd->sizes[1]; ii++) \
-        { \
-            std::fill_n((R*)(data + tindex), howmany*2*this->cd->sizes[2], 0.0); \
-            tindex += howmany*this->cd->sizes[2]; \
-        } \
-    } \
-    tindex = howmany*(); \
-    std::fill_n((R*)(data + tindex), howmany*2, 0.0);*/ \
-} \
- \
-template<> \
-int fluid_solver_base<R>::read_base(const char *fname, R *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->rd->read(full_name, (void*)data); \
-} \
- \
-template<> \
-int fluid_solver_base<R>::read_base(const char *fname, FFTW(complex) *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->cd->read(full_name, (void*)data); \
-} \
- \
-template<> \
-int fluid_solver_base<R>::write_base(const char *fname, R *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->rd->write(full_name, (void*)data); \
-} \
- \
-template<> \
-int fluid_solver_base<R>::write_base(const char *fname, FFTW(complex) *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->cd->write(full_name, (void*)data); \
-} \
- \
-/* finally, force generation of code                                         */ \
-template class fluid_solver_base<R>; \
+template <class rnumber>
+fluid_solver_base<rnumber>::fluid_solver_base(
+        const char *NAME,
+        int nx,
+        int ny,
+        int nz,
+        double DKX,
+        double DKY,
+        double DKZ,
+        int DEALIAS_TYPE,
+        unsigned FFTW_PLAN_RIGOR)
+{
+    TIMEZONE("fluid_solver_base::fluid_solver_base");
+    strncpy(this->name, NAME, 256);
+    this->name[255] = '\0';
+    this->iteration = 0;
+    this->fftw_plan_rigor = FFTW_PLAN_RIGOR;
 
-/*****************************************************************************/
+    int ntmp[4];
+    ntmp[0] = nz;
+    ntmp[1] = ny;
+    ntmp[2] = nx;
+    ntmp[3] = 3;
+    this->rd = new field_descriptor<rnumber>(
+                4, ntmp, mpi_real_type<rnumber>::real(), MPI_COMM_WORLD);
+    this->normalization_factor = (this->rd->full_size/3);
+    ntmp[0] = ny;
+    ntmp[1] = nz;
+    ntmp[2] = nx/2 + 1;
+    ntmp[3] = 3;
+    this->cd = new field_descriptor<rnumber>(
+                4, ntmp, mpi_real_type<rnumber>::complex(), this->rd->comm);
 
+    this->dkx = DKX;
+    this->dky = DKY;
+    this->dkz = DKZ;
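+    /* wavenumber arrays; in the FFTW transposed layout the y direction is
+     * distributed over MPI processes, so ky only holds the local modes */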
+    this->kx = new double[this->cd->sizes[2]];
+    this->ky = new double[this->cd->subsizes[0]];
+    this->kz = new double[this->cd->sizes[1]];
+    this->dealias_type = DEALIAS_TYPE;
+    switch(this->dealias_type)
+    {
+    /* HL07 smooth filter */
+    case 1:
+        this->kMx = this->dkx*(int(this->rd->sizes[2] / 2)-1);
+        this->kMy = this->dky*(int(this->rd->sizes[1] / 2)-1);
+        this->kMz = this->dkz*(int(this->rd->sizes[0] / 2)-1);
+        break;
+    default:
+        this->kMx = this->dkx*(int(this->rd->sizes[2] / 3)-1);
+        this->kMy = this->dky*(int(this->rd->sizes[1] / 3)-1);
+        this->kMz = this->dkz*(int(this->rd->sizes[0] / 3)-1);
+    }
+    int i, ii;
+    for (i = 0; i<this->cd->sizes[2]; i++)
+        this->kx[i] = i*this->dkx;
+    for (i = 0; i<this->cd->subsizes[0]; i++)
+    {
+        ii = i + this->cd->starts[0];
+        if (ii <= this->rd->sizes[1]/2)
+            this->ky[i] = this->dky*ii;
+        else
+            this->ky[i] = this->dky*(ii - this->rd->sizes[1]);
+    }
+    for (i = 0; i<this->cd->sizes[1]; i++)
+    {
+        if (i <= this->rd->sizes[0]/2)
+            this->kz[i] = this->dkz*i;
+        else
+            this->kz[i] = this->dkz*(i - this->rd->sizes[0]);
+    }
+    this->kM = this->kMx;
+    if (this->kM < this->kMy) this->kM = this->kMy;
+    if (this->kM < this->kMz) this->kM = this->kMz;
+    this->kM2 = this->kM * this->kM;
+    this->kMspec = this->kM;
+    this->kMspec2 = this->kM2;
+    this->dk = this->dkx;
+    if (this->dk > this->dky) this->dk = this->dky;
+    if (this->dk > this->dkz) this->dk = this->dkz;
+    this->dk2 = this->dk*this->dk;
+    DEBUG_MSG(
+                "kM = %g, kM2 = %g, dk = %g, dk2 = %g\n",
+                this->kM, this->kM2, this->dk, this->dk2);
+    /* spectra stuff */
+    this->nshells = int(this->kMspec / this->dk) + 2;
+    DEBUG_MSG(
+                "kMspec = %g, kMspec2 = %g, nshells = %ld\n",
+                this->kMspec, this->kMspec2, this->nshells);
+    this->kshell = new double[this->nshells];
+    std::fill_n(this->kshell, this->nshells, 0.0);
+    this->nshell = new int64_t[this->nshells];
+    std::fill_n(this->nshell, this->nshells, 0);
 
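+    /* per-thread accumulators for the shell statistics: each thread writes
+     * to its own copy via getMine(), and the copies are merged below */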
+    shared_array<double> kshell_local_threaded(this->nshells,[&](double* kshell_local){
+        std::fill_n(kshell_local, this->nshells, 0.0);
+    });
+    shared_array<int64_t> nshell_local_threaded(this->nshells,[&](int64_t* nshell_local){
+        std::fill_n(nshell_local, this->nshells, 0);
+    });
+
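+    /* tabulate the smooth dealiasing filter of Hou & Li (2007),
+     * exp(-36 (k/kM)^36), in per-thread maps that are merged below */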
+    std::vector<std::unordered_map<int, double>> Fourier_filter_threaded(omp_get_max_threads());
+
+    CLOOP_K2_NXMODES(
+                this,
+
+                [&](ptrdiff_t /*cindex*/, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/,
+                ptrdiff_t /*zindex*/, double k2, int nxmodes){
+        if (k2 < this->kM2)
+        {
+            double knorm = sqrt(k2);
+            nshell_local_threaded.getMine()[int(knorm/this->dk)] += nxmodes;
+            kshell_local_threaded.getMine()[int(knorm/this->dk)] += nxmodes*knorm;
+        }
+        Fourier_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));}
+    );
+
+    // Merge results
+    nshell_local_threaded.mergeParallel();
+    kshell_local_threaded.mergeParallel();
+    for(int idxMerge = 0 ; idxMerge < int(Fourier_filter_threaded.size()) ; ++idxMerge){
+        for(const auto& kv : Fourier_filter_threaded[idxMerge]){
+            this->Fourier_filter[kv.first] = kv.second;
+        }
+    }
+
+    MPI_Allreduce(
+                (void*)(nshell_local_threaded.getMasterData()),
+                (void*)(this->nshell),
+                this->nshells,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
+    MPI_Allreduce(
+                (void*)(kshell_local_threaded.getMasterData()),
+                (void*)(this->kshell),
+                this->nshells,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
+    for (unsigned int n=0; n<this->nshells; n++)
+    {
+        if (this->nshell[n] != 0)
+            this->kshell[n] /= this->nshell[n];
+    }
+}
+
+template <class rnumber>
+fluid_solver_base<rnumber>::~fluid_solver_base()
+{
+    delete[] this->kshell;
+    delete[] this->nshell;
+
+    delete[] this->kx;
+    delete[] this->ky;
+    delete[] this->kz;
+
+    delete this->cd;
+    delete this->rd;
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::low_pass_Fourier(cnumber *a, const int howmany, const double kmax)
+{
+    TIMEZONE("fluid_solver_base::low_pass_Fourier");
+    const double km2 = kmax*kmax;
+    const int howmany2 = 2*howmany;
+    /*DEBUG_MSG("entered low_pass_Fourier, kmax=%lg km2=%lg howmany2=%d\n", kmax, km2, howmany2);*/
+    CLOOP_K2(
+                this,
+                /*DEBUG_MSG("kx=%lg ky=%lg kz=%lg k2=%lg\n",
+                                  this->kx[xindex],
+                                  this->ky[yindex],
+                                  this->kz[zindex],
+                                  k2);*/
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){
+        if (k2 >= km2)
+            std::fill_n((rnumber*)(a + howmany*cindex), howmany2, 0.0);}
+    );
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::dealias(cnumber *a, const int howmany)
+{
+    TIMEZONE("fluid_solver_base::dealias");
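+    /* dealias_type 0 is the sharp two-thirds truncation; any other value
+     * applies the precomputed smooth (HL07) filter below */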
+    if (this->dealias_type == 0)
+    {
+        this->low_pass_Fourier(a, howmany, this->kM);
+        return;
+    }
+
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t /*xindex*/, ptrdiff_t /*yindex*/,
+                ptrdiff_t /*zindex*/, double k2){
+        double tval = this->Fourier_filter[int(round(k2/this->dk2))];
+        // It is thread safe on the index cindex
+        for (int tcounter = 0; tcounter < howmany; tcounter++)
+            for (int i=0; i<2; i++)
+                a[howmany*cindex+tcounter][i] *= tval;
+    }
+    );
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::force_divfree(cnumber *a)
+{
+    TIMEZONE("fluid_solver_base::force_divfree");
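+    /* project out the compressive part: a -= k (k.a) / k^2 */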
+    CLOOP_K2(
+                this,
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){
+        if (k2 > 0)
+        {
+            // It is thread safe on index cindex
+            cnumber tval;
+            tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) +
+                    this->ky[yindex]*((*(a + cindex*3+1))[0]) +
+                    this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2;
+            tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) +
+                    this->ky[yindex]*((*(a + cindex*3+1))[1]) +
+                    this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2;
+            for (int imag_part=0; imag_part<2; imag_part++)
+            {
+                a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex];
+                a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex];
+                a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex];
+            }
+        }}
+    );
+    if (this->cd->myrank == this->cd->rank[0])
+        std::fill_n((rnumber*)(a), 6, 0.0);
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::compute_vector_gradient(cnumber *A, cnumber *cvec)
+{
+    TIMEZONE("fluid_solver_base::compute_vector_gradient");
+    std::fill_n((rnumber*)A, 3*2*this->cd->local_size, 0.0);
+    cnumber *dx_u, *dy_u, *dz_u;
+    dx_u = A;
+    dy_u = A + this->cd->local_size;
+    dz_u = A + 2*this->cd->local_size;
+    CLOOP_K2(
+                this,
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            // It is thread safe on cindex
+            ptrdiff_t tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+            {
+                dx_u[tindex + cc][0] = -this->kx[xindex]*cvec[tindex+cc][1];
+                dx_u[tindex + cc][1] =  this->kx[xindex]*cvec[tindex+cc][0];
+                dy_u[tindex + cc][0] = -this->ky[yindex]*cvec[tindex+cc][1];
+                dy_u[tindex + cc][1] =  this->ky[yindex]*cvec[tindex+cc][0];
+                dz_u[tindex + cc][0] = -this->kz[zindex]*cvec[tindex+cc][1];
+                dz_u[tindex + cc][1] =  this->kz[zindex]*cvec[tindex+cc][0];
+            }
+        }}
+    );
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::symmetrize(cnumber *data, const int howmany)
+{
+    TIMEZONE("fluid_solver_base::symmetrize");
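+    /* enforce Hermitian symmetry on the kx = 0 plane, so that the inverse
+     * transform yields a real field; planes owned by other MPI ranks are
+     * exchanged explicitly below */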
+    ptrdiff_t ii, cc;
+    MPI_Status *mpistatus = new MPI_Status;
+    if (this->cd->myrank == this->cd->rank[0])
+    {
+        for (cc = 0; cc < howmany; cc++)
+            data[cc][1] = 0.0;
+        for (ii = 1; ii < this->cd->sizes[1]/2; ii++)
+            for (cc = 0; cc < howmany; cc++) {
+                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[0] =
+                        (*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[0];
+                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[1] =
+                        -(*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[1];
+            }
+    }
+    cnumber *buffer;
+    buffer = fftw_interface<rnumber>::alloc_complex(howmany*this->cd->sizes[1]);
+    ptrdiff_t yy;
+    /*ptrdiff_t tindex;*/
+    int ranksrc, rankdst;
+    for (yy = 1; yy < this->cd->sizes[0]/2; yy++) {
+        ranksrc = this->cd->rank[yy];
+        rankdst = this->cd->rank[this->cd->sizes[0] - yy];
+        if (this->cd->myrank == ranksrc)
+            for (ii = 0; ii < this->cd->sizes[1]; ii++)
+                for (cc = 0; cc < howmany; cc++)
+                    for (int imag_comp=0; imag_comp<2; imag_comp++)
+                        (*(buffer + howmany*ii+cc))[imag_comp] =
+                            (*(data + howmany*((yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[imag_comp];
+        if (ranksrc != rankdst)
+        {
+            if (this->cd->myrank == ranksrc)
+                MPI_Send((void*)buffer,
+                         howmany*this->cd->sizes[1], mpi_real_type<rnumber>::complex(), rankdst, yy,
+                        this->cd->comm);
+            if (this->cd->myrank == rankdst)
+                MPI_Recv((void*)buffer,
+                         howmany*this->cd->sizes[1], mpi_real_type<rnumber>::complex(), ranksrc, yy,
+                        this->cd->comm, mpistatus);
+        }
+        if (this->cd->myrank == rankdst)
+        {
+            for (ii = 1; ii < this->cd->sizes[1]; ii++)
+                for (cc = 0; cc < howmany; cc++)
+                {
+                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[0] =
+                            (*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[0];
+                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[1] =
+                            -(*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[1];
+                }
+            for (cc = 0; cc < howmany; cc++)
+            {
+                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[0] =  (*(buffer + cc))[0];
+                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[1] = -(*(buffer + cc))[1];
+            }
+        }
+    }
+    fftw_interface<rnumber>::free(buffer);
+    delete mpistatus;
+    /* put asymmetric data to 0 */
+    /*if (this->cd->myrank == this->cd->rank[this->cd->sizes[0]/2])
+    {
+        tindex = howmany*(this->cd->sizes[0]/2 - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2];
+        for (ii = 0; ii < this->cd->sizes[1]; ii++)
+        {
+            std::fill_n((rnumber*)(data + tindex), howmany*2*this->cd->sizes[2], 0.0);
+            tindex += howmany*this->cd->sizes[2];
+        }
+    }
+    tindex = howmany*();
+    std::fill_n((rnumber*)(data + tindex), howmany*2, 0.0);*/
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::read_base(const char *fname, rnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->rd->read(full_name, (void*)data);
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::read_base(const char *fname, cnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->cd->read(full_name, (void*)data);
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::write_base(const char *fname, rnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->rd->write(full_name, (void*)data);
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::write_base(const char *fname, cnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->cd->write(full_name, (void*)data);
+}
+
+/* finally, force generation of code                                         */
+template class fluid_solver_base<float>;
+template class fluid_solver_base<double>;
 
 /*****************************************************************************/
-/* now actually use the macro defined above                                  */
-FLUID_SOLVER_BASE_DEFINITIONS(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-FLUID_SOLVER_BASE_DEFINITIONS(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
-/*****************************************************************************/
+
+
+
 
diff --git a/bfps/cpp/fluid_solver_base.hpp b/bfps/cpp/fluid_solver_base.hpp
index 62deb597b4a6a3f4fc87198099d15778e7a2a255..e446956001a08fdbf0d3b11da8552e1cb6c61a45 100644
--- a/bfps/cpp/fluid_solver_base.hpp
+++ b/bfps/cpp/fluid_solver_base.hpp
@@ -30,6 +30,8 @@
 #include <vector>
 #include "base.hpp"
 #include "field_descriptor.hpp"
+#include "scope_timer.hpp"
+#include "omputils.hpp"
 
 #ifndef FLUID_SOLVER_BASE
 
@@ -81,7 +83,7 @@ class fluid_solver_base
                 double DKY = 1.0,
                 double DKZ = 1.0,
                 int DEALIAS_TYPE = 0,
-                unsigned FFTW_PLAN_RIGOR = FFTW_ESTIMATE);
+                unsigned FFTW_PLAN_RIGOR = DEFAULT_FFTW_FLAG);
         ~fluid_solver_base();
 
         void low_pass_Fourier(cnumber *__restrict__ a, int howmany, double kmax);
@@ -135,97 +137,133 @@ class fluid_solver_base
 /* macros for loops                                                          */
 
 /* Fourier space loop */
-#define CLOOP(obj, expression) \
- \
-{ \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            expression; \
-            cindex++; \
-        } \
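+/* the former loop macros are now templates: the loop body is passed in as a
+ * callable, and the slab dimension is split between OpenMP threads */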
+template <class ObjectType, class FuncType>
+void CLOOP(ObjectType* obj, FuncType expression)
+{
+    TIMEZONE("CLOOP");
+    #pragma omp parallel
+    {
+        const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[0]);
+        const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[0]);
+        for (ptrdiff_t yindex = start; yindex < ptrdiff_t(end); yindex++){
+            ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2];
+            for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+            for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
+                {
+                    expression(cindex, xindex, yindex, zindex);
+                    cindex++;
+                }
+        }
+    }
 }
 
-#define CLOOP_NXMODES(obj, expression) \
- \
-{ \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    { \
-        int nxmodes = 1; \
-        ptrdiff_t xindex = 0; \
-        expression; \
-        cindex++; \
-        nxmodes = 2; \
-    for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            expression; \
-            cindex++; \
-        } \
-    } \
+template <class ObjectType, class FuncType>
+void CLOOP_NXMODES(ObjectType* obj, FuncType expression)
+{
+    TIMEZONE("CLOOP_NXMODES");
+    #pragma omp parallel
+    {
+        const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[1]);
+        const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[1]);
+        for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
+            for (ptrdiff_t zindex = start; zindex < ptrdiff_t(end); zindex++)
+            {
+                ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2]
+                                   + zindex*obj->cd->subsizes[2];
+                int nxmodes = 1;
+                ptrdiff_t xindex = 0;
+                expression(cindex, xindex, yindex, zindex, nxmodes);
+                cindex++;
+                nxmodes = 2;
+                for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+                {
+                    expression(cindex, xindex, yindex, zindex, nxmodes);
+                    cindex++;
+                }
+            }
+        }
+    }
 }
 
-#define CLOOP_K2(obj, expression) \
- \
-{ \
-    double k2; \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-                  obj->ky[yindex]*obj->ky[yindex] + \
-                  obj->kz[zindex]*obj->kz[zindex]); \
-            expression; \
-            cindex++; \
-        } \
+
+template <class ObjectType, class FuncType>
+void CLOOP_K2(ObjectType* obj, FuncType expression)
+{
+    TIMEZONE("CLOOP_K2");
+    #pragma omp parallel
+    {
+        const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[1]);
+        const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[1]);
+        for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
+            for (ptrdiff_t zindex = start; zindex < ptrdiff_t(end); zindex++){
+                ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2]
+                                   + zindex*obj->cd->subsizes[2];
+                for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
+                {
+                    double k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                          obj->ky[yindex]*obj->ky[yindex] +
+                          obj->kz[zindex]*obj->kz[zindex]);
+                    expression(cindex, xindex, yindex, zindex, k2);
+                    cindex++;
+                }
+            }
+        }
+    }
 }
 
-#define CLOOP_K2_NXMODES(obj, expression) \
- \
-{ \
-    double k2; \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    { \
-        int nxmodes = 1; \
-        ptrdiff_t xindex = 0; \
-        k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-              obj->ky[yindex]*obj->ky[yindex] + \
-              obj->kz[zindex]*obj->kz[zindex]); \
-        expression; \
-        cindex++; \
-        nxmodes = 2; \
-    for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-                  obj->ky[yindex]*obj->ky[yindex] + \
-                  obj->kz[zindex]*obj->kz[zindex]); \
-            expression; \
-            cindex++; \
-        } \
-    } \
+
+template <class ObjectType, class FuncType>
+void CLOOP_K2_NXMODES(ObjectType* obj, FuncType expression)
+{
+    #pragma omp parallel
+    {
+        const hsize_t start = OmpUtils::ForIntervalStart(obj->cd->subsizes[1]);
+        const hsize_t end = OmpUtils::ForIntervalEnd(obj->cd->subsizes[1]);
+        for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++){
+            for (ptrdiff_t zindex = start; zindex < ptrdiff_t(end); zindex++)
+            {
+                ptrdiff_t cindex = yindex*obj->cd->subsizes[1]*obj->cd->subsizes[2]
+                                   + zindex*obj->cd->subsizes[2];
+                int nxmodes = 1;
+                ptrdiff_t xindex = 0;
+                double k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                      obj->ky[yindex]*obj->ky[yindex] +
+                      obj->kz[zindex]*obj->kz[zindex]);
+                expression(cindex, xindex, yindex, zindex, k2, nxmodes);
+                cindex++;
+                nxmodes = 2;
+                for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+                {
+                    double k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                          obj->ky[yindex]*obj->ky[yindex] +
+                          obj->kz[zindex]*obj->kz[zindex]);
+                    expression(cindex, xindex, yindex, zindex, k2, nxmodes);
+                    cindex++;
+                }
+            }
+        }
+    }
 }
 
-/* real space loop */
-#define RLOOP(obj, expression) \
- \
-{ \
-    for (int zindex = 0; zindex < obj->rd->subsizes[0]; zindex++) \
-    for (int yindex = 0; yindex < obj->rd->subsizes[1]; yindex++) \
-    { \
-        ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2); \
-    for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++) \
-        { \
-            expression; \
-            rindex++; \
-        } \
-    } \
+
+template <class ObjectType, class FuncType>
+void RLOOP(ObjectType* obj, FuncType expression)
+{
+    #pragma omp parallel
+    {
+        const hsize_t start = OmpUtils::ForIntervalStart(obj->rd->subsizes[1]);
+        const hsize_t end = OmpUtils::ForIntervalEnd(obj->rd->subsizes[1]);
+        for (ptrdiff_t zindex = 0; zindex < obj->rd->subsizes[0]; zindex++)
+        for (ptrdiff_t yindex = ptrdiff_t(start); yindex < ptrdiff_t(end); yindex++)
+        {
+            ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2);
+            for (ptrdiff_t xindex = 0; xindex < obj->rd->subsizes[2]; xindex++)
+            {
+                expression(rindex, xindex, yindex, zindex);
+                rindex++;
+            }
+        }
+    }
 }
 
 /*****************************************************************************/
diff --git a/bfps/cpp/interpolator.cpp b/bfps/cpp/interpolator.cpp
index ef53742a4fdeb2545f02954f10c47d2bcb3f6538..b088f86df95d6d0166e8a95923bf0d1cc062c073 100644
--- a/bfps/cpp/interpolator.cpp
+++ b/bfps/cpp/interpolator.cpp
@@ -150,7 +150,7 @@ template <class rnumber, int interp_neighbours>
 void interpolator<rnumber, interp_neighbours>::operator()(
         const int *xg,
         const double *xx,
-        double *dest,
+        double *__restrict__ dest,
         const int *deriv)
 {
     double bx[interp_neighbours*2+2], by[interp_neighbours*2+2], bz[interp_neighbours*2+2];
diff --git a/bfps/cpp/interpolator_base.cpp b/bfps/cpp/interpolator_base.cpp
index 58bf57cf13382f0704da4537dae9d21bb4a841da..db81dcb329070e14897e432e82a2fee95810e169 100644
--- a/bfps/cpp/interpolator_base.cpp
+++ b/bfps/cpp/interpolator_base.cpp
@@ -43,6 +43,20 @@ interpolator_base<rnumber, interp_neighbours>::interpolator_base(
     this->dz = 4*acos(0) / (fs->dkz*this->descriptor->sizes[0]);
 }
 
+template <class rnumber, int interp_neighbours>
+interpolator_base<rnumber, interp_neighbours>::interpolator_base(
+        vorticity_equation<rnumber, FFTW> *fs,
+        base_polynomial_values BETA_POLYS)
+{
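+    // stub: the vorticity_equation based initialization below is left
+    // commented out and is not active yet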
+//    this->descriptor = fs->rd;
+//    this->compute_beta = BETA_POLYS;
+//
+//    // compute dx, dy, dz;
+//    this->dx = 4*acos(0) / (fs->kk->dkx*this->descriptor->sizes[2]);
+//    this->dy = 4*acos(0) / (fs->kk->dky*this->descriptor->sizes[1]);
+//    this->dz = 4*acos(0) / (fs->kk->dkz*this->descriptor->sizes[0]);
+}
+
 template <class rnumber, int interp_neighbours>
 void interpolator_base<rnumber, interp_neighbours>::get_grid_coordinates(
         const int nparticles,
diff --git a/bfps/cpp/interpolator_base.hpp b/bfps/cpp/interpolator_base.hpp
index 7dda7fb08319bf2a044bcc220e204b748d6336d6..f4b793342d9b5b38e39c717ad30ee88e106958aa 100644
--- a/bfps/cpp/interpolator_base.hpp
+++ b/bfps/cpp/interpolator_base.hpp
@@ -25,6 +25,7 @@
 
 
 #include "fluid_solver_base.hpp"
+#include "vorticity_equation.hpp"
 #include "spline_n1.hpp"
 #include "spline_n2.hpp"
 #include "spline_n3.hpp"
@@ -58,6 +59,10 @@ class interpolator_base
         interpolator_base(
                 fluid_solver_base<rnumber> *FSOLVER,
                 base_polynomial_values BETA_POLYS);
+
+        interpolator_base(
+                vorticity_equation<rnumber, FFTW> *FSOLVER,
+                base_polynomial_values BETA_POLYS);
         virtual ~interpolator_base(){}
 
         /* may not destroy input */
diff --git a/bfps/cpp/kspace.cpp b/bfps/cpp/kspace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70581f081790ba7c114a8abbe5e113eabf38dd54
--- /dev/null
+++ b/bfps/cpp/kspace.cpp
@@ -0,0 +1,492 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+
+#include <cmath>
+#include <cstdlib>
+#include <algorithm>
+#include <cassert>
+#include "kspace.hpp"
+#include "scope_timer.hpp"
+#include "shared_array.hpp"
+
+template <field_backend be,
+          kspace_dealias_type dt>
+template <field_components fc>
+kspace<be, dt>::kspace(
+        const field_layout<fc> *source_layout,
+        const double DKX,
+        const double DKY,
+        const double DKZ)
+{
+    TIMEZONE("field::kspace");
+    /* get layout */
+    this->layout = new field_layout<ONE>(
+            source_layout->sizes,
+            source_layout->subsizes,
+            source_layout->starts,
+            source_layout->comm);
+
+    /* store dk values */
+    this->dkx = DKX;
+    this->dky = DKY;
+    this->dkz = DKZ;
+
+    /* compute kx, ky, kz and compute kM values */
+    switch(be)
+    {
+        case FFTW:
+            this->kx.resize(this->layout->sizes[2]);
+            this->ky.resize(this->layout->subsizes[0]);
+            this->kz.resize(this->layout->sizes[1]);
+            int i, ii;
+            for (i = 0; i<int(this->layout->sizes[2]); i++)
+                this->kx[i] = i*this->dkx;
+            for (i = 0; i<int(this->layout->subsizes[0]); i++)
+            {
+                ii = i + this->layout->starts[0];
+                if (ii <= int(this->layout->sizes[0]/2))
+                    this->ky[i] = this->dky*ii;
+                else
+                    this->ky[i] = this->dky*(ii - int(this->layout->sizes[0]));
+            }
+            for (i = 0; i<int(this->layout->sizes[1]); i++)
+            {
+                if (i <= int(this->layout->sizes[1]/2))
+                    this->kz[i] = this->dkz*i;
+                else
+                    this->kz[i] = this->dkz*(i - int(this->layout->sizes[1]));
+            }
+            switch(dt)
+            {
+                case TWO_THIRDS:
+                    this->kMx = this->dkx*(int(2*(int(this->layout->sizes[2])-1)/3)-1);
+                    this->kMy = this->dky*(int(this->layout->sizes[0] / 3)-1);
+                    this->kMz = this->dkz*(int(this->layout->sizes[1] / 3)-1);
+                    break;
+                case SMOOTH:
+                    this->kMx = this->dkx*(int(this->layout->sizes[2])-2);
+                    this->kMy = this->dky*(int(this->layout->sizes[0] / 2)-1);
+                    this->kMz = this->dkz*(int(this->layout->sizes[1] / 2)-1);
+                    break;
+            }
+            break;
+    }
+
+    /* get global kM and dk */
+    this->kM = this->kMx;
+    if (this->kM < this->kMy) this->kM = this->kMy;
+    if (this->kM < this->kMz) this->kM = this->kMz;
+    this->kM2 = this->kM * this->kM;
+    this->dk = this->dkx;
+    if (this->dk > this->dky) this->dk = this->dky;
+    if (this->dk > this->dkz) this->dk = this->dkz;
+    this->dk2 = this->dk*this->dk;
+
+    /* spectra stuff */
+    this->nshells = int(this->kM / this->dk) + 2;
+    this->kshell.resize(this->nshells, 0);
+    this->nshell.resize(this->nshells, 0);
+
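+    /* per-thread accumulators, merged after the loop to avoid data races */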
+    shared_array<double> kshell_local_thread(this->nshells,[&](double* kshell_local){
+        std::fill_n(kshell_local, this->nshells, 0);
+    });
+    shared_array<int64_t> nshell_local_thread(this->nshells,[&](int64_t* nshell_local){
+        std::fill_n(nshell_local, this->nshells, 0);
+    });
+
+    std::vector<std::unordered_map<int, double>> dealias_filter_threaded(omp_get_max_threads());
+
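+    /* accumulate shell statistics and, for SMOOTH dealiasing, tabulate the
+     * Hou & Li (2007) filter exp(-36 (k/kM)^36), keyed by round(k2/dk2) */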
+    this->CLOOP_K2_NXMODES(
+            [&](ptrdiff_t cindex,
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex,
+                double k2,
+                int nxmodes){
+            if (k2 < this->kM2)
+            {
+                double knorm = sqrt(k2);
+                kshell_local_thread.getMine()[int(knorm/this->dk)] += nxmodes*knorm;
+                nshell_local_thread.getMine()[int(knorm/this->dk)] += nxmodes;
+            }
+            if (dt == SMOOTH){
+                dealias_filter_threaded[omp_get_thread_num()][int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));
+            }
+    });
+
+    // Merge results
+
+    kshell_local_thread.mergeParallel();
+    nshell_local_thread.mergeParallel();
+
+    if (dt == SMOOTH){
+        for(int idxMerge = 0 ; idxMerge < int(dealias_filter_threaded.size()) ; ++idxMerge){
+            for(const auto& kv : dealias_filter_threaded[idxMerge]){
+                this->dealias_filter[kv.first] = kv.second;
+            }
+        }
+    }
+
+    MPI_Allreduce(
+            nshell_local_thread.getMasterData(),
+            &this->nshell.front(),
+            this->nshells,
+            MPI_INT64_T, MPI_SUM, this->layout->comm);
+    MPI_Allreduce(
+            kshell_local_thread.getMasterData(),
+            &this->kshell.front(),
+            this->nshells,
+            MPI_DOUBLE, MPI_SUM, this->layout->comm);
+    for (int n=0; n<this->nshells; n++){
+        if (this->nshell[n] != 0){
+            this->kshell[n] /= this->nshell[n];
+        }
+    }
+}
+
+template <field_backend be,
+          kspace_dealias_type dt>
+kspace<be, dt>::~kspace()
+{
+    delete this->layout;
+}
+
+template <field_backend be,
+          kspace_dealias_type dt>
+template <typename rnumber,
+          field_components fc>
+void kspace<be, dt>::low_pass(typename fftw_interface<rnumber>::complex *__restrict__ a, const double kmax)
+{
+    const double km2 = kmax*kmax;
+    this->CLOOP_K2(
+            [&](ptrdiff_t cindex,
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex,
+                double k2){
+            if (k2 >= km2)
+                std::fill_n((rnumber*)(a + ncomp(fc)*cindex), 2*ncomp(fc), 0);
+                });
+}
+
+template <field_backend be,
+          kspace_dealias_type dt>
+template <typename rnumber,
+          field_components fc>
+void kspace<be, dt>::Gauss_filter(
+        typename fftw_interface<rnumber>::complex *__restrict__ a,
+        const double sigma)
+{
+    const double prefactor = - sigma*sigma/2;
+    this->CLOOP_K2(
+            [&](ptrdiff_t cindex,
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex,
+                double k2){
+                if (k2 <= this->kM2)
+                {
+                    for (unsigned int tcounter=0; tcounter<2*ncomp(fc); tcounter++)
+                        ((rnumber*)a)[2*ncomp(fc)*cindex + tcounter] *= exp(prefactor*k2);
+                }
+                });
+}
+
+template <field_backend be,
+          kspace_dealias_type dt>
+template <typename rnumber,
+          field_components fc>
+void kspace<be, dt>::dealias(typename fftw_interface<rnumber>::complex *__restrict__ a)
+{
+    switch(dt)
+    {
+        case TWO_THIRDS:
+            this->low_pass<rnumber, fc>(a, this->kM);
+            break;
+        case SMOOTH:
+            this->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+                    double tval = this->dealias_filter[int(round(k2 / this->dk2))];
+                    for (unsigned int tcounter=0; tcounter<2*ncomp(fc); tcounter++)
+                        ((rnumber*)a)[2*ncomp(fc)*cindex + tcounter] *= tval;
+                });
+            break;
+    }
+}
+
+template <field_backend be,
+          kspace_dealias_type dt>
+template <typename rnumber>
+void kspace<be, dt>::force_divfree(typename fftw_interface<rnumber>::complex *__restrict__ a)
+{
+    TIMEZONE("kspace::force_divfree");
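+    /* subtract the compressive component: a -= k (k.a) / k^2 */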
+    this->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+                if (k2 > 0)
+        {
+                    typename fftw_interface<rnumber>::complex tval;
+                    tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) +
+                               this->ky[yindex]*((*(a + cindex*3+1))[0]) +
+                               this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2;
+                    tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) +
+                               this->ky[yindex]*((*(a + cindex*3+1))[1]) +
+                               this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2;
+                    for (int imag_part=0; imag_part<2; imag_part++)
+                    {
+                        a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex];
+                        a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex];
+                        a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex];
+                    }
+           }
+        }
+    );
+    if (this->layout->myrank == this->layout->rank[0][0])
+        std::fill_n((rnumber*)(a), 6, 0.0);
+}
+
+template <field_backend be,
+          kspace_dealias_type dt>
+template <typename rnumber,
+          field_components fc>
+void kspace<be, dt>::cospectrum(
+        const rnumber(* __restrict a)[2],
+        const rnumber(* __restrict b)[2],
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset)
+{
+    TIMEZONE("field::cospectrum");
+    shared_array<double> spec_local_thread(this->nshells*ncomp(fc)*ncomp(fc),[&](double* spec_local){
+        std::fill_n(spec_local, this->nshells*ncomp(fc)*ncomp(fc), 0);
+    });
+
+    this->CLOOP_K2_NXMODES(
+            [&](ptrdiff_t cindex,
+                ptrdiff_t xindex,
+                ptrdiff_t yindex,
+                ptrdiff_t zindex,
+                double k2,
+                int nxmodes){
+            if (k2 <= this->kM2)
+            {
+                double* spec_local = spec_local_thread.getMine();
+                int tmp_int = int(sqrt(k2) / this->dk)*ncomp(fc)*ncomp(fc);
+                for (hsize_t i=0; i<ncomp(fc); i++)
+                for (hsize_t j=0; j<ncomp(fc); j++){
+                    spec_local[tmp_int + i*ncomp(fc)+j] += nxmodes * (
+                        (a[ncomp(fc)*cindex + i][0] * b[ncomp(fc)*cindex + j][0]) +
+                        (a[ncomp(fc)*cindex + i][1] * b[ncomp(fc)*cindex + j][1]));
+                }
+            }
+            });
+
+    spec_local_thread.mergeParallel();
+
+    std::vector<double> spec;
+    spec.resize(this->nshells*ncomp(fc)*ncomp(fc), 0);
+    MPI_Allreduce(
+            spec_local_thread.getMasterData(),
+            &spec.front(),
+            spec.size(),
+            MPI_DOUBLE, MPI_SUM, this->layout->comm);
+    if (this->layout->myrank == 0)
+    {
+        hid_t dset, wspace, mspace;
+        hsize_t count[(ndim(fc)-2)*2], offset[(ndim(fc)-2)*2], dims[(ndim(fc)-2)*2];
+        dset = H5Dopen(group, ("spectra/" + dset_name).c_str(), H5P_DEFAULT);
+        wspace = H5Dget_space(dset);
+        H5Sget_simple_extent_dims(wspace, dims, NULL);
+        switch (fc)
+        {
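+            /* intentional fallthrough: higher-rank cases set the extra
+             * dimensions, then reuse the lower-rank offsets and counts */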
+            case THREExTHREE:
+                offset[4] = 0;
+                offset[5] = 0;
+                count[4] = ncomp(fc);
+                count[5] = ncomp(fc);
+            case THREE:
+                offset[2] = 0;
+                offset[3] = 0;
+                count[2] = ncomp(fc);
+                count[3] = ncomp(fc);
+            default:
+                offset[0] = toffset;
+                offset[1] = 0;
+                count[0] = 1;
+                count[1] = this->nshells;
+        }
+        mspace = H5Screate_simple((ndim(fc)-2)*2, count, NULL);
+        H5Sselect_hyperslab(wspace, H5S_SELECT_SET, offset, NULL, count, NULL);
+        H5Dwrite(dset, H5T_NATIVE_DOUBLE, mspace, wspace, H5P_DEFAULT, &spec.front());
+        H5Sclose(wspace);
+        H5Sclose(mspace);
+        H5Dclose(dset);
+    }
+}
+
+
+template class kspace<FFTW, TWO_THIRDS>;
+template class kspace<FFTW, SMOOTH>;
+
+template kspace<FFTW, TWO_THIRDS>::kspace<>(
+        const field_layout<ONE> *,
+        const double, const double, const double);
+template kspace<FFTW, TWO_THIRDS>::kspace<>(
+        const field_layout<THREE> *,
+        const double, const double, const double);
+template kspace<FFTW, TWO_THIRDS>::kspace<>(
+        const field_layout<THREExTHREE> *,
+        const double, const double, const double);
+
+template kspace<FFTW, SMOOTH>::kspace<>(
+        const field_layout<ONE> *,
+        const double, const double, const double);
+template kspace<FFTW, SMOOTH>::kspace<>(
+        const field_layout<THREE> *,
+        const double, const double, const double);
+template kspace<FFTW, SMOOTH>::kspace<>(
+        const field_layout<THREExTHREE> *,
+        const double, const double, const double);
+
+template void kspace<FFTW, SMOOTH>::low_pass<float, ONE>(
+        typename fftw_interface<float>::complex *__restrict__ a,
+        const double kmax);
+template void kspace<FFTW, SMOOTH>::low_pass<float, THREE>(
+        typename fftw_interface<float>::complex *__restrict__ a,
+        const double kmax);
+template void kspace<FFTW, SMOOTH>::low_pass<float, THREExTHREE>(
+        typename fftw_interface<float>::complex *__restrict__ a,
+        const double kmax);
+
+template void kspace<FFTW, SMOOTH>::low_pass<double, ONE>(
+        typename fftw_interface<double>::complex *__restrict__ a,
+        const double kmax);
+template void kspace<FFTW, SMOOTH>::low_pass<double, THREE>(
+        typename fftw_interface<double>::complex *__restrict__ a,
+        const double kmax);
+template void kspace<FFTW, SMOOTH>::low_pass<double, THREExTHREE>(
+        typename fftw_interface<double>::complex *__restrict__ a,
+        const double kmax);
+
+template void kspace<FFTW, SMOOTH>::dealias<float, ONE>(
+        typename fftw_interface<float>::complex *__restrict__ a);
+template void kspace<FFTW, SMOOTH>::dealias<float, THREE>(
+        typename fftw_interface<float>::complex *__restrict__ a);
+template void kspace<FFTW, SMOOTH>::dealias<float, THREExTHREE>(
+        typename fftw_interface<float>::complex *__restrict__ a);
+
+template void kspace<FFTW, SMOOTH>::dealias<double, ONE>(
+        typename fftw_interface<double>::complex *__restrict__ a);
+template void kspace<FFTW, SMOOTH>::dealias<double, THREE>(
+        typename fftw_interface<double>::complex *__restrict__ a);
+template void kspace<FFTW, SMOOTH>::dealias<double, THREExTHREE>(
+        typename fftw_interface<double>::complex *__restrict__ a);
+
+template void kspace<FFTW, TWO_THIRDS>::cospectrum<float, ONE>(
+        const typename fftw_interface<float>::complex *__restrict__ a,
+        const typename fftw_interface<float>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, TWO_THIRDS>::cospectrum<float, THREE>(
+        const typename fftw_interface<float>::complex *__restrict__ a,
+        const typename fftw_interface<float>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, TWO_THIRDS>::cospectrum<float, THREExTHREE>(
+        const typename fftw_interface<float>::complex *__restrict__ a,
+        const typename fftw_interface<float>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, TWO_THIRDS>::cospectrum<double, ONE>(
+        const typename fftw_interface<double>::complex *__restrict__ a,
+        const typename fftw_interface<double>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, TWO_THIRDS>::cospectrum<double, THREE>(
+        const typename fftw_interface<double>::complex *__restrict__ a,
+        const typename fftw_interface<double>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, TWO_THIRDS>::cospectrum<double, THREExTHREE>(
+        const typename fftw_interface<double>::complex *__restrict__ a,
+        const typename fftw_interface<double>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+
+template void kspace<FFTW, SMOOTH>::cospectrum<float, ONE>(
+        const typename fftw_interface<float>::complex *__restrict__ a,
+        const typename fftw_interface<float>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, SMOOTH>::cospectrum<float, THREE>(
+        const typename fftw_interface<float>::complex *__restrict__ a,
+        const typename fftw_interface<float>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, SMOOTH>::cospectrum<float, THREExTHREE>(
+        const typename fftw_interface<float>::complex *__restrict__ a,
+        const typename fftw_interface<float>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, SMOOTH>::cospectrum<double, ONE>(
+        const typename fftw_interface<double>::complex *__restrict__ a,
+        const typename fftw_interface<double>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, SMOOTH>::cospectrum<double, THREE>(
+        const typename fftw_interface<double>::complex *__restrict__ a,
+        const typename fftw_interface<double>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+template void kspace<FFTW, SMOOTH>::cospectrum<double, THREExTHREE>(
+        const typename fftw_interface<double>::complex *__restrict__ a,
+        const typename fftw_interface<double>::complex *__restrict__ b,
+        const hid_t group,
+        const std::string dset_name,
+        const hsize_t toffset);
+
+template void kspace<FFTW, SMOOTH>::force_divfree<float>(
+       typename fftw_interface<float>::complex *__restrict__ a);
+template void kspace<FFTW, SMOOTH>::force_divfree<double>(
+       typename fftw_interface<double>::complex *__restrict__ a);
+
diff --git a/bfps/cpp/kspace.hpp b/bfps/cpp/kspace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e6f0f67a09a39d355480f94683e1d40d68b12cce
--- /dev/null
+++ b/bfps/cpp/kspace.hpp
@@ -0,0 +1,176 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+
+
+#include <hdf5.h>
+#include <unordered_map>
+#include <vector>
+#include <string>
+#include "omputils.hpp"
+#include "fftw_interface.hpp"
+#include "field_layout.hpp"
+
+#ifndef KSPACE_HPP
+
+#define KSPACE_HPP
+
+enum field_backend {FFTW};
+enum kspace_dealias_type {TWO_THIRDS, SMOOTH};
+
+
+template <field_backend be,
+          kspace_dealias_type dt>
+class kspace
+{
+    public:
+        /* relevant field layout */
+        field_layout<ONE> *layout;
+
+        /* physical parameters */
+        double dkx, dky, dkz, dk, dk2;
+
+        /* mode and dealiasing information */
+        double kMx, kMy, kMz, kM, kM2;
+        std::vector<double> kx, ky, kz;
+        std::unordered_map<int, double> dealias_filter;
+        std::vector<double> kshell;
+        std::vector<int64_t> nshell;
+        int nshells;
+
+        /* methods */
+        template <field_components fc>
+        kspace(
+                const field_layout<fc> *source_layout,
+                const double DKX = 1.0,
+                const double DKY = 1.0,
+                const double DKZ = 1.0);
+        ~kspace();
+
+        template <typename rnumber,
+                  field_components fc>
+        void low_pass(
+                typename fftw_interface<rnumber>::complex *__restrict__ a,
+                const double kmax);
+
+        template <typename rnumber,
+                  field_components fc>
+        void Gauss_filter(
+                typename fftw_interface<rnumber>::complex *__restrict__ a,
+                const double sigma);
+
+        template <typename rnumber,
+                  field_components fc>
+        void dealias(typename fftw_interface<rnumber>::complex *__restrict__ a);
+
+        template <typename rnumber,
+                  field_components fc>
+        void cospectrum(
+                const rnumber(* __restrict__ a)[2],
+                const rnumber(* __restrict__ b)[2],
+                const hid_t group,
+                const std::string dset_name,
+                const hsize_t toffset);
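+
+        /* loop helpers: traverse the local (FFTW-transposed) layout in
+         * (y, z, x) order, splitting the z range between OpenMP threads;
+         * the callable receives the flat index plus the grid indices */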
+        template <class func_type>
+        void CLOOP(func_type expression)
+        {
+            #pragma omp parallel
+            {
+                const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[1]);
+                const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[1]);
+
+                for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){
+                    for (hsize_t zindex = start; zindex < end; zindex++){
+                        ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2]
+                                            + zindex*this->layout->subsizes[2];
+                        for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++)
+                        {
+                            expression(cindex, xindex, yindex, zindex);
+                            cindex++;
+                        }
+                    }
+                }
+            }
+        }
+        template <class func_type>
+        void CLOOP_K2(func_type expression)
+        {
+            #pragma omp parallel
+            {
+                const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[1]);
+                const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[1]);
+
+                for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){
+                    for (hsize_t zindex = start; zindex < end; zindex++){
+                        ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2]
+                                            + zindex*this->layout->subsizes[2];
+                        for (hsize_t xindex = 0; xindex < this->layout->subsizes[2]; xindex++)
+                        {
+                            double k2 = (this->kx[xindex]*this->kx[xindex] +
+                                  this->ky[yindex]*this->ky[yindex] +
+                                  this->kz[zindex]*this->kz[zindex]);
+                            expression(cindex, xindex, yindex, zindex, k2);
+                            cindex++;
+                        }
+                    }
+                }
+            }
+        }
+        template <class func_type>
+        void CLOOP_K2_NXMODES(func_type expression)
+        {
+            #pragma omp parallel
+            {
+                const hsize_t start = OmpUtils::ForIntervalStart(this->layout->subsizes[1]);
+                const hsize_t end = OmpUtils::ForIntervalEnd(this->layout->subsizes[1]);
+
+                for (hsize_t yindex = 0; yindex < this->layout->subsizes[0]; yindex++){
+                    for (hsize_t zindex = start; zindex < end; zindex++){
+                        ptrdiff_t cindex = yindex*this->layout->subsizes[1]*this->layout->subsizes[2]
+                                            + zindex*this->layout->subsizes[2];
+                        hsize_t xindex = 0;
+                        double k2 = (
+                                this->kx[xindex]*this->kx[xindex] +
+                                this->ky[yindex]*this->ky[yindex] +
+                                this->kz[zindex]*this->kz[zindex]);
+                        expression(cindex, xindex, yindex, zindex, k2, 1);
+                        cindex++;
+                        for (xindex = 1; xindex < this->layout->subsizes[2]; xindex++)
+                        {
+                            k2 = (this->kx[xindex]*this->kx[xindex] +
+                                  this->ky[yindex]*this->ky[yindex] +
+                                  this->kz[zindex]*this->kz[zindex]);
+                            expression(cindex, xindex, yindex, zindex, k2, 2);
+                            cindex++;
+                        }
+                    }
+                }
+            }
+        }
+        template <typename rnumber>
+        void force_divfree(typename fftw_interface<rnumber>::complex *__restrict__ a);
+};
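+
+/* minimal usage sketch (the names "clayout" and "cdata" are hypothetical):
+ *
+ *     kspace<FFTW, SMOOTH> *kk = new kspace<FFTW, SMOOTH>(
+ *             clayout, dkx, dky, dkz);
+ *     kk->low_pass<float, THREE>(cdata, kk->kM);
+ *     kk->force_divfree<float>(cdata);
+ *     delete kk;
+ */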
+
+#endif//KSPACE_HPP
+
diff --git a/bfps/cpp/omputils.hpp b/bfps/cpp/omputils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cdd6c6c173c7cf002b72e0c1a7aebcf727f2d33e
--- /dev/null
+++ b/bfps/cpp/omputils.hpp
@@ -0,0 +1,27 @@
+#ifndef OMPUTILS_HPP
+#define OMPUTILS_HPP
+
+#include <omp.h>
+
+namespace OmpUtils{
+
+template <class IndexType>
+inline IndexType ForIntervalStart(const IndexType size){
+    const double chunk = double(size)/double(omp_get_num_threads());
+    const IndexType start = IndexType(chunk*double(omp_get_thread_num()));
+    return start;
+}
+
+template <class IndexType>
+inline IndexType ForIntervalEnd(const IndexType size){
+    const double chunk = double(size)/double(omp_get_num_threads());
+    const IndexType end = (omp_get_thread_num() == omp_get_num_threads()-1) ?
+                                size:
+                                IndexType(chunk*double(omp_get_thread_num()+1));
+    return end;
+}
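+
+/* usage sketch: inside an OpenMP parallel region each thread processes
+ * [ForIntervalStart(n), ForIntervalEnd(n)); "work" is hypothetical:
+ *
+ *     #pragma omp parallel
+ *     {
+ *         const size_t start = OmpUtils::ForIntervalStart(n);
+ *         const size_t end = OmpUtils::ForIntervalEnd(n);
+ *         for (size_t idx = start; idx < end; ++idx)
+ *             work(idx);
+ *     }
+ */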
+
+}
+
+
+#endif
diff --git a/bfps/cpp/particles.cpp b/bfps/cpp/particles.cpp
index 847f065d49299b559162060876402101fe48d9d4..cdaf157cb912c3074faf84bfecf1d9b3752c78a7 100644
--- a/bfps/cpp/particles.cpp
+++ b/bfps/cpp/particles.cpp
@@ -43,17 +43,17 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 particles<particle_type, rnumber, interp_neighbours>::particles(
         const char *NAME,
         const hid_t data_file_id,
-        interpolator_base<rnumber, interp_neighbours> *FIELD,
+        interpolator_base<rnumber, interp_neighbours> *VEL,
         const int TRAJ_SKIP,
         const int INTEGRATION_STEPS) : particles_io_base<particle_type>(
             NAME,
             TRAJ_SKIP,
             data_file_id,
-            FIELD->descriptor->comm)
+            VEL->descriptor->comm)
 {
     assert((INTEGRATION_STEPS <= 6) &&
            (INTEGRATION_STEPS >= 1));
-    this->vel = FIELD;
+    this->vel = VEL;
     this->integration_steps = INTEGRATION_STEPS;
     this->array_size = this->nparticles * state_dimension(particle_type);
     this->state = new double[this->array_size];
diff --git a/bfps/cpp/particles/abstract_particles_distr.hpp b/bfps/cpp/particles/abstract_particles_distr.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..28837b5cdc69e711ca90f6b62d2fb72128564dbe
--- /dev/null
+++ b/bfps/cpp/particles/abstract_particles_distr.hpp
@@ -0,0 +1,849 @@
+#ifndef ABSTRACT_PARTICLES_DISTR_HPP
+#define ABSTRACT_PARTICLES_DISTR_HPP
+
+#include <mpi.h>
+
+#include <vector>
+#include <memory>
+#include <cassert>
+
+#include <type_traits>
+#include <omp.h>
+
+#include "scope_timer.hpp"
+#include "particles_utils.hpp"
+
+
+template <class real_number, int size_particle_positions, int size_particle_rhs, int size_particle_index>
+class abstract_particles_distr {
+protected:
+    static const int MaxNbRhs = 100;
+
+    enum MpiTag{
+        TAG_LOW_UP_NB_PARTICLES,
+        TAG_UP_LOW_NB_PARTICLES,
+        TAG_LOW_UP_PARTICLES,
+        TAG_UP_LOW_PARTICLES,
+        TAG_LOW_UP_RESULTS,
+        TAG_UP_LOW_RESULTS,
+
+        TAG_LOW_UP_MOVED_NB_PARTICLES,
+        TAG_UP_LOW_MOVED_NB_PARTICLES,
+        TAG_LOW_UP_MOVED_PARTICLES,
+        TAG_UP_LOW_MOVED_PARTICLES,
+
+        TAG_LOW_UP_MOVED_PARTICLES_INDEXES,
+        TAG_UP_LOW_MOVED_PARTICLES_INDEXES,
+
+        TAG_LOW_UP_MOVED_PARTICLES_RHS,
+        TAG_LOW_UP_MOVED_PARTICLES_RHS_MAX = TAG_LOW_UP_MOVED_PARTICLES_RHS+MaxNbRhs,
+
+        TAG_UP_LOW_MOVED_PARTICLES_RHS = TAG_LOW_UP_MOVED_PARTICLES_RHS_MAX,
+        TAG_UP_LOW_MOVED_PARTICLES_RHS_MAX = TAG_UP_LOW_MOVED_PARTICLES_RHS+MaxNbRhs,
+    };
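+
+    // The *_RHS tags above span MaxNbRhs values each, so that every rhs
+    // buffer travels with its own tag; exchanging more than MaxNbRhs rhs
+    // arrays would make the ranges overlap.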
+
+    struct NeighborDescriptor{
+        int nbPartitionsToSend;
+        int nbPartitionsToRecv;
+        int nbParticlesToSend;
+        int nbParticlesToRecv;
+        int destProc;
+        int rankDiff;
+        bool isLower;
+        int idxLowerUpper;
+
+        std::unique_ptr<real_number[]> toRecvAndMerge;
+        std::unique_ptr<real_number[]> toCompute;
+        std::unique_ptr<real_number[]> results;
+    };
+
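+    // Action attached to each pending MPI request; the event loops in
+    // compute_distr and redistribute dispatch on it when the corresponding
+    // request completes.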
+    enum Action{
+        NOTHING_TODO,
+        RECV_PARTICLES,
+        COMPUTE_PARTICLES,
+        RELEASE_BUFFER_PARTICLES,
+        MERGE_PARTICLES,
+
+        RECV_MOVE_NB_LOW,
+        RECV_MOVE_NB_UP,
+        RECV_MOVE_LOW,
+        RECV_MOVE_UP
+    };
+
+    MPI_Comm current_com;
+
+    int my_rank;
+    int nb_processes;
+    int nb_processes_involved;
+
+    const std::pair<int,int> current_partition_interval;
+    const int current_partition_size;
+
+    std::unique_ptr<int[]> partition_interval_size_per_proc;
+    std::unique_ptr<int[]> partition_interval_offset_per_proc;
+
+    std::unique_ptr<int[]> current_offset_particles_for_partition;
+
+    std::vector<std::pair<Action,int>> whatNext;
+    std::vector<MPI_Request> mpiRequests;
+    std::vector<NeighborDescriptor> neigDescriptors;
+
+public:
+    ////////////////////////////////////////////////////////////////////////////
+
+    abstract_particles_distr(MPI_Comm in_current_com,
+                             const std::pair<int,int>& in_current_partitions)
+        : current_com(in_current_com),
+            my_rank(-1), nb_processes(-1), nb_processes_involved(-1),
+            current_partition_interval(in_current_partitions),
+            current_partition_size(current_partition_interval.second-current_partition_interval.first){
+
+        AssertMpi(MPI_Comm_rank(current_com, &my_rank));
+        AssertMpi(MPI_Comm_size(current_com, &nb_processes));
+
+        partition_interval_size_per_proc.reset(new int[nb_processes]);
+        AssertMpi( MPI_Allgather( const_cast<int*>(&current_partition_size), 1, MPI_INT,
+                                  partition_interval_size_per_proc.get(), 1, MPI_INT,
+                                  current_com) );
+        assert(partition_interval_size_per_proc[my_rank] == current_partition_size);
+
+        partition_interval_offset_per_proc.reset(new int[nb_processes+1]);
+        partition_interval_offset_per_proc[0] = 0;
+        for(int idxProc = 0 ; idxProc < nb_processes ; ++idxProc){
+            partition_interval_offset_per_proc[idxProc+1] = partition_interval_offset_per_proc[idxProc] + partition_interval_size_per_proc[idxProc];
+        }
+
+        current_offset_particles_for_partition.reset(new int[current_partition_size+1]);
+
+        nb_processes_involved = nb_processes;
+        while(nb_processes_involved != 0 && partition_interval_size_per_proc[nb_processes_involved-1] == 0){
+            nb_processes_involved -= 1;
+        }
+        assert(nb_processes_involved != 0);
+        for(int idx_proc_involved = 0 ; idx_proc_involved < nb_processes_involved ; ++idx_proc_involved){
+            assert(partition_interval_size_per_proc[idx_proc_involved] != 0);
+        }
+    }
+
+    virtual ~abstract_particles_distr(){}
+
+    ////////////////////////////////////////////////////////////////////////////
+
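+    // Overview: exchange border ("ghost") particles with the neighboring
+    // processes and accumulate the rhs contributions. Border particles are
+    // sent to the neighbors that need them for interpolation; the rhs is
+    // computed for the particles received in return and sent back; finally,
+    // the values the neighbors computed for our own border particles are
+    // merged into particles_current_rhs.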
+    void compute_distr(const int current_my_nb_particles_per_partition[],
+                       const real_number particles_positions[],
+                       real_number particles_current_rhs[],
+                       const int interpolation_size){
+        TIMEZONE("compute_distr");
+
+        // Some processes might not be involved
+        if(nb_processes_involved <= my_rank){
+            return;
+        }
+
+        current_offset_particles_for_partition[0] = 0;
+        int myTotalNbParticles = 0;
+        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+            myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition];
+            current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition];
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        /// Exchange the number of particles in each partition
+        /// This exchange could be limited to the processes actually involved, but using the full communicator should not be a problem
+        //////////////////////////////////////////////////////////////////////
+
+
+        assert(whatNext.size() == 0);
+        assert(mpiRequests.size() == 0);
+
+        neigDescriptors.clear();
+
+        int nbProcToRecvLower;
+        {
+            int nextDestProc = my_rank;
+            for(int idxLower = 1 ; idxLower <= interpolation_size ; idxLower += partition_interval_size_per_proc[nextDestProc]){
+                nextDestProc = (nextDestProc-1+nb_processes_involved)%nb_processes_involved;
+                const int destProc = nextDestProc;
+                const int lowerRankDiff = (nextDestProc < my_rank ? my_rank - nextDestProc : nb_processes_involved-nextDestProc+my_rank);
+
+                const int nbPartitionsToSend = std::min(current_partition_size, interpolation_size-(idxLower-1));
+                const int nbParticlesToSend = current_offset_particles_for_partition[nbPartitionsToSend] - current_offset_particles_for_partition[0];
+
+                const int nbPartitionsToRecv = std::min(partition_interval_size_per_proc[destProc], (interpolation_size+1)-(idxLower-1));
+                const int nbParticlesToRecv = -1;
+
+                NeighborDescriptor descriptor;
+                descriptor.destProc = destProc;
+                descriptor.rankDiff = lowerRankDiff;
+                descriptor.nbPartitionsToSend = nbPartitionsToSend;
+                descriptor.nbParticlesToSend = nbParticlesToSend;
+                descriptor.nbPartitionsToRecv = nbPartitionsToRecv;
+                descriptor.nbParticlesToRecv = nbParticlesToRecv;
+                descriptor.isLower = true;
+                descriptor.idxLowerUpper = idxLower;
+
+                neigDescriptors.emplace_back(std::move(descriptor));
+            }
+            nbProcToRecvLower = neigDescriptors.size();
+
+            nextDestProc = my_rank;
+            for(int idxUpper = 1 ; idxUpper <= interpolation_size ; idxUpper += partition_interval_size_per_proc[nextDestProc]){
+                nextDestProc = (nextDestProc+1+nb_processes_involved)%nb_processes_involved;
+                const int destProc = nextDestProc;
+                const int upperRankDiff = (nextDestProc > my_rank ? nextDestProc - my_rank: nb_processes_involved-my_rank+nextDestProc);
+
+                const int nbPartitionsToSend = std::min(current_partition_size, (interpolation_size+1)-(idxUpper-1));
+                const int nbParticlesToSend = current_offset_particles_for_partition[current_partition_size] - current_offset_particles_for_partition[current_partition_size-nbPartitionsToSend];
+
+                const int nbPartitionsToRecv = std::min(partition_interval_size_per_proc[destProc], interpolation_size-(idxUpper-1));
+                const int nbParticlesToRecv = -1;
+
+                NeighborDescriptor descriptor;
+                descriptor.destProc = destProc;
+                descriptor.rankDiff = upperRankDiff;
+                descriptor.nbPartitionsToSend = nbPartitionsToSend;
+                descriptor.nbParticlesToSend = nbParticlesToSend;
+                descriptor.nbPartitionsToRecv = nbPartitionsToRecv;
+                descriptor.nbParticlesToRecv = nbParticlesToRecv;
+                descriptor.isLower = false;
+                descriptor.idxLowerUpper = idxUpper;
+
+                neigDescriptors.emplace_back(std::move(descriptor));
+            }
+        }
+        const int nbProcToRecvUpper = neigDescriptors.size()-nbProcToRecvLower;
+        const int nbProcToRecv = nbProcToRecvUpper + nbProcToRecvLower;
+        assert(int(neigDescriptors.size()) == nbProcToRecv);
+
+        for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){
+            NeighborDescriptor& descriptor = neigDescriptors[idxDescr];
+
+            if(descriptor.isLower){
+                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Isend(const_cast<int*>(&descriptor.nbParticlesToSend), 1, MPI_INT, descriptor.destProc, TAG_LOW_UP_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+
+                if(descriptor.nbParticlesToSend){
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Isend(const_cast<real_number*>(&particles_positions[0]), descriptor.nbParticlesToSend*size_particle_positions, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_LOW_UP_PARTICLES,
+                              current_com, &mpiRequests.back()));
+
+                    assert(descriptor.toRecvAndMerge == nullptr);
+                    descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToSend*size_particle_rhs]);
+                    whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, idxDescr});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Irecv(descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend*size_particle_rhs, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_UP_LOW_RESULTS,
+                              current_com, &mpiRequests.back()));
+                }
+
+                whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Irecv(&descriptor.nbParticlesToRecv,
+                          1, MPI_INT, descriptor.destProc, TAG_UP_LOW_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+            }
+            else{
+                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Isend(const_cast<int*>(&descriptor.nbParticlesToSend), 1, MPI_INT, descriptor.destProc, TAG_UP_LOW_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+
+                if(descriptor.nbParticlesToSend){
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Isend(const_cast<real_number*>(&particles_positions[(current_offset_particles_for_partition[current_partition_size-descriptor.nbPartitionsToSend])*size_particle_positions]), descriptor.nbParticlesToSend*size_particle_positions, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_UP_LOW_PARTICLES,
+                              current_com, &mpiRequests.back()));
+
+                    assert(descriptor.toRecvAndMerge == nullptr);
+                    descriptor.toRecvAndMerge.reset(new real_number[descriptor.nbParticlesToSend*size_particle_rhs]);
+                    whatNext.emplace_back(std::pair<Action,int>{MERGE_PARTICLES, idxDescr});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Irecv(descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend*size_particle_rhs, particles_utils::GetMpiType(real_number()), descriptor.destProc, TAG_LOW_UP_RESULTS,
+                              current_com, &mpiRequests.back()));
+                }
+
+                whatNext.emplace_back(std::pair<Action,int>{RECV_PARTICLES, idxDescr});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Irecv(&descriptor.nbParticlesToRecv,
+                          1, MPI_INT, descriptor.destProc, TAG_LOW_UP_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+            }
+        }
+
+        const bool more_than_one_thread = (omp_get_max_threads() > 1);
+
+        TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads())
+        #pragma omp parallel default(shared)
+        {
+            #pragma omp master
+            {
+                while(mpiRequests.size()){
+                    assert(mpiRequests.size() == whatNext.size());
+
+                    int idxDone = mpiRequests.size();
+                    {
+                        TIMEZONE("wait");
+                        AssertMpi(MPI_Waitany(int(mpiRequests.size()), mpiRequests.data(), &idxDone, MPI_STATUS_IGNORE));
+                    }
+                    const std::pair<Action, int> releasedAction = whatNext[idxDone];
+                    std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]);
+                    std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]);
+                    mpiRequests.pop_back();
+                    whatNext.pop_back();
+
+                    //////////////////////////////////////////////////////////////////////
+                    /// Data to exchange particles
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == RECV_PARTICLES){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+
+                        if(descriptor.isLower){
+                            //const int idxLower = descriptor.idxLowerUpper;
+                            const int destProc = descriptor.destProc;
+                            //const int nbPartitionsToRecv = descriptor.nbPartitionsToRecv;
+                            const int NbParticlesToReceive = descriptor.nbParticlesToRecv;
+                            assert(NbParticlesToReceive != -1);
+                            assert(descriptor.toCompute == nullptr);
+                            if(NbParticlesToReceive){
+                                descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]);
+                                whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second});
+                                mpiRequests.emplace_back();
+                                AssertMpi(MPI_Irecv(descriptor.toCompute.get(), NbParticlesToReceive*size_particle_positions, particles_utils::GetMpiType(real_number()), destProc, TAG_UP_LOW_PARTICLES,
+                                          current_com, &mpiRequests.back()));
+                            }
+                        }
+                        else{
+                            //const int idxUpper = descriptor.idxLowerUpper;
+                            const int destProc = descriptor.destProc;
+                            //const int nbPartitionsToRecv = descriptor.nbPartitionsToRecv;
+                            const int NbParticlesToReceive = descriptor.nbParticlesToRecv;
+                            assert(NbParticlesToReceive != -1);
+                            assert(descriptor.toCompute == nullptr);
+                            if(NbParticlesToReceive){
+                                descriptor.toCompute.reset(new real_number[NbParticlesToReceive*size_particle_positions]);
+                                whatNext.emplace_back(std::pair<Action,int>{COMPUTE_PARTICLES, releasedAction.second});
+                                mpiRequests.emplace_back();
+                                AssertMpi(MPI_Irecv(descriptor.toCompute.get(), NbParticlesToReceive*size_particle_positions, particles_utils::GetMpiType(real_number()), destProc, TAG_LOW_UP_PARTICLES,
+                                          current_com, &mpiRequests.back()));
+                            }
+                        }
+                    }
+
+                    //////////////////////////////////////////////////////////////////////
+                    /// Computation
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == COMPUTE_PARTICLES){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+                        const int NbParticlesToReceive = descriptor.nbParticlesToRecv;
+
+                        assert(descriptor.toCompute != nullptr);
+                        descriptor.results.reset(new real_number[NbParticlesToReceive*size_particle_rhs]);
+                        init_result_array(descriptor.results.get(), NbParticlesToReceive);
+
+                        if(more_than_one_thread == false){
+                            apply_computation(descriptor.toCompute.get(), descriptor.results.get(), NbParticlesToReceive);
+                        }
+                        else{
+                            TIMEZONE_OMP_INIT_PRETASK(timeZoneTaskKey)
+                            NeighborDescriptor* ptr_descriptor = &descriptor;
+                            #pragma omp taskgroup
+                            {
+                                for(int idxPart = 0 ; idxPart < NbParticlesToReceive ; idxPart += 300){
+                                    const int sizeToDo = std::min(300, NbParticlesToReceive-idxPart);
+                                    #pragma omp task default(shared) firstprivate(ptr_descriptor, idxPart, sizeToDo) priority(10) \
+                                             TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                                    {
+                                        TIMEZONE_OMP_TASK("apply_computation", timeZoneTaskKey);
+                                        apply_computation(&ptr_descriptor->toCompute[idxPart*size_particle_positions],
+                                                &ptr_descriptor->results[idxPart*size_particle_rhs], sizeToDo);
+                                    }
+                                }
+                            }
+                        }
+
+                        const int destProc = descriptor.destProc;
+                        whatNext.emplace_back(std::pair<Action,int>{RELEASE_BUFFER_PARTICLES, releasedAction.second});
+                        mpiRequests.emplace_back();
+                        const int tag = descriptor.isLower? TAG_LOW_UP_RESULTS : TAG_UP_LOW_RESULTS;
+                        AssertMpi(MPI_Isend(descriptor.results.get(), NbParticlesToReceive*size_particle_rhs, particles_utils::GetMpiType(real_number()), destProc, tag,
+                                  current_com, &mpiRequests.back()));
+                    }
+                    //////////////////////////////////////////////////////////////////////
+                    /// Release the compute buffer once the results have been sent
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == RELEASE_BUFFER_PARTICLES){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+                        assert(descriptor.toCompute != nullptr);
+                        descriptor.toCompute.reset(); // reset() frees the buffer; release() would leak it
+                    }
+                    //////////////////////////////////////////////////////////////////////
+                    /// Merge
+                    //////////////////////////////////////////////////////////////////////
+                    if(releasedAction.first == MERGE_PARTICLES && more_than_one_thread == false){
+                        NeighborDescriptor& descriptor = neigDescriptors[releasedAction.second];
+
+                        if(descriptor.isLower){
+                            TIMEZONE("reduce");
+                            assert(descriptor.toRecvAndMerge != nullptr);
+                            reduce_particles_rhs(&particles_current_rhs[0], descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                            descriptor.toRecvAndMerge.reset();
+                        }
+                        else {
+                            TIMEZONE("reduce");
+                            assert(descriptor.toRecvAndMerge != nullptr);
+                            reduce_particles_rhs(&particles_current_rhs[(current_offset_particles_for_partition[current_partition_size]-descriptor.nbParticlesToSend)*size_particle_rhs],
+                                             descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                            descriptor.toRecvAndMerge.reset();
+                        }
+                    }
+                }
+            }
+            if(more_than_one_thread && omp_get_thread_num() == 1){
+                TIMEZONE_OMP_INIT_PRETASK(timeZoneTaskKey)
+                #pragma omp taskgroup
+                {
+                    // Work through the local particles of all partitions in fixed-size chunks
+                    for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+                        for(int idxPart = current_offset_particles_for_partition[idxPartition] ;
+                            idxPart < current_offset_particles_for_partition[idxPartition+1] ; idxPart += 300){
+
+                            const int sizeToDo = std::min(300, current_offset_particles_for_partition[idxPartition+1]-idxPart);
+
+                            // Low priority to help master thread when possible
+                            #pragma omp task default(shared) firstprivate(idxPart, sizeToDo) priority(0) TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                            {
+                                TIMEZONE_OMP_TASK("apply_computation", timeZoneTaskKey);
+                                apply_computation(&particles_positions[idxPart*size_particle_positions],
+                                                  &particles_current_rhs[idxPart*size_particle_rhs],
+                                                  sizeToDo);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        if(more_than_one_thread == true){
+            for(int idxDescr = 0 ; idxDescr < int(neigDescriptors.size()) ; ++idxDescr){
+                NeighborDescriptor& descriptor = neigDescriptors[idxDescr];
+                if(descriptor.nbParticlesToSend){
+                    if(descriptor.isLower){
+                        TIMEZONE("reduce_later");
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        reduce_particles_rhs(&particles_current_rhs[0], descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                        descriptor.toRecvAndMerge.reset();
+                    }
+                    else {
+                        TIMEZONE("reduce_later");
+                        assert(descriptor.toRecvAndMerge != nullptr);
+                        reduce_particles_rhs(&particles_current_rhs[(current_offset_particles_for_partition[current_partition_size]-descriptor.nbParticlesToSend)*size_particle_rhs],
+                                         descriptor.toRecvAndMerge.get(), descriptor.nbParticlesToSend);
+                        descriptor.toRecvAndMerge.reset();
+                    }
+                }
+            }
+        }
+
+        // Do my own computation if not threaded
+        if(more_than_one_thread == false){
+            TIMEZONE("compute-my_compute");
+            // Compute my particles
+            if(myTotalNbParticles){
+                apply_computation(particles_positions, particles_current_rhs, myTotalNbParticles);
+            }
+        }
+
+        assert(whatNext.size() == 0);
+        assert(mpiRequests.size() == 0);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    virtual void init_result_array(real_number particles_current_rhs[],
+                                   const int nb_particles) const = 0;
+    virtual void apply_computation(const real_number particles_positions[],
+                                   real_number particles_current_rhs[],
+                                   const int nb_particles) const = 0;
+    virtual void reduce_particles_rhs(real_number particles_current_rhs[],
+                                  const real_number extra_particles_current_rhs[],
+                                  const int nb_particles) const = 0;
+
+    ////////////////////////////////////////////////////////////////////////////
+
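+    // Overview: move the particles that left the local z-interval to the
+    // lower/upper neighbor. Out-of-interval particles are partitioned to the
+    // ends of the local arrays, counts and payloads (positions, indexes, rhs
+    // history) are exchanged, periodic boundary conditions are applied to the
+    // new arrivals, and the local arrays are reallocated, merged, and
+    // re-partitioned in z.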
+    void redistribute(int current_my_nb_particles_per_partition[],
+                      int* nb_particles,
+                      std::unique_ptr<real_number[]>* inout_positions_particles,
+                      std::unique_ptr<real_number[]> inout_rhs_particles[], const int in_nb_rhs,
+                      std::unique_ptr<int[]>* inout_index_particles,
+                      const real_number mySpatialLowLimit,
+                      const real_number mySpatialUpLimit,
+                      const real_number spatialPartitionWidth){
+        TIMEZONE("redistribute");
+
+        // Trailing processes might not be involved
+        if(nb_processes_involved <= my_rank){
+            return;
+        }
+
+        current_offset_particles_for_partition[0] = 0;
+        int myTotalNbParticles = 0;
+        for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+            myTotalNbParticles += current_my_nb_particles_per_partition[idxPartition];
+            current_offset_particles_for_partition[idxPartition+1] = current_offset_particles_for_partition[idxPartition] + current_my_nb_particles_per_partition[idxPartition];
+        }
+        assert((*nb_particles) == myTotalNbParticles);
+
+        // Find particles outside my interval
+        const int nbOutLower = particles_utils::partition_extra<size_particle_positions>(&(*inout_positions_particles)[0], current_my_nb_particles_per_partition[0],
+                    [&](const real_number val[]){
+            const bool isLower = val[IDX_Z] < mySpatialLowLimit;
+            return isLower;
+        },
+                    [&](const int idx1, const int idx2){
+            for(int idx_val = 0 ; idx_val < size_particle_index ; ++idx_val){
+                std::swap((*inout_index_particles)[idx1*size_particle_index + idx_val],
+                          (*inout_index_particles)[idx2*size_particle_index + idx_val]);
+            }
+
+            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){
+                    std::swap(inout_rhs_particles[idx_rhs][idx1*size_particle_rhs + idx_val],
+                              inout_rhs_particles[idx_rhs][idx2*size_particle_rhs + idx_val]);
+                }
+            }
+        });
+        const int offsetOutLow = (current_partition_size==1? nbOutLower : 0);
+
+        const int nbOutUpper = current_my_nb_particles_per_partition[current_partition_size-1] - offsetOutLow - particles_utils::partition_extra<size_particle_positions>(
+                    &(*inout_positions_particles)[(current_offset_particles_for_partition[current_partition_size-1]+offsetOutLow)*size_particle_positions],
+                    myTotalNbParticles - (current_offset_particles_for_partition[current_partition_size-1]+offsetOutLow),
+                    [&](const real_number val[]){
+            const bool isUpper = mySpatialUpLimit <= val[IDX_Z];
+            return !isUpper;
+        },
+                    [&](const int idx1, const int idx2){
+            for(int idx_val = 0 ; idx_val < size_particle_index ; ++idx_val){
+                std::swap((*inout_index_particles)[idx1*size_particle_index + idx_val],
+                          (*inout_index_particles)[idx2*size_particle_index + idx_val]);
+            }
+
+            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){
+                    std::swap(inout_rhs_particles[idx_rhs][idx1*size_particle_rhs + idx_val],
+                              inout_rhs_particles[idx_rhs][idx2*size_particle_rhs + idx_val]);
+                }
+            }
+        }, (current_offset_particles_for_partition[current_partition_size-1]+offsetOutLow));
+
+        // Exchange number
+        int eventsBeforeWaitall = 0;
+        int nbNewFromLow = 0;
+        int nbNewFromUp = 0;
+        std::unique_ptr<real_number[]> newParticlesLow;
+        std::unique_ptr<real_number[]> newParticlesUp;
+        std::unique_ptr<int[]> newParticlesLowIndexes;
+        std::unique_ptr<int[]> newParticlesUpIndexes;
+        std::vector<std::unique_ptr<real_number[]>> newParticlesLowRhs(in_nb_rhs);
+        std::vector<std::unique_ptr<real_number[]>> newParticlesUpRhs(in_nb_rhs);
+
+        const bool more_than_one_thread = (omp_get_max_threads() > 1);
+
+        TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads())
+        #pragma omp parallel default(shared)
+        {
+            #pragma omp master
+            {
+                assert(whatNext.size() == 0);
+                assert(mpiRequests.size() == 0);
+
+                whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_NB_LOW, -1});
+                mpiRequests.emplace_back();
+                // the neighbor ranks are defined on current_com, so all the
+                // exchanges below are posted on that communicator
+                AssertMpi(MPI_Irecv(&nbNewFromLow, 1, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+                eventsBeforeWaitall += 1;
+
+                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Isend(const_cast<int*>(&nbOutLower), 1, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+
+                if(nbOutLower){
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Isend(&(*inout_positions_particles)[0], nbOutLower*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES,
+                              current_com, &mpiRequests.back()));
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Isend(&(*inout_index_particles)[0], nbOutLower, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_INDEXES,
+                              current_com, &mpiRequests.back()));
+
+                    for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                        whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                        mpiRequests.emplace_back();
+                        AssertMpi(MPI_Isend(&inout_rhs_particles[idx_rhs][0], nbOutLower*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_RHS+idx_rhs,
+                                  current_com, &mpiRequests.back()));
+                    }
+                }
+
+                whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_NB_UP, -1});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Irecv(&nbNewFromUp, 1, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+                eventsBeforeWaitall += 1;
+
+                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                mpiRequests.emplace_back();
+                AssertMpi(MPI_Isend(const_cast<int*>(&nbOutUpper), 1, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_NB_PARTICLES,
+                          current_com, &mpiRequests.back()));
+
+                if(nbOutUpper){
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Isend(&(*inout_positions_particles)[(myTotalNbParticles-nbOutUpper)*size_particle_positions], nbOutUpper*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES,
+                              current_com, &mpiRequests.back()));
+                    whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                    mpiRequests.emplace_back();
+                    AssertMpi(MPI_Isend(&(*inout_index_particles)[(myTotalNbParticles-nbOutUpper)], nbOutUpper, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_INDEXES,
+                              current_com, &mpiRequests.back()));
+
+
+                    for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                        whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                        mpiRequests.emplace_back();
+                        AssertMpi(MPI_Isend(&inout_rhs_particles[idx_rhs][(myTotalNbParticles-nbOutUpper)*size_particle_rhs], nbOutUpper*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_RHS+idx_rhs,
+                                  current_com, &mpiRequests.back()));
+                    }
+                }
+
+                while(mpiRequests.size() && eventsBeforeWaitall){
+                    int idxDone = mpiRequests.size();
+                    {
+                        TIMEZONE("waitany_move");
+                        AssertMpi(MPI_Waitany(int(mpiRequests.size()), mpiRequests.data(), &idxDone, MPI_STATUS_IGNORE));
+                    }
+                    const std::pair<Action, int> releasedAction = whatNext[idxDone];
+                    std::swap(mpiRequests[idxDone], mpiRequests[mpiRequests.size()-1]);
+                    std::swap(whatNext[idxDone], whatNext[mpiRequests.size()-1]);
+                    mpiRequests.pop_back();
+                    whatNext.pop_back();
+
+                    if(releasedAction.first == RECV_MOVE_NB_LOW){
+                        if(nbNewFromLow){
+                            assert(newParticlesLow == nullptr);
+                            newParticlesLow.reset(new real_number[nbNewFromLow*size_particle_positions]);
+                            whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_LOW, -1});
+                            mpiRequests.emplace_back();
+                            AssertMpi(MPI_Irecv(&newParticlesLow[0], nbNewFromLow*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES,
+                                      current_com, &mpiRequests.back()));
+
+                            newParticlesLowIndexes.reset(new int[nbNewFromLow]);
+                            whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                            mpiRequests.emplace_back();
+                            AssertMpi(MPI_Irecv(&newParticlesLowIndexes[0], nbNewFromLow, MPI_INT, (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_INDEXES,
+                                      current_com, &mpiRequests.back()));
+
+                            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                                newParticlesLowRhs[idx_rhs].reset(new real_number[nbNewFromLow*size_particle_rhs]);
+                                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                                mpiRequests.emplace_back();
+                                AssertMpi(MPI_Irecv(&newParticlesLowRhs[idx_rhs][0], nbNewFromLow*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank-1+nb_processes_involved)%nb_processes_involved, TAG_UP_LOW_MOVED_PARTICLES_RHS+idx_rhs,
+                                          current_com, &mpiRequests.back()));
+                            }
+                        }
+                        eventsBeforeWaitall -= 1;
+                    }
+                    else if(releasedAction.first == RECV_MOVE_NB_UP){
+                        if(nbNewFromUp){
+                            assert(newParticlesUp == nullptr);
+                            newParticlesUp.reset(new real_number[nbNewFromUp*size_particle_positions]);
+                            whatNext.emplace_back(std::pair<Action,int>{RECV_MOVE_UP, -1});
+                            mpiRequests.emplace_back();
+                            AssertMpi(MPI_Irecv(&newParticlesUp[0], nbNewFromUp*size_particle_positions, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES,
+                                      current_com, &mpiRequests.back()));
+
+                            newParticlesUpIndexes.reset(new int[nbNewFromUp]);
+                            whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                            mpiRequests.emplace_back();
+                            AssertMpi(MPI_Irecv(&newParticlesUpIndexes[0], nbNewFromUp, MPI_INT, (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_INDEXES,
+                                      current_com, &mpiRequests.back()));
+
+                            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                                newParticlesUpRhs[idx_rhs].reset(new real_number[nbNewFromUp*size_particle_rhs]);
+                                whatNext.emplace_back(std::pair<Action,int>{NOTHING_TODO, -1});
+                                mpiRequests.emplace_back();
+                                AssertMpi(MPI_Irecv(&newParticlesUpRhs[idx_rhs][0], nbNewFromUp*size_particle_rhs, particles_utils::GetMpiType(real_number()), (my_rank+1)%nb_processes_involved, TAG_LOW_UP_MOVED_PARTICLES_RHS+idx_rhs,
+                                          current_com, &mpiRequests.back()));
+                            }
+                        }
+                        eventsBeforeWaitall -= 1;
+                    }
+                }
+
+                if(mpiRequests.size()){
+                    // TODO Proceed when received
+                    TIMEZONE("waitall-move");
+                    AssertMpi(MPI_Waitall(mpiRequests.size(), mpiRequests.data(), MPI_STATUSES_IGNORE));
+                    mpiRequests.clear();
+                    whatNext.clear();
+                }
+
+                // If we use threads, spawn tasks to process the received data
+                if(more_than_one_thread == true){
+                    TIMEZONE_OMP_INIT_PRETASK(timeZoneTaskKey)
+                    #pragma omp taskgroup
+                    {
+                        if(nbNewFromLow){
+                            assert(newParticlesLow.get() != nullptr);
+                            #pragma omp task TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                            {
+                                TIMEZONE_OMP_TASK("task-pbc", timeZoneTaskKey);
+                                apply_pbc_z_new_particles(newParticlesLow.get(), nbNewFromLow);
+                                apply_pbc_xy(newParticlesLow.get(), nbNewFromLow);
+                            }
+                        }
+                        if(nbNewFromUp){
+                            assert(newParticlesUp.get() != nullptr);
+                            #pragma omp task TIMEZONE_OMP_PRAGMA_TASK_KEY(timeZoneTaskKey)
+                            {
+                               TIMEZONE_OMP_TASK("task-pbc", timeZoneTaskKey);
+                               apply_pbc_z_new_particles(newParticlesUp.get(), nbNewFromUp);
+                               apply_pbc_xy(newParticlesUp.get(), nbNewFromUp);
+                            }
+                        }
+                    }
+                }
+            }
+            // If we use threads, the non-master threads process the local data (nothing to send/recv there)
+            if(more_than_one_thread == true && omp_get_thread_num() > 0){
+                TIMEZONE("apply_pbc_xy");
+                const int nbOldParticles = myTotalNbParticles - nbOutLower - nbOutUpper;
+                particles_utils::IntervalSplitter<int> interval(nbOldParticles,
+                                                                omp_get_num_threads()-1,
+                                                                omp_get_thread_num()-1);
+
+                apply_pbc_xy(&(*inout_positions_particles)[(nbOutLower+interval.getMyOffset())*size_particle_positions], interval.getMySize());
+            }
+        }
+
+        // If we do not use threads, process all data sequentially
+        if(more_than_one_thread == false){
+            TIMEZONE("apply_pbc_z_new_particles");
+            if(nbNewFromLow){
+                assert(newParticlesLow.get() != nullptr);
+                apply_pbc_z_new_particles(newParticlesLow.get(), nbNewFromLow);
+                apply_pbc_xy(newParticlesLow.get(), nbNewFromLow);
+            }
+            if(nbNewFromUp){
+                assert(newParticlesUp.get() != nullptr);
+                apply_pbc_z_new_particles(newParticlesUp.get(), nbNewFromUp);
+                apply_pbc_xy(newParticlesUp.get(), nbNewFromUp);
+            }
+
+            apply_pbc_xy(&(*inout_positions_particles)[nbOutLower*size_particle_positions], myTotalNbParticles - nbOutLower - nbOutUpper);
+        }
+
+        // Realloc and merge
+        {
+            TIMEZONE("realloc_copy");
+            const int nbOldParticlesInside = myTotalNbParticles - nbOutLower - nbOutUpper;
+            const int myTotalNewNbParticles = nbOldParticlesInside + nbNewFromLow + nbNewFromUp;
+
+            std::unique_ptr<real_number[]> newArray(new real_number[myTotalNewNbParticles*size_particle_positions]);
+            std::unique_ptr<int[]> newArrayIndexes(new int[myTotalNewNbParticles]);
+            std::vector<std::unique_ptr<real_number[]>> newArrayRhs(in_nb_rhs);
+            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                newArrayRhs[idx_rhs].reset(new real_number[myTotalNewNbParticles*size_particle_rhs]);
+            }
+
+            // Copy the new particles received from the lower neighbor first
+            if(nbNewFromLow){
+                const particles_utils::fixed_copy fcp(0, 0, nbNewFromLow);
+                fcp.copy(newArray, newParticlesLow, size_particle_positions);
+                fcp.copy(newArrayIndexes, newParticlesLowIndexes);
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    fcp.copy(newArrayRhs[idx_rhs], newParticlesLowRhs[idx_rhs], size_particle_rhs);
+                }
+            }
+
+            // Copy my own particles
+            {
+                const particles_utils::fixed_copy fcp(nbNewFromLow, nbOutLower, nbOldParticlesInside);
+                fcp.copy(newArray, (*inout_positions_particles), size_particle_positions);
+                fcp.copy(newArrayIndexes, (*inout_index_particles));
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    fcp.copy(newArrayRhs[idx_rhs], inout_rhs_particles[idx_rhs], size_particle_rhs);
+                }
+            }
+
+            // Copy the new particles received from the upper neighbor at the back
+            if(nbNewFromUp){
+                const particles_utils::fixed_copy fcp(nbNewFromLow+nbOldParticlesInside, 0, nbNewFromUp);
+                fcp.copy(newArray, newParticlesUp, size_particle_positions);
+                fcp.copy(newArrayIndexes, newParticlesUpIndexes);
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    fcp.copy(newArrayRhs[idx_rhs], newParticlesUpRhs[idx_rhs], size_particle_rhs);
+                }
+            }
+
+            (*inout_positions_particles) = std::move(newArray);
+            (*inout_index_particles) = std::move(newArrayIndexes);
+            for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                inout_rhs_particles[idx_rhs] = std::move(newArrayRhs[idx_rhs]);
+            }
+
+            myTotalNbParticles = myTotalNewNbParticles;
+        }
+
+        // Partition all particles
+        {
+            TIMEZONE("repartition");
+            particles_utils::partition_extra_z<size_particle_positions>(&(*inout_positions_particles)[0],
+                                             myTotalNbParticles,current_partition_size,
+                                             current_my_nb_particles_per_partition, current_offset_particles_for_partition.get(),
+                                             [&](const int idxPartition){
+                return (idxPartition+1)*spatialPartitionWidth + mySpatialLowLimit;
+            },
+            [&](const int idx1, const int idx2){
+                for(int idx_val = 0 ; idx_val < size_particle_index ; ++idx_val){
+                    std::swap((*inout_index_particles)[idx1], (*inout_index_particles)[idx2]);
+                }
+
+                for(int idx_rhs = 0 ; idx_rhs < in_nb_rhs ; ++idx_rhs){
+                    for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){
+                        std::swap(inout_rhs_particles[idx_rhs][idx1*size_particle_rhs + idx_val],
+                                  inout_rhs_particles[idx_rhs][idx2*size_particle_rhs + idx_val]);
+                    }
+                }
+            });
+
+            {// TODO remove: sanity check of the z-partitioning
+                for(int idxPartition = 0 ; idxPartition < current_partition_size ; ++idxPartition){
+                    assert(current_my_nb_particles_per_partition[idxPartition] ==
+                           current_offset_particles_for_partition[idxPartition+1] - current_offset_particles_for_partition[idxPartition]);
+                    const real_number limitPartition = (idxPartition+1)*spatialPartitionWidth + mySpatialLowLimit;
+                    for(int idx = 0 ; idx < current_offset_particles_for_partition[idxPartition+1] ; ++idx){
+                        assert((*inout_positions_particles)[idx*size_particle_positions+IDX_Z] < limitPartition);
+                    }
+                    for(int idx = current_offset_particles_for_partition[idxPartition+1] ; idx < myTotalNbParticles ; ++idx){
+                        assert((*inout_positions_particles)[idx*size_particle_positions+IDX_Z] >= limitPartition);
+                    }
+                }
+            }
+            }
+        }
+        (*nb_particles) = myTotalNbParticles;
+
+        assert(mpiRequests.size() == 0);
+    }
+
+    virtual void apply_pbc_z_new_particles(real_number* newParticlesLow, const int nbNewFromLow) const = 0;
+    virtual void apply_pbc_xy(real_number* inout_positions_particles, const int nbNew) const = 0;
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    virtual void move_particles(real_number particles_positions[],
+              const int nb_particles,
+              const std::unique_ptr<real_number[]> particles_current_rhs[],
+              const int nb_rhs, const real_number dt) const = 0;
+};
+
+#endif
diff --git a/bfps/cpp/particles/abstract_particles_input.hpp b/bfps/cpp/particles/abstract_particles_input.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb295c40717085e58126530e8403c3ff5b71a014
--- /dev/null
+++ b/bfps/cpp/particles/abstract_particles_input.hpp
@@ -0,0 +1,21 @@
+#ifndef ABSTRACT_PARTICLES_INPUT_HPP
+#define ABSTRACT_PARTICLES_INPUT_HPP
+
+#include <memory>
+#include <vector>
+
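+// Interface for providers of the initial particle state: total/local particle
+// counts, positions, global indexes, and any stored rhs history.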
+template <class real_number>
+class abstract_particles_input {
+public:
+    virtual ~abstract_particles_input(){}
+
+    virtual int getTotalNbParticles() = 0;
+    virtual int getLocalNbParticles() = 0;
+    virtual int getNbRhs() = 0;
+
+    virtual std::unique_ptr<real_number[]> getMyParticles() = 0;
+    virtual std::unique_ptr<int[]> getMyParticlesIndexes() = 0;
+    virtual std::vector<std::unique_ptr<real_number[]>> getMyRhs() = 0;
+};
+
+
+#endif
diff --git a/bfps/cpp/particles/abstract_particles_output.hpp b/bfps/cpp/particles/abstract_particles_output.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..955f1e6fd07f98421837bd9bf359026ea9535b74
--- /dev/null
+++ b/bfps/cpp/particles/abstract_particles_output.hpp
@@ -0,0 +1,192 @@
+#ifndef ABSTRACT_PARTICLES_OUTPUT_HPP
+#define ABSTRACT_PARTICLES_OUTPUT_HPP
+
+#include <memory>
+#include <vector>
+#include <cassert>
+#include <algorithm>
+#include <cstddef>
+
+#include "base.hpp"
+#include "particles_utils.hpp"
+#include "alltoall_exchanger.hpp"
+#include "scope_timer.hpp"
+
+
+template <class real_number, int size_particle_positions, int size_particle_rhs>
+class abstract_particles_output {
+    MPI_Comm mpi_com;
+
+    int my_rank;
+    int nb_processes;
+
+    const int total_nb_particles;
+    const int nb_rhs;
+
+    std::unique_ptr<std::pair<int,int>[]> buffer_indexes_send;
+    std::unique_ptr<real_number[]> buffer_particles_positions_send;
+    std::vector<std::unique_ptr<real_number[]>> buffer_particles_rhs_send;
+    int size_buffers_send;
+
+    std::unique_ptr<real_number[]> buffer_particles_positions_recv;
+    std::vector<std::unique_ptr<real_number[]>> buffer_particles_rhs_recv;
+    std::unique_ptr<int[]> buffer_indexes_recv;
+    int size_buffers_recv;
+
+
+protected:
+    MPI_Comm& getCom(){
+        return mpi_com;
+    }
+
+    int getTotalNbParticles() const {
+        return total_nb_particles;
+    }
+
+    int getNbRhs() const {
+        return nb_rhs;
+    }
+
+public:
+    abstract_particles_output(MPI_Comm in_mpi_com, const int inTotalNbParticles, const int in_nb_rhs)
+            : mpi_com(in_mpi_com), my_rank(-1), nb_processes(-1),
+                total_nb_particles(inTotalNbParticles), nb_rhs(in_nb_rhs),
+                buffer_particles_rhs_send(in_nb_rhs), size_buffers_send(-1),
+                buffer_particles_rhs_recv(in_nb_rhs), size_buffers_recv(-1){
+
+        AssertMpi(MPI_Comm_rank(mpi_com, &my_rank));
+        AssertMpi(MPI_Comm_size(mpi_com, &nb_processes));
+    }
+
+    virtual ~abstract_particles_output(){
+    }
+
+    void releaseMemory(){
+        // reset() frees the buffers; release() would only drop ownership and leak them
+        buffer_indexes_send.reset();
+        buffer_particles_positions_send.reset();
+        size_buffers_send = -1;
+        buffer_indexes_recv.reset();
+        buffer_particles_positions_recv.reset();
+        size_buffers_recv = -1;
+        for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+            buffer_particles_rhs_send[idx_rhs].reset();
+            buffer_particles_rhs_recv[idx_rhs].reset();
+        }
+    }
+
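+    // Overview: route each particle to the rank that owns its slice of the
+    // global index range before writing. Local particles are sorted by global
+    // index, redistributed with an all-to-all exchange, reordered into their
+    // position within the owner's slice, and passed to the virtual write().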
+    void save(const real_number input_particles_positions[], const std::unique_ptr<real_number[]> input_particles_rhs[],
+              const int index_particles[], const int nb_particles, const int idx_time_step){
+        TIMEZONE("abstract_particles_output::save");
+        assert(total_nb_particles != -1);
+
+        {
+            TIMEZONE("sort-to-distribute");
+
+            if(size_buffers_send < nb_particles && nb_particles){
+                buffer_indexes_send.reset(new std::pair<int,int>[nb_particles]);
+                buffer_particles_positions_send.reset(new real_number[nb_particles*size_particle_positions]);
+                for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                    buffer_particles_rhs_send[idx_rhs].reset(new real_number[nb_particles*size_particle_rhs]);
+                }
+                size_buffers_send = nb_particles;
+            }
+
+            for(int idx_part = 0 ; idx_part < nb_particles ; ++idx_part){
+                buffer_indexes_send[idx_part].first = idx_part;
+                buffer_indexes_send[idx_part].second = index_particles[idx_part];
+            }
+
+            std::sort(&buffer_indexes_send[0], &buffer_indexes_send[nb_particles], [](const std::pair<int,int>& p1, const std::pair<int,int>& p2){
+                return p1.second < p2.second;
+            });
+
+            for(int idx_part = 0 ; idx_part < nb_particles ; ++idx_part){
+                const int src_idx = buffer_indexes_send[idx_part].first;
+                const int dst_idx = idx_part;
+
+                for(int idx_val = 0 ; idx_val < size_particle_positions ; ++idx_val){
+                    buffer_particles_positions_send[dst_idx*size_particle_positions + idx_val]
+                            = input_particles_positions[src_idx*size_particle_positions + idx_val];
+                }
+                for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                    for(int idx_val = 0 ; idx_val < int(size_particle_rhs) ; ++idx_val){
+                        buffer_particles_rhs_send[idx_rhs][dst_idx*size_particle_rhs + idx_val]
+                                = input_particles_rhs[idx_rhs][src_idx*size_particle_rhs + idx_val];
+                    }
+                }
+            }
+        }
+
+        const particles_utils::IntervalSplitter<int> particles_splitter(total_nb_particles, nb_processes, my_rank);
+
+        int* buffer_indexes_send_tmp = reinterpret_cast<int*>(buffer_indexes_send.get());// trick: re-use the buffer_indexes_send memory to store the plain int indexes
+        std::vector<int> nb_particles_to_send(nb_processes, 0);
+        for(int idx_part = 0 ; idx_part < nb_particles ; ++idx_part){
+            nb_particles_to_send[particles_splitter.getOwner(buffer_indexes_send[idx_part].second)] += 1;
+            buffer_indexes_send_tmp[idx_part] = buffer_indexes_send[idx_part].second;
+        }
+
+        alltoall_exchanger exchanger(mpi_com, std::move(nb_particles_to_send));
+        // nb_particles_to_send is invalid after here
+
+        const int nb_to_receive = exchanger.getTotalToRecv();
+        assert(nb_to_receive == particles_splitter.getMySize());
+
+        if(size_buffers_recv < nb_to_receive && nb_to_receive){
+            buffer_indexes_recv.reset(new int[nb_to_receive]);
+            buffer_particles_positions_recv.reset(new real_number[nb_to_receive*size_particle_positions]);
+            for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                buffer_particles_rhs_recv[idx_rhs].reset(new real_number[nb_to_receive*size_particle_rhs]);
+            }
+            size_buffers_recv = nb_to_receive;
+        }
+
+        {
+            TIMEZONE("exchange");
+            // Could be done with multiple asynchronous comms
+            exchanger.alltoallv<int>(buffer_indexes_send_tmp, buffer_indexes_recv.get());
+            exchanger.alltoallv<real_number>(buffer_particles_positions_send.get(), buffer_particles_positions_recv.get(), size_particle_positions);
+            for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                exchanger.alltoallv<real_number>(buffer_particles_rhs_send[idx_rhs].get(), buffer_particles_rhs_recv[idx_rhs].get(), size_particle_rhs);
+            }
+        }
+
+        if(size_buffers_send < nb_to_receive && nb_to_receive){
+            buffer_indexes_send.reset(new std::pair<int,int>[nb_to_receive]);
+            buffer_particles_positions_send.reset(new real_number[nb_to_receive*size_particle_positions]);
+            for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                buffer_particles_rhs_send[idx_rhs].reset(new real_number[nb_to_receive*size_particle_rhs]);
+            }
+            size_buffers_send = nb_to_receive;
+        }
+
+        {
+            TIMEZONE("copy-local-order");
+            for(int idx_part = 0 ; idx_part < nb_to_receive ; ++idx_part){
+                const int src_idx = idx_part;
+                const int dst_idx = buffer_indexes_recv[idx_part]-particles_splitter.getMyOffset();
+                assert(0 <= dst_idx);
+                assert(dst_idx < particles_splitter.getMySize());
+
+                for(int idx_val = 0 ; idx_val < size_particle_positions ; ++idx_val){
+                    buffer_particles_positions_send[dst_idx*size_particle_positions + idx_val]
+                            = buffer_particles_positions_recv[src_idx*size_particle_positions + idx_val];
+                }
+                for(int idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                    for(int idx_val = 0 ; idx_val < int(size_particle_rhs) ; ++idx_val){
+                        buffer_particles_rhs_send[idx_rhs][dst_idx*size_particle_rhs + idx_val]
+                                = buffer_particles_rhs_recv[idx_rhs][src_idx*size_particle_rhs + idx_val];
+                    }
+                }
+            }
+        }
+
+        write(idx_time_step, buffer_particles_positions_send.get(), buffer_particles_rhs_send.data(),
+              nb_to_receive, particles_splitter.getMyOffset());
+    }
+
+    virtual void write(const int idx_time_step, const real_number* positions, const std::unique_ptr<real_number[]>* rhs,
+                       const int nb_particles, const int particles_idx_offset) = 0;
+};
+
+#endif
diff --git a/bfps/cpp/particles/abstract_particles_system.hpp b/bfps/cpp/particles/abstract_particles_system.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..32510404b4fa69596a53385b470aea0d4136b08b
--- /dev/null
+++ b/bfps/cpp/particles/abstract_particles_system.hpp
@@ -0,0 +1,32 @@
+#ifndef ABSTRACT_PARTICLES_SYSTEM_HPP
+#define ABSTRACT_PARTICLES_SYSTEM_HPP
+
+#include <memory>
+
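+// Pure interface of a particle system as seen by the time stepper.
+// completeLoop(dt) presumably chains compute(), move(dt), redistribute(),
+// inc_step_idx() and shift_rhs_vectors() (an assumption based on the method
+// names; the concrete implementation defines the actual order).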
+template <class real_number>
+class abstract_particles_system {
+public:
+    virtual void compute() = 0;
+
+    virtual void move(const real_number dt) = 0;
+
+    virtual void redistribute() = 0;
+
+    virtual void inc_step_idx() = 0;
+
+    virtual void shift_rhs_vectors() = 0;
+
+    virtual void completeLoop(const real_number dt) = 0;
+
+    virtual const real_number* getParticlesPositions() const = 0;
+
+    virtual const std::unique_ptr<real_number[]>* getParticlesRhs() const = 0;
+
+    virtual const int* getParticlesIndexes() const = 0;
+
+    virtual int getLocalNbParticles() const = 0;
+
+    virtual int getNbRhs() const = 0;
+};
+
+#endif
diff --git a/bfps/cpp/particles/alltoall_exchanger.hpp b/bfps/cpp/particles/alltoall_exchanger.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3c592011c9772afad1ab68555b179754f868624e
--- /dev/null
+++ b/bfps/cpp/particles/alltoall_exchanger.hpp
@@ -0,0 +1,109 @@
+#ifndef ALLTOALL_EXCHANGER_HPP
+#define ALLTOALL_EXCHANGER_HPP
+
+#include <mpi.h>
+#include <cassert>
+
+#include "base.hpp"
+#include "particles_utils.hpp"
+#include "scope_timer.hpp"
+
+class alltoall_exchanger {
+    const MPI_Comm mpi_com;
+
+    int my_rank;
+    int nb_processes;
+
+    const std::vector<int> nb_items_to_send;
+
+    std::vector<int> offset_items_to_send;
+
+    std::vector<int> nb_items_to_sendrecv_all;
+    std::vector<int> nb_items_to_recv;
+    std::vector<int> offset_items_to_recv;
+
+    int total_to_recv;
+
+public:
+    alltoall_exchanger(const MPI_Comm& in_mpi_com, std::vector<int>/*passed by value so it can be moved from*/ in_nb_items_to_send)
+        :mpi_com(in_mpi_com), nb_items_to_send(std::move(in_nb_items_to_send)), total_to_recv(0){
+        TIMEZONE("alltoall_exchanger::constructor");
+
+        AssertMpi(MPI_Comm_rank(mpi_com, &my_rank));
+        AssertMpi(MPI_Comm_size(mpi_com, &nb_processes));
+
+        assert(int(nb_items_to_send.size()) == nb_processes);
+
+        offset_items_to_send.resize(nb_processes+1, 0);
+        for(int idx_proc = 0 ; idx_proc < nb_processes ; ++idx_proc){
+            offset_items_to_send[idx_proc+1] = offset_items_to_send[idx_proc]
+                                             + nb_items_to_send[idx_proc];
+        }
+
+        nb_items_to_sendrecv_all.resize(nb_processes*nb_processes);
+        AssertMpi(MPI_Allgather(const_cast<int*>(nb_items_to_send.data()), nb_processes, MPI_INT,
+                          nb_items_to_sendrecv_all.data(), nb_processes, MPI_INT,
+                          mpi_com));
+
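+        // nb_items_to_sendrecv_all holds an nb_processes x nb_processes matrix
+        // in row-major order: row r contains the send counts of rank r, so
+        // column my_rank (read below) is what every rank sends to us.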
+        nb_items_to_recv.resize(nb_processes, 0);
+        offset_items_to_recv.resize(nb_processes+1, 0);
+        for(int idx_proc = 0 ; idx_proc < nb_processes ; ++idx_proc){
+            const int nbrecv = nb_items_to_sendrecv_all[idx_proc*nb_processes + my_rank];
+            total_to_recv += nbrecv;
+            nb_items_to_recv[idx_proc] = nbrecv;
+            offset_items_to_recv[idx_proc+1] = nb_items_to_recv[idx_proc]
+                                                    + offset_items_to_recv[idx_proc];
+        }
+    }
+
+    int getTotalToRecv() const{
+        return total_to_recv;
+    }
+
+    template <class ItemType>
+    void alltoallv_dt(const ItemType in_to_send[],
+                   ItemType out_to_recv[], const MPI_Datatype& in_type) const {
+        TIMEZONE("alltoallv");
+        AssertMpi(MPI_Alltoallv(const_cast<ItemType*>(in_to_send), const_cast<int*>(nb_items_to_send.data()),
+                          const_cast<int*>(offset_items_to_send.data()), in_type, out_to_recv,
+                          const_cast<int*>(nb_items_to_recv.data()), const_cast<int*>(offset_items_to_recv.data()), in_type,
+                          mpi_com));
+    }
+
+    template <class ItemType>
+    void alltoallv(const ItemType in_to_send[],
+                   ItemType out_to_recv[]) const {
+        alltoallv_dt<ItemType>(in_to_send, out_to_recv, particles_utils::GetMpiType(ItemType()));
+    }
+
+    template <class ItemType>
+    void alltoallv_dt(const ItemType in_to_send[],
+                   ItemType out_to_recv[], const MPI_Datatype& in_type, const int in_nb_values_per_item) const {
+        TIMEZONE("alltoallv");
+        std::vector<int> nb_items_to_send_tmp = nb_items_to_send;
+        particles_utils::transform(nb_items_to_send_tmp.begin(), nb_items_to_send_tmp.end(), nb_items_to_send_tmp.begin(),
+                                   [&](const int val) -> int { return val * in_nb_values_per_item ;});
+        std::vector<int> offset_items_to_send_tmp = offset_items_to_send;
+        particles_utils::transform(offset_items_to_send_tmp.begin(), offset_items_to_send_tmp.end(), offset_items_to_send_tmp.begin(),
+                                   [&](const int val) -> int { return val * in_nb_values_per_item ;});
+        std::vector<int> nb_items_to_recv_tmp = nb_items_to_recv;
+        particles_utils::transform(nb_items_to_recv_tmp.begin(), nb_items_to_recv_tmp.end(), nb_items_to_recv_tmp.begin(),
+                                   [&](const int val) -> int { return val * in_nb_values_per_item ;});
+        std::vector<int> offset_items_to_recv_tmp = offset_items_to_recv;
+        particles_utils::transform(offset_items_to_recv_tmp.begin(), offset_items_to_recv_tmp.end(), offset_items_to_recv_tmp.begin(),
+                                   [&](const int val) -> int { return val * in_nb_values_per_item ;});
+
+        AssertMpi(MPI_Alltoallv(const_cast<ItemType*>(in_to_send), const_cast<int*>(nb_items_to_send_tmp.data()),
+                          const_cast<int*>(offset_items_to_send_tmp.data()), in_type, out_to_recv,
+                          const_cast<int*>(nb_items_to_recv_tmp.data()), const_cast<int*>(offset_items_to_recv_tmp.data()), in_type,
+                          mpi_com));
+    }
+
+    template <class ItemType>
+    void alltoallv(const ItemType in_to_send[],
+                   ItemType out_to_recv[], const int in_nb_values_per_item) const {
+        alltoallv_dt<ItemType>(in_to_send, out_to_recv, particles_utils::GetMpiType(ItemType()), in_nb_values_per_item);
+    }
+};
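+
+// Minimal usage sketch (illustration only; counts and buffers are hypothetical):
+// each rank states how many items it sends to every other rank, then exchanges
+// flat arrays of 3 values per item.
+//
+//     std::vector<int> counts(nb_processes, 0);
+//     counts[(my_rank + 1) % nb_processes] = nb_local_items;
+//     alltoall_exchanger exchanger(mpi_com, std::move(counts));
+//     std::vector<double> recv(exchanger.getTotalToRecv() * 3);
+//     exchanger.alltoallv<double>(send_values.data(), recv.data(), 3);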
+
+#endif
diff --git a/bfps/cpp/particles/field_accessor.hpp b/bfps/cpp/particles/field_accessor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb4fae763a6a65f7d17fb88d85b9d31a45d48a18
--- /dev/null
+++ b/bfps/cpp/particles/field_accessor.hpp
@@ -0,0 +1,54 @@
+#ifndef FIELD_ACCESSOR_HPP
+#define FIELD_ACCESSOR_HPP
+
+#include <algorithm>
+#include <array>
+
+#include "particles_utils.hpp"
+
+template <class real_number>
+class field_accessor {
+    static const int nb_dim = 3;
+
+    const real_number* field_data;
+    std::array<size_t,3> local_field_dims;
+    std::array<size_t,3> local_field_offset;
+    std::array<size_t,3> field_memory_dims;
+
+public:
+    field_accessor(const real_number* in_field_data, const std::array<size_t,3>& in_dims,
+                   const std::array<size_t,3>& in_local_field_offset,
+                   const std::array<size_t,3>& in_field_memory_dims)
+            : field_data(in_field_data), local_field_dims(in_dims),
+              local_field_offset(in_local_field_offset),
+              field_memory_dims(in_field_memory_dims){
+    }
+
+    ~field_accessor(){}
+
+    const real_number& getValue(const size_t in_index, const int in_dim) const {
+        assert(in_index < field_memory_dims[0]*field_memory_dims[1]*field_memory_dims[2]);
+        return field_data[in_index*nb_dim + in_dim];
+    }
+
+    size_t getIndexFromGlobalPosition(const size_t in_global_x, const size_t in_global_y, const size_t in_global_z) const {
+        return getIndexFromLocalPosition(in_global_x - local_field_offset[IDX_X],
+                                         in_global_y - local_field_offset[IDX_Y],
+                                         in_global_z - local_field_offset[IDX_Z]);
+    }
+
+    size_t getIndexFromLocalPosition(const size_t in_local_x, const size_t in_local_y, const size_t in_local_z) const {
+        assert(0 <= in_local_x && in_local_x < local_field_dims[IDX_X]);
+        assert(0 <= in_local_y && in_local_y < local_field_dims[IDX_Y]);
+        assert(0 <= in_local_z && in_local_z < local_field_dims[IDX_Z]);
+        static_assert(IDX_X == 2 && IDX_Y == 1 && IDX_Z == 0,
+                      "Dimension idx does not match, please ensure getIndexFromLocalPosition"
+                      "is correct before commenting this assert");
+        return (((in_local_z)*field_memory_dims[1] +
+                in_local_y)*(field_memory_dims[2]) +
+                in_local_x);
+    }
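+
+    // Example (illustration): with field_memory_dims = {8, 4, 6}, the local
+    // position (x=2, y=3, z=1) maps to ((1*4 + 3)*6 + 2) = 44, i.e. z is the
+    // slowest varying index and x the fastest.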
+};
+
+
+#endif
diff --git a/bfps/cpp/particles/particles_adams_bashforth.hpp b/bfps/cpp/particles/particles_adams_bashforth.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aaa81e03515c4dad9e7468d3435a3bee0adc8487
--- /dev/null
+++ b/bfps/cpp/particles/particles_adams_bashforth.hpp
@@ -0,0 +1,113 @@
+#ifndef PARTICLES_ADAMS_BASHFORTH_HPP
+#define PARTICLES_ADAMS_BASHFORTH_HPP
+
+#include <stdexcept>
+#include <omp.h>
+
+#include "scope_timer.hpp"
+#include "particles_utils.hpp"
+
+template <class real_number, int size_particle_positions = 3, int size_particle_rhs = 3>
+class particles_adams_bashforth {
+public:
+    static const int Max_steps = 6;
+
+    void move_particles(real_number particles_positions[],
+                       const int nb_particles,
+                       const std::unique_ptr<real_number[]> particles_rhs[],
+                       const int nb_rhs, const real_number dt) const{
+        TIMEZONE("particles_adams_bashforth::move_particles");
+
+        if(Max_steps < nb_rhs){
+            throw std::runtime_error("Error, in bfps particles_adams_bashforth.\n"
+                                     "Step in particles_adams_bashforth is too large,"
+                                     "you must add formulation up this number or limit the number of steps.");
+        }
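+
+        // Explicit Adams-Bashforth of order nb_rhs: the update is dt times a
+        // linear combination of the nb_rhs most recent rhs evaluations; in the
+        // comments below, [k] denotes the rhs computed k steps ago, and the
+        // coefficients follow the standard Adams-Bashforth tables.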
+
+        // Not needed: TIMEZONE_OMP_INIT_PREPARALLEL(omp_get_max_threads())
+        #pragma omp parallel default(shared)
+        {
+            particles_utils::IntervalSplitter<int> interval(nb_particles,
+                                                       omp_get_num_threads(),
+                                                       omp_get_thread_num());
+            const int last_idx = interval.getMyOffset()+interval.getMySize();
+
+            // TODO full unroll + blocking
+            switch (nb_rhs){
+            case 1:
+                for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){
+                    for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){
+                        // dt × [0]
+                        particles_positions[idx_part*size_particle_positions + idx_dim]
+                                += dt * particles_rhs[0][idx_part*size_particle_rhs + idx_dim];
+                    }
+                }
+                break;
+            case 2:
+                for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){
+                    for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){
+                        // dt × (3[0] - [1])/2
+                        particles_positions[idx_part*size_particle_positions + idx_dim]
+                                += dt * (3.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim]
+                                          - particles_rhs[1][idx_part*size_particle_rhs + idx_dim])/2.;
+                    }
+                }
+                break;
+            case 3:
+                for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){
+                    for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){
+                        // dt × (23[0] - 16[1] + 5[2])/12
+                        particles_positions[idx_part*size_particle_positions + idx_dim]
+                                += dt * (23.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim]
+                                       - 16.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim]
+                                       +  5.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim])/12.;
+                    }
+                }
+                break;
+            case 4:
+                for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){
+                    for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){
+                        // dt × (55[0] - 59[1] + 37[2] - 9[3])/24
+                        particles_positions[idx_part*size_particle_positions + idx_dim]
+                                += dt * (55.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim]
+                                       - 59.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim]
+                                       + 37.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim]
+                                       -  9.*particles_rhs[3][idx_part*size_particle_rhs + idx_dim])/24.;
+                    }
+                }
+                break;
+            case 5:
+                for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){
+                    for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){
+                        // dt × (1901[0] - 2774[1] + 2616[2] - 1274[3] + 251[4])/720
+                        particles_positions[idx_part*size_particle_positions + idx_dim]
+                                += dt * (1901.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim]
+                                       - 2774.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim]
+                                       + 2616.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim]
+                                       - 1274.*particles_rhs[3][idx_part*size_particle_rhs + idx_dim]
+                                       +  251.*particles_rhs[4][idx_part*size_particle_rhs + idx_dim])/720.;
+                    }
+                }
+                break;
+            case 6:
+                for(int idx_part = interval.getMyOffset() ; idx_part < last_idx ; ++idx_part){
+                    for(int idx_dim = 0 ; idx_dim < size_particle_positions ; ++idx_dim){
+                        // dt × (4277[0] - 7923[1] + 9982[2] - 7298[3] + 2877[4] - 475[5])/1440
+                        particles_positions[idx_part*size_particle_positions + idx_dim]
+                                += dt * (4277.*particles_rhs[0][idx_part*size_particle_rhs + idx_dim]
+                                       - 7923.*particles_rhs[1][idx_part*size_particle_rhs + idx_dim]
+                                       + 9982.*particles_rhs[2][idx_part*size_particle_rhs + idx_dim]
+                                       - 7298.*particles_rhs[3][idx_part*size_particle_rhs + idx_dim]
+                                       + 2877.*particles_rhs[4][idx_part*size_particle_rhs + idx_dim]
+                                       -  475.*particles_rhs[5][idx_part*size_particle_rhs + idx_dim])/1440.;
+                    }
+                }
+                break;
+            }
+        }
+    }
+};
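+
+// Minimal usage sketch (illustration only; the buffers are hypothetical): a
+// second order step, with rhs[0] the newest evaluation and rhs[1] the previous.
+//
+//     particles_adams_bashforth<double> stepper;
+//     stepper.move_particles(positions, nb_particles, rhs, 2, dt);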
+
+
+
+#endif
diff --git a/bfps/cpp/particles/particles_field_computer.hpp b/bfps/cpp/particles/particles_field_computer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..80f4745070953509efee5af587230d6941287d14
--- /dev/null
+++ b/bfps/cpp/particles/particles_field_computer.hpp
@@ -0,0 +1,234 @@
+#ifndef PARTICLES_FIELD_COMPUTER_HPP
+#define PARTICLES_FIELD_COMPUTER_HPP
+
+#include <array>
+#include <utility>
+
+#include "abstract_particles_distr.hpp"
+#include "scope_timer.hpp"
+#include "particles_utils.hpp"
+
+template <class real_number, class interpolator_class, class field_class, int interp_neighbours, class positions_updater_class >
+class particles_field_computer : public abstract_particles_distr<real_number, 3,3,1> {
+    using Parent = abstract_particles_distr<real_number, 3,3,1>;
+
+    const std::array<size_t,3> field_grid_dim;
+    const std::pair<int,int> current_partition_interval;
+
+    const interpolator_class& interpolator;
+    const field_class& field;
+
+    const positions_updater_class positions_updater;
+
+    const std::array<real_number,3> spatial_box_width;
+    const std::array<real_number,3> box_step_width;
+    const real_number my_spatial_low_limit_z;
+    const real_number my_spatial_up_limit_z;
+
+    int deriv[3];
+
+    ////////////////////////////////////////////////////////////////////////
+    /// Computation related
+    ////////////////////////////////////////////////////////////////////////
+
+    virtual void init_result_array(real_number particles_current_rhs[],
+                                   const int nb_particles) const final{
+        // Set values to zero initially
+        std::fill(particles_current_rhs, particles_current_rhs+nb_particles*3, 0);
+    }
+
+    real_number get_norm_pos_in_cell(const real_number in_pos, const int idx_pos) const {
+        const real_number cell_idx = floor(in_pos/box_step_width[idx_pos]);
+        const real_number pos_in_cell = (in_pos - cell_idx*box_step_width[idx_pos]) / box_step_width[idx_pos];
+        assert(0 <= pos_in_cell && pos_in_cell < 1);
+        return pos_in_cell;
+    }
+
+    virtual void apply_computation(const real_number particles_positions[],
+                                   real_number particles_current_rhs[],
+                                   const int nb_particles) const final{
+        TIMEZONE("particles_field_computer::apply_computation");
+        for(int idxPart = 0 ; idxPart < nb_particles ; ++idxPart){
+            const real_number reltv_x = get_norm_pos_in_cell(particles_positions[idxPart*3+IDX_X], IDX_X);
+            const real_number reltv_y = get_norm_pos_in_cell(particles_positions[idxPart*3+IDX_Y], IDX_Y);
+            const real_number reltv_z = get_norm_pos_in_cell(particles_positions[idxPart*3+IDX_Z], IDX_Z);
+
+            typename interpolator_class::real_number bx[interp_neighbours*2+2], by[interp_neighbours*2+2], bz[interp_neighbours*2+2];
+            interpolator.compute_beta(deriv[IDX_X], reltv_x, bx);
+            interpolator.compute_beta(deriv[IDX_Y], reltv_y, by);
+            interpolator.compute_beta(deriv[IDX_Z], reltv_z, bz);
+
+            const int partGridIdx_x = int(particles_positions[idxPart*3+IDX_X]/box_step_width[IDX_X]);
+            const int partGridIdx_y = int(particles_positions[idxPart*3+IDX_Y]/box_step_width[IDX_Y]);
+            const int partGridIdx_z = int(particles_positions[idxPart*3+IDX_Z]/box_step_width[IDX_Z]);
+
+            assert(0 <= partGridIdx_x && partGridIdx_x < int(field_grid_dim[IDX_X]));
+            assert(0 <= partGridIdx_y && partGridIdx_y < int(field_grid_dim[IDX_Y]));
+            assert(0 <= partGridIdx_z && partGridIdx_z < int(field_grid_dim[IDX_Z]));
+
+            const int interp_limit_mx = partGridIdx_x-interp_neighbours;
+            const int interp_limit_x = partGridIdx_x+interp_neighbours+1;
+            const int interp_limit_my = partGridIdx_y-interp_neighbours;
+            const int interp_limit_y = partGridIdx_y+interp_neighbours+1;
+            const int interp_limit_mz_bz = partGridIdx_z-interp_neighbours;
+
+            int interp_limit_mz[2];
+            int interp_limit_z[2];
+            int nb_z_intervals;
+
+            if((partGridIdx_z-interp_neighbours) < 0){
+                assert(partGridIdx_z+interp_neighbours+1 < int(field_grid_dim[IDX_Z]));
+                interp_limit_mz[0] = ((partGridIdx_z-interp_neighbours)+field_grid_dim[IDX_Z])%field_grid_dim[IDX_Z];
+                interp_limit_z[0] = current_partition_interval.second-1;
+
+                interp_limit_mz[1] = std::max(0, current_partition_interval.first);// max is not really needed here
+                interp_limit_z[1] = std::min(partGridIdx_z+interp_neighbours+1, current_partition_interval.second-1);
+
+                nb_z_intervals = 2;
+            }
+            else if(int(field_grid_dim[2]) <= (partGridIdx_z+interp_neighbours+1)){
+                interp_limit_mz[0] = std::max(current_partition_interval.first, partGridIdx_z-interp_neighbours);
+                interp_limit_z[0] = std::min(int(field_grid_dim[IDX_Z])-1,current_partition_interval.second-1);// max is not really needed here
+
+                interp_limit_mz[1] = std::max(0, current_partition_interval.first);
+                interp_limit_z[1] = std::min(int((partGridIdx_z+interp_neighbours+1+field_grid_dim[IDX_Z])%field_grid_dim[IDX_Z]), current_partition_interval.second-1);
+
+                nb_z_intervals = 2;
+            }
+            else{
+                interp_limit_mz[0] = std::max(partGridIdx_z-interp_neighbours, current_partition_interval.first);
+                interp_limit_z[0] = std::min(partGridIdx_z+interp_neighbours+1, current_partition_interval.second-1);
+                nb_z_intervals = 1;
+            }
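+
+            // The interpolation stencil spans 2*interp_neighbours+2 grid planes
+            // per direction; in z it may wrap around the periodic boundary, in
+            // which case it is visited as two disjoint index intervals, each
+            // clipped to this rank's z-partition.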
+
+            for(int idx_inter = 0 ; idx_inter < nb_z_intervals ; ++idx_inter){
+                for(int idx_z = interp_limit_mz[idx_inter] ; idx_z <= interp_limit_z[idx_inter] ; ++idx_z ){
+                    const int idx_z_pbc = (idx_z + field_grid_dim[IDX_Z])%field_grid_dim[IDX_Z];
+                    assert(current_partition_interval.first <= idx_z_pbc && idx_z_pbc < current_partition_interval.second);
+                    assert(((idx_z+field_grid_dim[IDX_Z]-interp_limit_mz_bz)%field_grid_dim[IDX_Z]) < interp_neighbours*2+2);
+
+                    for(int idx_x = interp_limit_mx ; idx_x <= interp_limit_x ; ++idx_x ){
+                        const int idx_x_pbc = (idx_x + field_grid_dim[IDX_X])%field_grid_dim[IDX_X];
+                        assert(idx_x-interp_limit_mx < interp_neighbours*2+2);
+
+                        for(int idx_y = interp_limit_my ; idx_y <= interp_limit_y ; ++idx_y ){
+                            const int idx_y_pbc = (idx_y + field_grid_dim[IDX_Y])%field_grid_dim[IDX_Y];
+                            assert(idx_y-interp_limit_my < interp_neighbours*2+2);
+
+                            const real_number coef = (bz[((idx_z+field_grid_dim[IDX_Z]-interp_limit_mz_bz)%field_grid_dim[IDX_Z])]
+                                                    * by[idx_y-interp_limit_my]
+                                                    * bx[idx_x-interp_limit_mx]);
+
+                            const ptrdiff_t tindex = field.getIndexFromGlobalPosition(idx_x_pbc, idx_y_pbc, idx_z_pbc);
+
+                            // getValue does not necessarily return real_number
+                            particles_current_rhs[idxPart*3+IDX_X] += real_number(field.getValue(tindex,IDX_X))*coef;
+                            particles_current_rhs[idxPart*3+IDX_Y] += real_number(field.getValue(tindex,IDX_Y))*coef;
+                            particles_current_rhs[idxPart*3+IDX_Z] += real_number(field.getValue(tindex,IDX_Z))*coef;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    virtual void reduce_particles_rhs(real_number particles_current_rhs[],
+                                  const real_number extra_particles_current_rhs[],
+                                  const int nb_particles) const final{
+        TIMEZONE("particles_field_computer::reduce_particles");
+        // Simply sum values
+        for(int idxPart = 0 ; idxPart < nb_particles ; ++idxPart){
+            particles_current_rhs[idxPart*3+IDX_X] += extra_particles_current_rhs[idxPart*3+IDX_X];
+            particles_current_rhs[idxPart*3+IDX_Y] += extra_particles_current_rhs[idxPart*3+IDX_Y];
+            particles_current_rhs[idxPart*3+IDX_Z] += extra_particles_current_rhs[idxPart*3+IDX_Z];
+        }
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////
+    /// Re-distribution related
+    ////////////////////////////////////////////////////////////////////////
+
+    void apply_pbc_xy(real_number* inout_particles, const int size) const final {
+        TIMEZONE("particles_field_computer::apply_pbc_xy");
+        const std::array<int, 2> dims_xy={IDX_X, IDX_Y};
+        for(int idxPart = 0 ; idxPart < size ; ++idxPart){
+            // Assume a particle never moves by more than one box width per step
+            for(const int idxDim : dims_xy){
+                if(inout_particles[idxPart*3+idxDim] < 0) inout_particles[idxPart*3+idxDim] += spatial_box_width[idxDim];
+                else if(spatial_box_width[idxDim] <= inout_particles[idxPart*3+idxDim]) inout_particles[idxPart*3+idxDim] -= spatial_box_width[idxDim];
+                assert(0 <= inout_particles[idxPart*3+idxDim] && inout_particles[idxPart*3+idxDim] < spatial_box_width[idxDim]);
+            }
+        }
+    }
+
+    void apply_pbc_z_new_particles(real_number* values, const int size) const final {
+        TIMEZONE("particles_field_computer::apply_pbc_z_new_particles");
+        if(Parent::my_rank == 0){
+            const int idxDim = IDX_Z;
+            for(int idxPart = 0 ; idxPart < size ; ++idxPart){
+                assert(values[idxPart*3+idxDim] < my_spatial_up_limit_z || spatial_box_width[idxDim] <= values[idxPart*3+idxDim]);
+                assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim]);
+
+                if(spatial_box_width[idxDim] <= values[idxPart*3+idxDim]) values[idxPart*3+idxDim] -= spatial_box_width[idxDim];
+
+                assert(0 <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < spatial_box_width[idxDim]);
+                assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < my_spatial_up_limit_z);
+            }
+        }
+        else if(Parent::my_rank == Parent::nb_processes_involved - 1){
+            const int idxDim = IDX_Z;
+            for(int idxPart = 0 ; idxPart < size ; ++idxPart){
+                assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] || values[idxPart*3+idxDim] < 0);
+                assert(values[idxPart*3+idxDim] < spatial_box_width[idxDim]);
+
+                if(values[idxPart*3+idxDim] < 0) values[idxPart*3+idxDim] += spatial_box_width[idxDim];
+
+                assert(0 <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < spatial_box_width[idxDim]);
+                assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < my_spatial_up_limit_z);
+            }
+        }
+        else{
+            const int idxDim = IDX_Z;
+            for(int idxPart = 0 ; idxPart < size ; ++idxPart){
+                assert(my_spatial_low_limit_z <= values[idxPart*3+idxDim] && values[idxPart*3+idxDim] < my_spatial_up_limit_z);
+            }
+        }
+    }
+
+public:
+
+    particles_field_computer(MPI_Comm in_current_com, const std::array<size_t,3>& in_field_grid_dim,
+                             const std::pair<int,int>& in_current_partitions,
+                             const interpolator_class& in_interpolator,
+                             const field_class& in_field,
+                             const std::array<real_number,3>& in_spatial_box_width,
+                             const std::array<real_number,3>& in_box_step_width, const real_number in_my_spatial_low_limit_z,
+                             const real_number in_my_spatial_up_limit_z)
+        : abstract_particles_distr<real_number, 3,3,1>(in_current_com, in_current_partitions),
+          field_grid_dim(in_field_grid_dim), current_partition_interval(in_current_partitions),
+          interpolator(in_interpolator), field(in_field), positions_updater(),
+          spatial_box_width(in_spatial_box_width), box_step_width(in_box_step_width),
+          my_spatial_low_limit_z(in_my_spatial_low_limit_z), my_spatial_up_limit_z(in_my_spatial_up_limit_z){
+        deriv[IDX_X] = 0;
+        deriv[IDX_Y] = 0;
+        deriv[IDX_Z] = 0;
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    /// Update position
+    ////////////////////////////////////////////////////////////////////////
+
+    void move_particles(real_number particles_positions[],
+                   const int nb_particles,
+                   const std::unique_ptr<real_number[]> particles_current_rhs[],
+                   const int nb_rhs, const real_number dt) const final{
+        TIMEZONE("particles_field_computer::move_particles");
+        positions_updater.move_particles(particles_positions, nb_particles,
+                                         particles_current_rhs, nb_rhs, dt);
+    }
+
+};
+
+
+#endif
diff --git a/bfps/cpp/particles/particles_input_hdf5.hpp b/bfps/cpp/particles/particles_input_hdf5.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce1bddc152f459576d3e539ed1a73fde69a651e3
--- /dev/null
+++ b/bfps/cpp/particles/particles_input_hdf5.hpp
@@ -0,0 +1,305 @@
+#ifndef PARTICLES_INPUT_HDF5_HPP
+#define PARTICLES_INPUT_HDF5_HPP
+
+#include <tuple>
+#include <mpi.h>
+#include <hdf5.h>
+#include <cassert>
+#include <vector>
+
+#include "abstract_particles_input.hpp"
+#include "base.hpp"
+#include "alltoall_exchanger.hpp"
+#include "particles_utils.hpp"
+#include "scope_timer.hpp"
+
+
+// why is "size_particle_rhs" a template parameter?
+// I think it's safe to assume this will always be 3.
+template <class real_number, int size_particle_positions, int size_particle_rhs>
+class particles_input_hdf5 : public abstract_particles_input<real_number> {
+    const std::string filename;
+
+    MPI_Comm mpi_comm;
+    int my_rank;
+    int nb_processes;
+
+    hsize_t nb_total_particles;
+    hsize_t nb_rhs;
+    int nb_particles_for_me;
+
+    std::unique_ptr<real_number[]> my_particles_positions;
+    std::unique_ptr<int[]> my_particles_indexes;
+    std::vector<std::unique_ptr<real_number[]>> my_particles_rhs;
+
+    static std::vector<real_number> BuildLimitsAllProcesses(MPI_Comm mpi_comm,
+                                                       const real_number my_spatial_low_limit, const real_number my_spatial_up_limit){
+        int my_rank;
+        int nb_processes;
+
+        AssertMpi(MPI_Comm_rank(mpi_comm, &my_rank));
+        AssertMpi(MPI_Comm_size(mpi_comm, &nb_processes));
+
+        std::vector<real_number> spatial_limit_per_proc(nb_processes*2);
+
+        real_number intervalToSend[2] = {my_spatial_low_limit, my_spatial_up_limit};
+        AssertMpi(MPI_Allgather(intervalToSend, 2, particles_utils::GetMpiType(real_number()),
+                                spatial_limit_per_proc.data(), 2, particles_utils::GetMpiType(real_number()), mpi_comm));
+
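+        // Compact the gathered (low, up) pairs into nb_processes+1 increasing
+        // z-limits: entry p becomes the lower bound of rank p and the last
+        // entry the global upper bound; the asserts below check that the
+        // per-rank intervals are contiguous.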
+        for(int idx_proc = 0; idx_proc < nb_processes-1 ; ++idx_proc){
+            assert(spatial_limit_per_proc[idx_proc*2] <= spatial_limit_per_proc[idx_proc*2+1]);
+            assert(spatial_limit_per_proc[idx_proc*2+1] == spatial_limit_per_proc[(idx_proc+1)*2]);
+            spatial_limit_per_proc[idx_proc+1] = spatial_limit_per_proc[idx_proc*2+1];
+        }
+        spatial_limit_per_proc[nb_processes] = spatial_limit_per_proc[(nb_processes-1)*2+1];
+        spatial_limit_per_proc.resize(nb_processes+1);
+
+        return spatial_limit_per_proc;
+    }
+
+public:
+    particles_input_hdf5(const MPI_Comm in_mpi_comm,const std::string& inFilename,
+                         const std::string& inDatanameState, const std::string& inDatanameRhs,
+                         const real_number my_spatial_low_limit, const real_number my_spatial_up_limit)
+        : particles_input_hdf5(in_mpi_comm, inFilename, inDatanameState, inDatanameRhs,
+                               BuildLimitsAllProcesses(in_mpi_comm, my_spatial_low_limit, my_spatial_up_limit)){
+    }
+
+    particles_input_hdf5(const MPI_Comm in_mpi_comm,const std::string& inFilename,
+                         const std::string& inDatanameState, const std::string& inDatanameRhs,
+                         const std::vector<real_number>& in_spatial_limit_per_proc)
+        : filename(inFilename),
+          mpi_comm(in_mpi_comm), my_rank(-1), nb_processes(-1), nb_total_particles(0),
+          nb_particles_for_me(-1){
+        TIMEZONE("particles_input_hdf5");
+
+        AssertMpi(MPI_Comm_rank(mpi_comm, &my_rank));
+        AssertMpi(MPI_Comm_size(mpi_comm, &nb_processes));
+        assert(int(in_spatial_limit_per_proc.size()) == nb_processes+1);
+
+        hid_t plist_id_par = H5Pcreate(H5P_FILE_ACCESS);
+        assert(plist_id_par >= 0);
+        {
+            int retTest = H5Pset_fapl_mpio(plist_id_par, mpi_comm, MPI_INFO_NULL);
+            assert(retTest >= 0);
+        }
+
+        hid_t particle_file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, plist_id_par);
+        assert(particle_file >= 0);
+
+        {
+            TIMEZONE("state");
+            hid_t dset = H5Dopen(particle_file, inDatanameState.c_str(), H5P_DEFAULT);
+            assert(dset >= 0);
+
+            hid_t dspace = H5Dget_space(dset); // copy?
+            assert(dspace >= 0);
+
+            int space_dim = H5Sget_simple_extent_ndims(dspace);
+            assert(space_dim >= 2);
+
+            std::vector<hsize_t> state_dim_array(space_dim);
+            int hdfret = H5Sget_simple_extent_dims(dspace, &state_dim_array[0], NULL);
+            assert(hdfret >= 0);
+            // Last value is the position dim of the particles
+            assert(state_dim_array.back() == size_particle_positions);
+
+            nb_total_particles = 1;
+            for (size_t idx_dim = 0; idx_dim < state_dim_array.size()-1; ++idx_dim){
+                nb_total_particles *= state_dim_array[idx_dim];
+            }
+
+            hdfret = H5Sclose(dspace);
+            assert(hdfret >= 0);
+            hdfret = H5Dclose(dset);
+            assert(hdfret >= 0);
+        }
+        {
+            TIMEZONE("rhs");
+            hid_t dset = H5Dopen(particle_file, inDatanameRhs.c_str(), H5P_DEFAULT);
+            assert(dset >= 0);
+            hid_t dspace = H5Dget_space(dset); // copy?
+            assert(dspace >= 0);
+
+            int rhs_dim = H5Sget_simple_extent_ndims(dspace);
+            // Chichi comment: this assertion will fail in general, there's no reason for it.
+            //assert(rhs_dim == 4);
+            std::vector<hsize_t> rhs_dim_array(rhs_dim);
+
+            // Chichi comment: wouldn't &rhs_dim_array.front() be safer?
+            int hdfret = H5Sget_simple_extent_dims(dspace, &rhs_dim_array[0], NULL);
+            assert(hdfret >= 0);
+            assert(rhs_dim_array.back() == size_particle_rhs);
+            // Chichi comment: this assertion will fail in general
+            //assert(rhs_dim_array.front() == 1);
+            nb_rhs = rhs_dim_array[0];
+
+            hdfret = H5Sclose(dspace);
+            assert(hdfret >= 0);
+            hdfret = H5Dclose(dset);
+            assert(hdfret >= 0);
+        }
+
+        particles_utils::IntervalSplitter<hsize_t> load_splitter(nb_total_particles, nb_processes, my_rank);
+
+        static_assert(std::is_same<real_number, double>::value
+                      || std::is_same<real_number, float>::value, "real_number must be double or float");
+        const hid_t type_id = (sizeof(real_number) == 8?H5T_NATIVE_DOUBLE:H5T_NATIVE_FLOAT);
+
+        /// Load the data
+        std::unique_ptr<real_number[]> split_particles_positions(new real_number[load_splitter.getMySize()*size_particle_positions]);
+        {
+            TIMEZONE("state-read");
+            hid_t dset = H5Dopen(particle_file, inDatanameState.c_str(), H5P_DEFAULT);
+            assert(dset >= 0);
+
+            hid_t rspace = H5Dget_space(dset);
+            assert(rspace >= 0);
+
+            hsize_t offset[2] = {load_splitter.getMyOffset(), 0};
+            hsize_t mem_dims[2] = {load_splitter.getMySize(), size_particle_positions};
+
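+            // Read this rank's contiguous hyperslab: rows
+            // [offset[0], offset[0] + mem_dims[0]) of the global state dataset,
+            // with all position components per row.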
+            hid_t mspace = H5Screate_simple(2, &mem_dims[0], NULL);
+            assert(mspace >= 0);
+
+            int rethdf = H5Sselect_hyperslab(rspace, H5S_SELECT_SET, offset,
+                                             NULL, mem_dims, NULL);
+            assert(rethdf >= 0);
+            rethdf = H5Dread(dset, type_id, mspace, rspace, H5P_DEFAULT, split_particles_positions.get());
+            assert(rethdf >= 0);
+
+            rethdf = H5Sclose(rspace);
+            assert(rethdf >= 0);
+            rethdf = H5Dclose(dset);
+            assert(rethdf >= 0);
+        }
+        std::vector<std::unique_ptr<real_number[]>> split_particles_rhs(nb_rhs);
+        {
+            TIMEZONE("rhs-read");
+            hid_t dset = H5Dopen(particle_file, inDatanameRhs.c_str(), H5P_DEFAULT);
+            assert(dset >= 0);
+
+            for(hsize_t idx_rhs = 0 ; idx_rhs < nb_rhs ; ++idx_rhs){
+                hid_t rspace = H5Dget_space(dset);
+                assert(rspace >= 0);
+
+                split_particles_rhs[idx_rhs].reset(new real_number[load_splitter.getMySize()*size_particle_rhs]);
+
+                hsize_t offset[3] = {idx_rhs, load_splitter.getMyOffset(), 0};
+                hsize_t mem_dims[3] = {1, load_splitter.getMySize(), size_particle_rhs};
+
+                hid_t mspace = H5Screate_simple( 3, &mem_dims[0], NULL);
+                assert(mspace >= 0);
+
+                int rethdf = H5Sselect_hyperslab( rspace, H5S_SELECT_SET, offset,
+                                                 NULL, mem_dims, NULL);
+                assert(rethdf >= 0);
+                rethdf = H5Dread(dset, type_id, mspace, rspace, H5P_DEFAULT, split_particles_rhs[idx_rhs].get());
+                assert(rethdf >= 0);
+
+                rethdf = H5Sclose(mspace);
+                assert(rethdf >= 0);
+
+                rethdf = H5Sclose(rspace);
+                assert(rethdf >= 0);
+            }
+            int rethdf = H5Dclose(dset);
+            assert(rethdf >= 0);
+        }
+
+        std::unique_ptr<int[]> split_particles_indexes(new int[load_splitter.getMySize()]);
+        for(int idx_part = 0 ; idx_part < int(load_splitter.getMySize()) ; ++idx_part){
+            split_particles_indexes[idx_part] = idx_part + load_splitter.getMyOffset();
+        }
+
+        // Permute the locally loaded particles so that the particles destined
+        // for each rank form one contiguous segment, ordered by destination rank.
+        std::vector<int> nb_particles_per_proc(nb_processes);
+        {
+            TIMEZONE("partition");
+            int previousOffset = 0;
+            for(int idx_proc = 0 ; idx_proc < nb_processes-1 ; ++idx_proc){
+                const real_number limitPartition = in_spatial_limit_per_proc[idx_proc+1];
+                const int localOffset = particles_utils::partition_extra<size_particle_positions>(
+                                                &split_particles_positions[previousOffset*size_particle_positions],
+                                                 load_splitter.getMySize()-previousOffset,
+                                                 [&](const real_number val[]){
+                    return val[IDX_Z] < limitPartition;
+                },
+                [&](const int idx1, const int idx2){
+                    std::swap(split_particles_indexes[idx1], split_particles_indexes[idx2]);
+                    for(int idx_rhs = 0 ; idx_rhs < int(nb_rhs) ; ++idx_rhs){
+                        for(int idx_val = 0 ; idx_val < size_particle_rhs ; ++idx_val){
+                            std::swap(split_particles_rhs[idx_rhs][idx1*size_particle_rhs + idx_val],
+                                      split_particles_rhs[idx_rhs][idx2*size_particle_rhs + idx_val]);
+                        }
+                    }
+                }, previousOffset);
+
+                nb_particles_per_proc[idx_proc] = localOffset;
+                previousOffset += localOffset;
+            }
+            nb_particles_per_proc[nb_processes-1] = load_splitter.getMySize() - previousOffset;
+        }
+
+        {
+            TIMEZONE("exchanger");
+            alltoall_exchanger exchanger(mpi_comm, std::move(nb_particles_per_proc));
+            // nb_particles_per_proc cannot be used after this point due to the move
+            nb_particles_for_me = exchanger.getTotalToRecv();
+
+            my_particles_positions.reset(new real_number[exchanger.getTotalToRecv()*size_particle_positions]);
+            exchanger.alltoallv<real_number>(split_particles_positions.get(), my_particles_positions.get(), size_particle_positions);
+            split_particles_positions.reset();// reset (not release) so the send buffer is actually freed
+
+            my_particles_indexes.reset(new int[exchanger.getTotalToRecv()]);
+            exchanger.alltoallv<int>(split_particles_indexes.get(), my_particles_indexes.get());
+            split_particles_indexes.reset();// reset (not release) so the send buffer is actually freed
+
+            my_particles_rhs.resize(nb_rhs);
+            for(int idx_rhs = 0 ; idx_rhs < int(nb_rhs) ; ++idx_rhs){
+                my_particles_rhs[idx_rhs].reset(new real_number[exchanger.getTotalToRecv()*size_particle_rhs]);
+                exchanger.alltoallv<real_number>(split_particles_rhs[idx_rhs].get(), my_particles_rhs[idx_rhs].get(), size_particle_rhs);
+            }
+        }
+
+        {
+            TIMEZONE("close");
+            int hdfret = H5Fclose(particle_file);
+            assert(hdfret >= 0);
+            hdfret = H5Pclose(plist_id_par);
+            assert(hdfret >= 0);
+        }
+    }
+
+    ~particles_input_hdf5(){
+    }
+
+    int getTotalNbParticles() final{
+        return int(nb_total_particles);
+    }
+
+    int getLocalNbParticles() final{
+        return int(nb_particles_for_me);
+    }
+
+    int getNbRhs() final{
+        return int(nb_rhs);
+    }
+
+    std::unique_ptr<real_number[]> getMyParticles() final {
+        assert(my_particles_positions != nullptr);
+        return std::move(my_particles_positions);
+    }
+
+    std::vector<std::unique_ptr<real_number[]>> getMyRhs() final {
+        assert(my_particles_rhs.size() == nb_rhs);
+        return std::move(my_particles_rhs);
+    }
+
+    std::unique_ptr<int[]> getMyParticlesIndexes() final {
+        assert(my_particles_indexes != nullptr);
+        return std::move(my_particles_indexes);
+    }
+};
+
+#endif
diff --git a/bfps/cpp/particles/particles_interp_spline.hpp b/bfps/cpp/particles/particles_interp_spline.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1a9f67da0cb0f360711b889363c05286ffc3009
--- /dev/null
+++ b/bfps/cpp/particles/particles_interp_spline.hpp
@@ -0,0 +1,201 @@
+#ifndef PARTICLES_INTERP_SPLINE_HPP
+#define PARTICLES_INTERP_SPLINE_HPP
+
+template <class real_number, int interp_neighbours, int mode>
+class particles_interp_spline;
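+
+// Each specialization below forwards to a generated beta_n<N>_m<M> routine,
+// where N is interp_neighbours (the stencil half-width) and M selects the
+// spline smoothness; compute_beta fills the 2*N+2 interpolation weights of
+// one direction for the requested derivative order.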
+
+#include "spline_n1.hpp"
+
+template <>
+class particles_interp_spline<double, 1,0>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n1_m0(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 1,1>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n1_m1(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 1,2>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n1_m2(in_derivative, in_part_val, poly_val);
+    }
+};
+
+#include "spline_n2.hpp"
+
+template <>
+class particles_interp_spline<double, 2,0>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n2_m0(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 2,1>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n2_m1(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 2,2>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n2_m2(in_derivative, in_part_val, poly_val);
+    }
+};
+
+#include "spline_n3.hpp"
+
+template <>
+class particles_interp_spline<double, 3,0>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n3_m0(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 3,1>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n3_m1(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 3,2>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n3_m2(in_derivative, in_part_val, poly_val);
+    }
+};
+
+#include "spline_n4.hpp"
+
+template <>
+class particles_interp_spline<double, 4,0>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n4_m0(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 4,1>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n4_m1(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 4,2>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n4_m2(in_derivative, in_part_val, poly_val);
+    }
+};
+
+#include "spline_n5.hpp"
+
+template <>
+class particles_interp_spline<double, 5,0>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n5_m0(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 5,1>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n5_m1(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 5,2>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n5_m2(in_derivative, in_part_val, poly_val);
+    }
+};
+
+#include "spline_n6.hpp"
+
+template <>
+class particles_interp_spline<double, 6,0>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n6_m0(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 6,1>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n6_m1(in_derivative, in_part_val, poly_val);
+    }
+};
+
+template <>
+class particles_interp_spline<double, 6,2>{
+public:
+    using real_number = double;
+
+    void compute_beta(const int in_derivative, const double in_part_val, double poly_val[]) const {
+        beta_n6_m2(in_derivative, in_part_val, poly_val);
+    }
+};
+
+
+
+#endif
diff --git a/bfps/cpp/particles/particles_output_hdf5.hpp b/bfps/cpp/particles/particles_output_hdf5.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..49a0b69e3d0c0b888a7c685570d4ba7296a230f5
--- /dev/null
+++ b/bfps/cpp/particles/particles_output_hdf5.hpp
@@ -0,0 +1,162 @@
+#ifndef PARTICLES_OUTPUT_HDF5_HPP
+#define PARTICLES_OUTPUT_HDF5_HPP
+
+#include <memory>
+#include <vector>
+#include <hdf5.h>
+
+#include "abstract_particles_output.hpp"
+#include "scope_timer.hpp"
+
+template <class real_number, int size_particle_positions, int size_particle_rhs>
+class particles_output_hdf5 : public abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>{
+    using Parent = abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>;
+
+    const std::string filename;
+
+    hid_t file_id;
+    const int total_nb_particles;
+
+    const std::string datagroup_basename_state;
+    const std::string datagroup_basename_rhs;
+
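+    // Note: despite the "dset" prefix, these two are HDF5 *group* identifiers
+    // (opened with H5Gopen in the constructor below).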
+    hid_t dset_id_state;
+    hid_t dset_id_rhs;
+
+public:
+    particles_output_hdf5(MPI_Comm in_mpi_com, const std::string in_filename, const int inTotalNbParticles,
+                          const int in_nb_rhs, const std::string in_datagroup_basename_state,
+                          const std::string in_datagroup_basename_rhs)
+            : abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>(in_mpi_com, inTotalNbParticles, in_nb_rhs),
+              filename(in_filename),
+              file_id(0), total_nb_particles(inTotalNbParticles), datagroup_basename_state(in_datagroup_basename_state),
+              datagroup_basename_rhs(in_datagroup_basename_rhs), dset_id_state(0), dset_id_rhs(0){
+        if(datagroup_basename_state == datagroup_basename_rhs){
+            DEBUG_MSG("The same dataset names have been passed to particles_output_hdf5 for the state and the rhs\n"
+                      "It will result into an undefined behavior.\n"
+                      "Dataset name = %s\n", datagroup_basename_state.c_str());
+        }
+
+        TIMEZONE("particles_output_hdf5::H5Pcreate");
+        hid_t plist_id_par = H5Pcreate(H5P_FILE_ACCESS);
+        assert(plist_id_par >= 0);
+        int retTest = H5Pset_fapl_mpio(plist_id_par, Parent::getCom(), MPI_INFO_NULL);
+        assert(retTest >= 0);
+
+        // Parallel HDF5 write
+        file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR | H5F_ACC_DEBUG, plist_id_par);
+        // file_id = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC | H5F_ACC_DEBUG/*H5F_ACC_EXCL*/, H5P_DEFAULT/*H5F_ACC_RDWR*/, plist_id_par);
+        assert(file_id >= 0);
+        H5Pclose(plist_id_par);
+
+        dset_id_state = H5Gopen(file_id, datagroup_basename_state.c_str(), H5P_DEFAULT);
+        assert(dset_id_state >= 0);
+        dset_id_rhs = H5Gopen(file_id, datagroup_basename_rhs.c_str(), H5P_DEFAULT);
+        assert(dset_id_rhs >= 0);
+    }
+
+    ~particles_output_hdf5(){
+        TIMEZONE("particles_output_hdf5::H5Dclose");
+
+        int rethdf = H5Gclose(dset_id_state);
+        assert(rethdf >= 0);
+
+        rethdf = H5Gclose(dset_id_rhs);
+        assert(rethdf >= 0);
+
+        rethdf = H5Fclose(file_id);
+        assert(rethdf >= 0);
+    }
+
+    void write(const int idx_time_step, const real_number* particles_positions, const std::unique_ptr<real_number[]>* particles_rhs,
+                           const int nb_particles, const int particles_idx_offset) final{
+        TIMEZONE("particles_output_hdf5::write");
+
+        assert(particles_idx_offset < Parent::getTotalNbParticles());
+        assert(particles_idx_offset+nb_particles <= Parent::getTotalNbParticles());
+
+        static_assert(std::is_same<real_number, double>::value
+                      || std::is_same<real_number, float>::value, "real_number must be double or float");
+        const hid_t type_id = (sizeof(real_number) == 8?H5T_NATIVE_DOUBLE:H5T_NATIVE_FLOAT);
+
+        hid_t plist_id = H5Pcreate(H5P_DATASET_XFER);
+        assert(plist_id >= 0);
+        {
+            int rethdf = H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT);
+            assert(rethdf >= 0);
+        }
+
+        {
+            assert(total_nb_particles >= 0);
+            assert(size_particle_positions >= 0);
+            const hsize_t datacount[2] = {hsize_t(total_nb_particles), hsize_t(size_particle_positions)};
+            hid_t dataspace = H5Screate_simple(2, datacount, NULL);
+            assert(dataspace >= 0);
+
+            hid_t dataset_id = H5Dcreate( dset_id_state, std::to_string(idx_time_step).c_str(), type_id, dataspace, H5P_DEFAULT,
+                                          H5P_DEFAULT, H5P_DEFAULT);
+            assert(dataset_id >= 0);
+
+            assert(nb_particles >= 0);
+            assert(particles_idx_offset >= 0);
+            const hsize_t count[2] = {hsize_t(nb_particles), hsize_t(size_particle_positions)};
+            const hsize_t offset[2] = {hsize_t(particles_idx_offset), 0};
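+            // This rank owns rows [particles_idx_offset,
+            // particles_idx_offset + nb_particles) of the global
+            // (total_nb_particles, size_particle_positions) dataset.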
+            hid_t memspace = H5Screate_simple(2, count, NULL);
+            assert(memspace >= 0);
+
+            hid_t filespace = H5Dget_space(dataset_id);
+            int rethdf = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
+            assert(rethdf >= 0);
+
+            herr_t	status = H5Dwrite(dataset_id, type_id, memspace, filespace,
+                      plist_id, particles_positions);
+            assert(status >= 0);
+            rethdf = H5Sclose(memspace);
+            assert(rethdf >= 0);
+            rethdf = H5Dclose(dataset_id);
+            assert(rethdf >= 0);
+            rethdf = H5Sclose(filespace);
+            assert(rethdf >= 0);
+        }
+        {
+            assert(size_particle_rhs >= 0);
+            const hsize_t datacount[3] = {hsize_t(Parent::getNbRhs()), hsize_t(total_nb_particles), hsize_t(size_particle_rhs)};
+            hid_t dataspace = H5Screate_simple(3, datacount, NULL);
+            assert(dataspace >= 0);
+
+            hid_t dataset_id = H5Dcreate( dset_id_rhs, std::to_string(idx_time_step).c_str(), type_id, dataspace, H5P_DEFAULT,
+                                          H5P_DEFAULT, H5P_DEFAULT);
+            assert(dataset_id >= 0);
+
+            assert(particles_idx_offset >= 0);
+            for(int idx_rhs = 0 ; idx_rhs < Parent::getNbRhs() ; ++idx_rhs){
+                const hsize_t count[3] = {1, hsize_t(nb_particles), hsize_t(size_particle_rhs)};
+                const hsize_t offset[3] = {hsize_t(idx_rhs), hsize_t(particles_idx_offset), 0};
+                hid_t memspace = H5Screate_simple(3, count, NULL);
+                assert(memspace >= 0);
+
+                hid_t filespace = H5Dget_space(dataset_id);
+                assert(filespace >= 0);
+                int rethdf = H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
+                assert(rethdf >= 0);
+
+                herr_t	status = H5Dwrite(dataset_id, type_id, memspace, filespace,
+                          plist_id, particles_rhs[idx_rhs].get());
+                assert(status >= 0);
+                rethdf = H5Sclose(filespace);
+                assert(rethdf >= 0);
+                rethdf = H5Sclose(memspace);
+                assert(rethdf >= 0);
+            }
+            int rethdf = H5Dclose(dataset_id);
+            assert(rethdf >= 0);
+        }
+
+        {
+            int rethdf = H5Pclose(plist_id);
+            assert(rethdf >= 0);
+        }
+    }
+};
+
+#endif
diff --git a/bfps/cpp/particles/particles_output_mpiio.hpp b/bfps/cpp/particles/particles_output_mpiio.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a034c74b90f4a121a52308a460a9942b9dd1d29
--- /dev/null
+++ b/bfps/cpp/particles/particles_output_mpiio.hpp
@@ -0,0 +1,86 @@
+#ifndef PARTICLES_OUTPUT_MPIIO
+#define PARTICLES_OUTPUT_MPIIO
+
+#include <memory>
+#include <vector>
+#include <string>
+#include <cassert>
+
+#include "abstract_particles_output.hpp"
+#include "scope_timer.hpp"
+#include "particles_utils.hpp"
+
+template <class real_number, int size_particle_positions, int size_particle_rhs>
+class particles_output_mpiio : public abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>{
+    using Parent = abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>;
+
+    const std::string filename;
+    const int nb_step_prealloc;
+
+    int current_step_in_file;
+
+    MPI_File mpi_file;
+
+public:
+    particles_output_mpiio(MPI_Comm in_mpi_com, const std::string in_filename, const int inTotalNbParticles,
+                           const int in_nb_rhs, const int in_nb_step_prealloc = -1)
+            : abstract_particles_output<real_number, size_particle_positions, size_particle_rhs>(in_mpi_com, inTotalNbParticles, in_nb_rhs),
+              filename(in_filename), nb_step_prealloc(in_nb_step_prealloc), current_step_in_file(0){
+        {
+            TIMEZONE("particles_output_mpiio::MPI_File_open");
+            AssertMpi(MPI_File_open(Parent::getCom(), const_cast<char*>(filename.c_str()),
+                MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &mpi_file));
+        }
+        if(nb_step_prealloc != -1){
+            TIMEZONE("particles_output_mpiio::MPI_File_set_size");
+            AssertMpi(MPI_File_set_size(mpi_file,
+                nb_step_prealloc*Parent::getTotalNbParticles()*sizeof(real_number)*(size_particle_positions+size_particle_rhs*Parent::getNbRhs())));
+        }
+    }
+
+    ~particles_output_mpiio(){
+        TIMEZONE("particles_output_mpiio::MPI_File_close");
+        AssertMpi(MPI_File_close(&mpi_file));
+    }
+
+    void write(const int /*time_step*/, const real_number* particles_positions, const std::unique_ptr<real_number[]>* particles_rhs,
+                           const int nb_particles, const int particles_idx_offset) final{
+        TIMEZONE("particles_output_mpiio::write");
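+        /* file layout implemented below (one record per time step):
+         *   [ positions of all particles ][ rhs 0 ][ rhs 1 ] ... [ rhs N-1 ]
+         * each sub-block is ordered by global particle index, so this process
+         * writes at element offset particles_idx_offset inside every sub-block */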
+
+        assert(nb_step_prealloc == -1 || current_step_in_file < nb_step_prealloc);
+        assert(particles_idx_offset < Parent::getTotalNbParticles());
+        assert(particles_idx_offset+nb_particles <= Parent::getTotalNbParticles());
+
+        if(nb_step_prealloc == -1){
+            TIMEZONE("particles_output_mpiio::write::MPI_File_set_size");
+            AssertMpi(MPI_File_set_size(mpi_file,
+                (current_step_in_file+1)*Parent::getTotalNbParticles()*sizeof(real_number)*(size_particle_positions+size_particle_rhs*Parent::getNbRhs())));
+        }
+
+        const MPI_Offset globalParticlesOffset = MPI_Offset(current_step_in_file)*Parent::getTotalNbParticles()*(size_particle_positions+size_particle_rhs*Parent::getNbRhs())
+                        + particles_idx_offset*size_particle_positions;
+
+        const MPI_Offset writingOffset = globalParticlesOffset * sizeof(real_number);
+
+        AssertMpi(MPI_File_write_at(mpi_file, writingOffset,
+            const_cast<real_number*>(particles_positions), nb_particles*size_particle_positions, particles_utils::GetMpiType(real_number()),
+            MPI_STATUS_IGNORE));
+
+        for(int idx_rhs = 0 ; idx_rhs < Parent::getNbRhs() ; ++idx_rhs){
+            const MPI_Offset globalParticlesOffsetOutput = MPI_Offset(current_step_in_file)*Parent::getTotalNbParticles()*(size_particle_positions+size_particle_rhs*Parent::getNbRhs())
+                            + Parent::getTotalNbParticles()*size_particle_positions
+                            + idx_rhs*Parent::getTotalNbParticles()*size_particle_rhs
+                            + particles_idx_offset*size_particle_rhs;
+
+            const MPI_Offset writingOffsetOutput = globalParticlesOffsetOutput * sizeof(real_number);
+
+            AssertMpi(MPI_File_write_at(mpi_file, writingOffsetOutput,
+                const_cast<real_number*>(particles_rhs[idx_rhs].get()), nb_particles*size_particle_rhs, particles_utils::GetMpiType(real_number()),
+                MPI_STATUS_IGNORE));
+        }
+
+        current_step_in_file += 1;
+    }
+};
+
+#endif
diff --git a/bfps/cpp/particles/particles_system.hpp b/bfps/cpp/particles/particles_system.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..472ca95d86ff14448f56092c9122229abfeebd1e
--- /dev/null
+++ b/bfps/cpp/particles/particles_system.hpp
@@ -0,0 +1,202 @@
+#ifndef PARTICLES_SYSTEM_HPP
+#define PARTICLES_SYSTEM_HPP
+
+#include <array>
+#include <cmath>
+
+#include "abstract_particles_system.hpp"
+#include "particles_output_hdf5.hpp"
+#include "particles_output_mpiio.hpp"
+#include "particles_field_computer.hpp"
+#include "field_accessor.hpp"
+#include "abstract_particles_input.hpp"
+#include "particles_adams_bashforth.hpp"
+#include "scope_timer.hpp"
+
+template <class real_number, class field_rnumber, class interpolator_class, int interp_neighbours>
+class particles_system : public abstract_particles_system<real_number> {
+    MPI_Comm mpi_com;
+
+    const std::pair<int,int> current_partition_interval;
+    const int partition_interval_size;
+
+    field_accessor<field_rnumber> field;
+
+    interpolator_class interpolator;
+
+    particles_field_computer<real_number, interpolator_class, field_accessor<field_rnumber>, interp_neighbours, particles_adams_bashforth<real_number, 3,3>> computer;
+
+    std::unique_ptr<int[]> current_my_nb_particles_per_partition;
+    std::unique_ptr<int[]> current_offset_particles_for_partition;
+
+    const std::array<real_number,3> spatial_box_width;
+    const std::array<real_number,3> spatial_partition_width;
+    const real_number my_spatial_low_limit;
+    const real_number my_spatial_up_limit;
+
+    std::unique_ptr<real_number[]> my_particles_positions;
+    std::unique_ptr<int[]> my_particles_positions_indexes;
+    int my_nb_particles;
+    std::vector<std::unique_ptr<real_number[]>> my_particles_rhs;
+
+    int step_idx;
+
+public:
+    particles_system(const std::array<size_t,3>& field_grid_dim, const std::array<real_number,3>& in_spatial_box_width,
+                     const std::array<real_number,3>& in_spatial_partition_width,
+                     const real_number in_my_spatial_low_limit, const real_number in_my_spatial_up_limit,
+                     const field_rnumber* in_field_data, const std::array<size_t,3>& in_local_field_dims,
+                     const std::array<size_t,3>& in_local_field_offset,
+                     const std::array<size_t,3>& in_field_memory_dims,
+                     MPI_Comm in_mpi_com)
+        : mpi_com(in_mpi_com),
+          current_partition_interval({in_local_field_offset[IDX_Z], in_local_field_offset[IDX_Z] + in_local_field_dims[IDX_Z]}),
+          partition_interval_size(current_partition_interval.second - current_partition_interval.first),
+          field(in_field_data, in_local_field_dims, in_local_field_offset, in_field_memory_dims),
+          interpolator(),
+          computer(in_mpi_com, field_grid_dim, current_partition_interval,
+                   interpolator, field, in_spatial_box_width, in_spatial_partition_width,
+                   in_my_spatial_low_limit, in_my_spatial_up_limit),
+          spatial_box_width(in_spatial_box_width), spatial_partition_width(in_spatial_partition_width),
+          my_spatial_low_limit(in_my_spatial_low_limit), my_spatial_up_limit(in_my_spatial_up_limit),
+          my_nb_particles(0), step_idx(1){
+
+        current_my_nb_particles_per_partition.reset(new int[partition_interval_size]);
+        current_offset_particles_for_partition.reset(new int[partition_interval_size+1]);
+    }
+
+    ~particles_system(){
+    }
+
+    void init(abstract_particles_input<real_number>& particles_input){
+        TIMEZONE("particles_system::init");
+
+        my_particles_positions = particles_input.getMyParticles();
+        my_particles_positions_indexes = particles_input.getMyParticlesIndexes();
+        my_particles_rhs = particles_input.getMyRhs();
+        my_nb_particles = particles_input.getLocalNbParticles();
+
+        for(int idx_part = 0 ; idx_part < my_nb_particles ; ++idx_part){ // TODO remove me
+            assert(my_particles_positions[idx_part*3+IDX_Z] >= my_spatial_low_limit);
+            assert(my_particles_positions[idx_part*3+IDX_Z] < my_spatial_up_limit);
+        }
+
+        particles_utils::partition_extra_z<3>(&my_particles_positions[0], my_nb_particles, partition_interval_size,
+                                              current_my_nb_particles_per_partition.get(), current_offset_particles_for_partition.get(),
+        [&](const int idxPartition){
+            const real_number limitPartition = (idxPartition+1)*spatial_partition_width[IDX_Z] + my_spatial_low_limit;
+            return limitPartition;
+        },
+        [&](const int idx1, const int idx2){
+            std::swap(my_particles_positions_indexes[idx1], my_particles_positions_indexes[idx2]);
+            for(int idx_rhs = 0 ; idx_rhs < int(my_particles_rhs.size()) ; ++idx_rhs){
+                for(int idx_val = 0 ; idx_val < 3 ; ++idx_val){
+                    std::swap(my_particles_rhs[idx_rhs][idx1*3 + idx_val],
+                              my_particles_rhs[idx_rhs][idx2*3 + idx_val]);
+                }
+            }
+        });
+
+        {// TODO remove
+            for(int idxPartition = 0 ; idxPartition < partition_interval_size ; ++idxPartition){
+                assert(current_my_nb_particles_per_partition[idxPartition] ==
+                       current_offset_particles_for_partition[idxPartition+1] - current_offset_particles_for_partition[idxPartition]);
+                const real_number limitPartition = (idxPartition+1)*spatial_partition_width[IDX_Z] + my_spatial_low_limit;
+                for(int idx = 0 ; idx < current_offset_particles_for_partition[idxPartition+1] ; ++idx){
+                    assert(my_particles_positions[idx*3+IDX_Z] < limitPartition);
+                }
+                for(int idx = current_offset_particles_for_partition[idxPartition+1] ; idx < my_nb_particles ; ++idx){
+                    assert(my_particles_positions[idx*3+IDX_Z] >= limitPartition);
+                }
+            }
+        }
+    }
+
+
+    void compute() final {
+        TIMEZONE("particles_system::compute");
+        computer.compute_distr(current_my_nb_particles_per_partition.get(),
+                               my_particles_positions.get(),
+                               my_particles_rhs.front().get(),
+                               interp_neighbours);
+    }
+
+    void move(const real_number dt) final {
+        TIMEZONE("particles_system::move");
+        computer.move_particles(my_particles_positions.get(), my_nb_particles,
+                                my_particles_rhs.data(), std::min(step_idx+1,int(my_particles_rhs.size())),
+                                dt);
+    }
+
+    void redistribute() final {
+        TIMEZONE("particles_system::redistribute");
+        computer.redistribute(current_my_nb_particles_per_partition.get(),
+                              &my_nb_particles,
+                              &my_particles_positions,
+                              my_particles_rhs.data(), my_particles_rhs.size(),
+                              &my_particles_positions_indexes,
+                              my_spatial_low_limit,
+                              my_spatial_up_limit,
+                              spatial_partition_width[IDX_Z]);
+    }
+
+    void inc_step_idx() final {
+        step_idx += 1;
+    }
+
+    void shift_rhs_vectors() final {
+        if(my_particles_rhs.size()){
+            std::unique_ptr<real_number[]> next_current(std::move(my_particles_rhs.back()));
+            for(int idx_rhs = int(my_particles_rhs.size())-1 ; idx_rhs > 0 ; --idx_rhs){
+                my_particles_rhs[idx_rhs] = std::move(my_particles_rhs[idx_rhs-1]);
+            }
+            my_particles_rhs[0] = std::move(next_current);
+            particles_utils::memzero(my_particles_rhs[0], 3*my_nb_particles);
+        }
+    }
+
+    void completeLoop(const real_number dt) final {
+        TIMEZONE("particles_system::completeLoop");
+        compute();
+        move(dt);
+        redistribute();
+        inc_step_idx();
+        shift_rhs_vectors();
+    }
+
+    const real_number* getParticlesPositions() const final {
+        return my_particles_positions.get();
+    }
+
+    const std::unique_ptr<real_number[]>* getParticlesRhs() const final {
+        return my_particles_rhs.data();
+    }
+
+    const int* getParticlesIndexes() const final {
+        return my_particles_positions_indexes.get();
+    }
+
+    int getLocalNbParticles() const final {
+        return my_nb_particles;
+    }
+
+    int getNbRhs() const final {
+        return int(my_particles_rhs.size());
+    }
+
+    void checkNan() const { // TODO remove
+        for(int idx_part = 0 ; idx_part < my_nb_particles ; ++idx_part){ // TODO remove me
+            assert(std::isnan(my_particles_positions[idx_part*3+IDX_X]) == false);
+            assert(std::isnan(my_particles_positions[idx_part*3+IDX_Y]) == false);
+            assert(std::isnan(my_particles_positions[idx_part*3+IDX_Z]) == false);
+
+            for(int idx_rhs = 0 ; idx_rhs < int(my_particles_rhs.size()) ; ++idx_rhs){
+                assert(std::isnan(my_particles_rhs[idx_rhs][idx_part*3+IDX_X]) == false);
+                assert(std::isnan(my_particles_rhs[idx_rhs][idx_part*3+IDX_Y]) == false);
+                assert(std::isnan(my_particles_rhs[idx_rhs][idx_part*3+IDX_Z]) == false);
+            }
+        }
+    }
+};
+
+
+#endif
diff --git a/bfps/cpp/particles/particles_system_builder.hpp b/bfps/cpp/particles/particles_system_builder.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d314ab5001410ae0c2529395e7910614b432819a
--- /dev/null
+++ b/bfps/cpp/particles/particles_system_builder.hpp
@@ -0,0 +1,253 @@
+#ifndef PARTICLES_SYSTEM_BUILDER_HPP
+#define PARTICLES_SYSTEM_BUILDER_HPP
+
+#include <string>
+#include <iostream>
+#include <stdexcept>
+#include <cmath>
+
+#include "abstract_particles_system.hpp"
+#include "particles_system.hpp"
+#include "particles_input_hdf5.hpp"
+#include "particles_interp_spline.hpp"
+
+#include "field.hpp"
+#include "kspace.hpp"
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+///
+/// Double template "for"
+///
+//////////////////////////////////////////////////////////////////////////////
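+// Usage sketch (illustrative only, not part of the build): the machinery
+// below dispatches two runtime integers onto compile-time template
+// parameters. Given a hypothetical functor
+//
+//     struct my_func{
+//         template <int i, int j>
+//         static int instanciate(int offset){ return 10*i + j + offset; }
+//     };
+//
+// the call
+//
+//     Template_double_for_if::evaluate<int,
+//             int, 0, 3, 1,  // i iterates over {0, 1, 2}
+//             int, 0, 2, 1,  // j iterates over {0, 1}
+//             my_func>(2, 1, 5);
+//
+// instantiates my_func::instanciate<2, 1> and returns 26.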
+
+namespace Template_double_for_if{
+
+template <class RetType,
+          class IterType1, IterType1 CurrentIter1,
+          class IterType2, const IterType2 CurrentIter2, const IterType2 iterTo2, const IterType2 IterStep2,
+          class Func, bool IsNotOver, typename... Args>
+struct For2{
+    static RetType evaluate(IterType2 value2, Args... args){
+        if(CurrentIter2 == value2){
+            return std::move(Func::template instanciate<CurrentIter1, CurrentIter2>(args...));
+        }
+        else{
+            return std::move(For2<RetType,
+                                        IterType1, CurrentIter1,
+                                        IterType2, CurrentIter2+IterStep2, iterTo2, IterStep2,
+                                        Func, (CurrentIter2+IterStep2 < iterTo2), Args...>::evaluate(value2, args...));
+        }
+    }
+};
+
+template <class RetType,
+          class IterType1, IterType1 CurrentIter1,
+          class IterType2, const IterType2 CurrentIter2, const IterType2 iterTo2, const IterType2 IterStep2,
+          class Func, typename... Args>
+struct For2<RetType,
+                  IterType1, CurrentIter1,
+                  IterType2, CurrentIter2, iterTo2, IterStep2,
+                  Func, false, Args...>{
+    static RetType evaluate(IterType2 value2, Args... args){
+        std::cout << __FUNCTION__ << "[ERROR] template value " << value2 << " for loop 2 does not exist\n";
+        return RetType();
+    }
+};
+
+template <class RetType,
+          class IterType1, const IterType1 CurrentIter1, const IterType1 iterTo1, const IterType1 IterStep1,
+          class IterType2, const IterType2 IterFrom2, const IterType2 iterTo2, const IterType2 IterStep2,
+          class Func, bool IsNotOver, typename... Args>
+struct For1{
+    static RetType evaluate(IterType1 value1, IterType2 value2, Args... args){
+        if(CurrentIter1 == value1){
+            return std::move(For2<RetType,
+                                        IterType1, CurrentIter1,
+                                        IterType2, IterFrom2, iterTo2, IterStep2,
+                                        Func, (IterFrom2<iterTo2), Args...>::evaluate(value2, args...));
+        }
+        else{
+            return std::move(For1<RetType,
+                              IterType1, CurrentIter1+IterStep1, iterTo1, IterStep1,
+                              IterType2, IterFrom2, iterTo2, IterStep2,
+                              Func, (CurrentIter1+IterStep1 < iterTo1), Args...>::evaluate(value1, value2, args...));
+        }
+    }
+};
+
+template <class RetType,
+          class IterType1, const IterType1 IterFrom1, const IterType1 iterTo1, const IterType1 IterStep1,
+          class IterType2, const IterType2 IterFrom2, const IterType2 iterTo2, const IterType2 IterStep2,
+          class Func, typename... Args>
+struct For1<RetType,
+                IterType1, IterFrom1, iterTo1, IterStep1,
+                IterType2, IterFrom2, iterTo2, IterStep2,
+                Func, false, Args...>{
+    static RetType evaluate(IterType1 value1, IterType2 value2, Args... args){
+        std::cout << __FUNCTION__ << "[ERROR] template value " << value1 << " for loop 1 does not exist\n";
+        return RetType();
+    }
+};
+
+template <class RetType,
+          class IterType1, const IterType1 IterFrom1, const IterType1 iterTo1, const IterType1 IterStep1,
+          class IterType2, const IterType2 IterFrom2, const IterType2 iterTo2, const IterType2 IterStep2,
+          class Func, typename... Args>
+inline RetType evaluate(IterType1 value1, IterType2 value2, Args... args){
+    return std::move(For1<RetType,
+            IterType1, IterFrom1, iterTo1, IterStep1,
+            IterType2, IterFrom2, iterTo2, IterStep2,
+            Func, (IterFrom1<iterTo1), Args...>::evaluate(value1, value2, args...));
+}
+
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+///
+/// Builder Functions
+///
+//////////////////////////////////////////////////////////////////////////////
+
+template <class field_rnumber, field_backend be, class particles_rnumber>
+struct particles_system_build_container {
+    template <const int interpolation_size, const int spline_mode>
+    static std::unique_ptr<abstract_particles_system<particles_rnumber>> instanciate(
+             const field<field_rnumber, be, THREE>* fs_field, // (field object)
+             const kspace<be, SMOOTH>* fs_kk, // (kspace object, contains dkx, dky, dkz)
+             const int nsteps, // to check coherency between parameters and hdf input file (nb rhs)
+             const int nparticles, // to check coherency between parameters and hdf input file
+             const std::string& fname_input, // particles input filename
+             const std::string& inDatanameState, const std::string& inDatanameRhs, // input dataset names
+             MPI_Comm mpi_comm){
+
+        // The size of the field grid (global size)
+        std::array<size_t,3> field_grid_dim;
+        field_grid_dim[IDX_X] = fs_field->rlayout->sizes[IDX_X];// nx
+        field_grid_dim[IDX_Y] = fs_field->rlayout->sizes[IDX_Y];// ny
+        field_grid_dim[IDX_Z] = fs_field->rlayout->sizes[IDX_Z];// nz
+
+        // The size of the local field grid (the field nodes that belong to current process)
+        std::array<size_t,3> local_field_dims;
+        local_field_dims[IDX_X] = fs_field->rlayout->subsizes[IDX_X];
+        local_field_dims[IDX_Y] = fs_field->rlayout->subsizes[IDX_Y];
+        local_field_dims[IDX_Z] = fs_field->rlayout->subsizes[IDX_Z];
+
+        // The offset of the local field grid
+        std::array<size_t,3> local_field_offset;
+        local_field_offset[IDX_X] = fs_field->rlayout->starts[IDX_X];
+        local_field_offset[IDX_Y] = fs_field->rlayout->starts[IDX_Y];
+        local_field_offset[IDX_Z] = fs_field->rlayout->starts[IDX_Z];
+
+
+        // Retrieve the split used by FFTW, to find the processes that have no work
+        int my_rank, nb_processes;
+        AssertMpi(MPI_Comm_rank(mpi_comm, &my_rank));
+        AssertMpi(MPI_Comm_size(mpi_comm, &nb_processes));
+
+        const int split_step = (int(field_grid_dim[IDX_Z])+nb_processes-1)/nb_processes;
+        const int nb_processes_involved = (int(field_grid_dim[IDX_Z])+split_step-1)/split_step;
+
+        assert((my_rank < nb_processes_involved && local_field_dims[IDX_Z] != 0)
+               || (nb_processes_involved <= my_rank && local_field_dims[IDX_Z] == 0));
+        assert(nb_processes_involved <= int(field_grid_dim[IDX_Z]));
+
+        // Make the idle processes start from the upper limit (not 0, as set by fftw)
+        if(nb_processes_involved <= my_rank){
+            local_field_offset[IDX_Z] = field_grid_dim[IDX_Z];
+        }
+
+        // Ensure that 1D partitioning is used
+        {
+            assert(local_field_offset[IDX_X] == 0);
+            assert(local_field_offset[IDX_Y] == 0);
+            assert(local_field_dims[IDX_X] == field_grid_dim[IDX_X]);
+            assert(local_field_dims[IDX_Y] == field_grid_dim[IDX_Y]);
+
+            assert(my_rank >= nb_processes_involved || ((my_rank == 0 && local_field_offset[IDX_Z] == 0)
+                   || (my_rank != 0 && local_field_offset[IDX_Z] != 0)));
+            assert(my_rank >= nb_processes_involved || ((my_rank == nb_processes_involved-1 && local_field_offset[IDX_Z]+local_field_dims[IDX_Z] == field_grid_dim[IDX_Z])
+                   || (my_rank != nb_processes_involved-1 && local_field_offset[IDX_Z]+local_field_dims[IDX_Z] != field_grid_dim[IDX_Z])));
+        }
+
+        // The memory dimensions of the local field grid (may include padding)
+        std::array<size_t,3> local_field_mem_size;
+        local_field_mem_size[IDX_X] = fs_field->rmemlayout->subsizes[IDX_X];
+        local_field_mem_size[IDX_Y] = fs_field->rmemlayout->subsizes[IDX_Y];
+        local_field_mem_size[IDX_Z] = fs_field->rmemlayout->subsizes[IDX_Z];
+
+        // The spatial box size (all particles should fit inside it)
+        std::array<particles_rnumber,3> spatial_box_width;
+        spatial_box_width[IDX_X] = 4 * acos(0) / (fs_kk->dkx);// 4*acos(0) == 2*pi
+        spatial_box_width[IDX_Y] = 4 * acos(0) / (fs_kk->dky);
+        spatial_box_width[IDX_Z] = 4 * acos(0) / (fs_kk->dkz);
+
+        // The distance between two field nodes in each direction
+        std::array<particles_rnumber,3> spatial_partition_width;
+        spatial_partition_width[IDX_X] = spatial_box_width[IDX_X]/particles_rnumber(field_grid_dim[IDX_X]);
+        spatial_partition_width[IDX_Y] = spatial_box_width[IDX_Y]/particles_rnumber(field_grid_dim[IDX_Y]);
+        spatial_partition_width[IDX_Z] = spatial_box_width[IDX_Z]/particles_rnumber(field_grid_dim[IDX_Z]);
+        // The spatial interval of the current process
+        const particles_rnumber my_spatial_low_limit_z = particles_rnumber(local_field_offset[IDX_Z])*spatial_partition_width[IDX_Z];
+        const particles_rnumber my_spatial_up_limit_z = particles_rnumber(local_field_offset[IDX_Z]+local_field_dims[IDX_Z])*spatial_partition_width[IDX_Z];
+
+        // Create the particles system
+        particles_system<particles_rnumber, field_rnumber, particles_interp_spline<particles_rnumber, interpolation_size,spline_mode>, interpolation_size>* part_sys
+         = new particles_system<particles_rnumber, field_rnumber, particles_interp_spline<particles_rnumber, interpolation_size,spline_mode>, interpolation_size>(field_grid_dim,
+                                                                                                   spatial_box_width,
+                                                                                                   spatial_partition_width,
+                                                                                                   my_spatial_low_limit_z,
+                                                                                                   my_spatial_up_limit_z,
+                                                                                                   fs_field->get_rdata(),
+                                                                                                   local_field_dims,
+                                                                                                   local_field_offset,
+                                                                                                   local_field_mem_size,
+                                                                                                   mpi_comm);
+
+        // Load particles from hdf5
+        particles_input_hdf5<particles_rnumber, 3,3> generator(mpi_comm, fname_input,
+                                            inDatanameState, inDatanameRhs, my_spatial_low_limit_z, my_spatial_up_limit_z);
+
+        // Ensure parameters match the input file
+        if(generator.getNbRhs() != nsteps){
+            throw std::runtime_error(std::string("Nb steps is ") + std::to_string(nsteps)
+                               + " in the parameters but " + std::to_string(generator.getNbRhs()) + " in the particles file.");
+        }
+        if(generator.getTotalNbParticles() != nparticles){
+            throw std::runtime_error(std::string("Nb particles is ") + std::to_string(nparticles)
+                               + " in the parameters but " + std::to_string(generator.getTotalNbParticles()) + " in the particles file.");
+        }
+
+        // Load the particles and move them to the particles system
+        part_sys->init(generator);
+
+        assert(part_sys->getNbRhs() == nsteps);
+
+        // Return the created particles system
+        return std::unique_ptr<abstract_particles_system<particles_rnumber>>(part_sys);
+    }
+};
+
+
+template <class field_rnumber, field_backend be, class particles_rnumber = double>
+inline std::unique_ptr<abstract_particles_system<particles_rnumber>> particles_system_builder(
+        const field<field_rnumber, be, THREE>* fs_field, // (field object)
+        const kspace<be, SMOOTH>* fs_kk, // (kspace object, contains dkx, dky, dkz)
+        const int nsteps, // to check coherency between parameters and hdf input file (nb rhs)
+        const int nparticles, // to check coherency between parameters and hdf input file
+        const std::string& fname_input, // particles input filename
+        const std::string& inDatanameState, const std::string& inDatanameRhs, // input dataset names
+        const int interpolation_size,
+        const int spline_mode,
+        MPI_Comm mpi_comm){
+    return Template_double_for_if::evaluate<std::unique_ptr<abstract_particles_system<particles_rnumber>>,
+                       int, 1, 7, 1, // interpolation_size
+                       int, 0, 3, 1, // spline_mode
+                       particles_system_build_container<field_rnumber,be,particles_rnumber>>(
+                           interpolation_size, // template iterator 1
+                           spline_mode, // template iterator 2
+                           fs_field,fs_kk, nsteps, nparticles, fname_input, inDatanameState, inDatanameRhs, mpi_comm);
+}
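+// Typical call (sketch; the variable names are illustrative and FFTW is
+// assumed to be the field backend in use):
+//
+//     std::unique_ptr<abstract_particles_system<double>> ps =
+//             particles_system_builder<float, FFTW>(
+//                 velocity_field, kk, nsteps, nparticles,
+//                 "particles_input.h5", "tracers0/state", "tracers0/rhs",
+//                 interp_neighbours, spline_mode, MPI_COMM_WORLD);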
+
+
+#endif
diff --git a/bfps/cpp/particles/particles_utils.hpp b/bfps/cpp/particles/particles_utils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ebd79641bec71671fbcfa5788cf2134dad61b0e
--- /dev/null
+++ b/bfps/cpp/particles/particles_utils.hpp
@@ -0,0 +1,300 @@
+#ifndef PARTICLES_UTILS_HPP
+#define PARTICLES_UTILS_HPP
+
+#include <mpi.h>
+
+#include <cassert>
+#include <cstdio>
+#include <stack>
+#include <vector>
+#include <memory>
+#include <cstring>
+#include <stdexcept>
+
+#if _OPENMP < 201511
+#warning OpenMP priority is not supported here
+#define priority(x)
+#endif
+
+
+#ifndef AssertMpi
+#define AssertMpi(X) if(MPI_SUCCESS != (X)) { printf("MPI Error at line %d\n",__LINE__); fflush(stdout) ; throw std::runtime_error("Stopping due to MPI error"); }
+#endif
+
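+/* indices of the z, y and x dimensions in the 3D arrays used by the particles
+ * code; z comes first because the fields are stored with z as the slowest
+ * index (FFTW-style slab decomposition along z) */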
+enum IDXS_3D {
+    IDX_X = 2,
+    IDX_Y = 1,
+    IDX_Z = 0
+};
+
+namespace particles_utils {
+
+class GetMpiType{
+    const MPI_Datatype type;
+public:
+    explicit GetMpiType(const int&) : type(MPI_INT){}
+    explicit GetMpiType(const double&) : type(MPI_DOUBLE){}
+    explicit GetMpiType(const float&) : type(MPI_FLOAT){}
+    explicit GetMpiType(const char&) : type(MPI_CHAR){}
+    explicit GetMpiType(const long&) : type(MPI_LONG){}
+
+    /*do not make it explicit*/ operator MPI_Datatype() const { return type; }
+};
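+// Usage sketch: deduce the MPI datatype from a value of the matching C++ type,
+// e.g. MPI_Send(buffer, nb_values, particles_utils::GetMpiType(real_number()),
+// dest, tag, comm);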
+
+
+template <int nb_values, class real_number, class Predicate>
+inline int partition(real_number* array, const int size, Predicate pdc)
+{
+    if(size == 0) return 0;
+    if(size == 1) return (pdc(&array[0])?1:0);
+
+    int idxInsert = 0;
+
+    for(int idx = 0 ; idx < size && pdc(&array[idx*nb_values]); ++idx){
+        idxInsert += 1;
+    }
+
+    for(int idx = idxInsert ; idx < size ; ++idx){
+        if(pdc(&array[idx*nb_values])){
+            for(int idxVal = 0 ; idxVal < nb_values ; ++idxVal){
+                std::swap(array[idx*nb_values + idxVal], array[idxInsert*nb_values + idxVal]);
+            }
+            idxInsert += 1;
+        }
+    }
+
+    return idxInsert;
+}
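+// Example (sketch): with nb_values = 1 and a predicate testing "< 3",
+//     double a[4] = {5, 1, 4, 2};
+//     int n = partition<1>(a, 4, [](const double* v){ return v[0] < 3; });
+// reorders a into {1, 2, 4, 5} and returns n = 2, the size of the first part.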
+
+
+template <int nb_values, class real_number, class Predicate1, class Predicate2>
+inline int partition_extra(real_number* array, const int size, Predicate1 pdc, Predicate2 pdcswap, const int offset_idx_swap = 0)
+{
+    if(size == 0) return 0;
+    if(size == 1) return (pdc(&array[0])?1:0);
+
+    int idxInsert = 0;
+
+    for(int idx = 0 ; idx < size && pdc(&array[idx*nb_values]); ++idx){
+        idxInsert += 1;
+    }
+
+    for(int idx = idxInsert ; idx < size ; ++idx){
+        if(pdc(&array[idx*nb_values])){
+            for(int idxVal = 0 ; idxVal < nb_values ; ++idxVal){
+                std::swap(array[idx*nb_values + idxVal], array[idxInsert*nb_values + idxVal]);
+            }
+            pdcswap(idx+offset_idx_swap, idxInsert+offset_idx_swap);
+            idxInsert += 1;
+        }
+    }
+
+    return idxInsert;
+}
+
+template <int nb_values, class real_number, class Predicate1, class Predicate2>
+inline void partition_extra_z(real_number* array, const int size, const int nb_partitions,
+                              int partitions_size[], int partitions_offset[],
+                              Predicate1 partitions_limits, Predicate2 pdcswap)
+{
+    if(nb_partitions == 0){
+        return ;
+    }
+
+    partitions_offset[0] = 0;
+    partitions_offset[nb_partitions] = size;
+
+    if(nb_partitions == 1){
+        partitions_size[0] = size;
+        return;
+    }
+
+    if(nb_partitions == 2){
+        const real_number limit = partitions_limits(0);
+        const int size_current = partition_extra<nb_values>(array, size,
+                [&](const real_number inval[]){
+            return inval[IDX_Z] < limit;
+        }, pdcswap);
+        partitions_size[0] = size_current;
+        partitions_size[1] = size-size_current;
+        partitions_offset[1] = size_current;
+        return;
+    }
+
+    std::stack<std::pair<int,int>> toproceed;
+
+    toproceed.push({0, nb_partitions});
+
+    while(toproceed.size()){
+        const std::pair<int,int> current_part = toproceed.top();
+        toproceed.pop();
+
+        assert(current_part.second-current_part.first >= 1);
+
+        if(current_part.second-current_part.first == 1){
+            partitions_size[current_part.first] = partitions_offset[current_part.first+1] - partitions_offset[current_part.first];
+        }
+        else{
+            const int idx_middle = (current_part.second-current_part.first)/2 + current_part.first - 1;
+
+            const int size_unpart = partitions_offset[current_part.second]- partitions_offset[current_part.first];
+
+            const real_number limit = partitions_limits(idx_middle);
+            const int size_current = partition_extra<nb_values>(&array[partitions_offset[current_part.first]*nb_values],
+                                                     size_unpart,
+                    [&](const real_number inval[]){
+                return inval[IDX_Z] < limit;
+            }, pdcswap, partitions_offset[current_part.first]);
+
+            partitions_offset[idx_middle+1] = size_current + partitions_offset[current_part.first];
+
+            toproceed.push({current_part.first, idx_middle+1});
+
+            toproceed.push({idx_middle+1, current_part.second});
+        }
+    }
+}
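+// Note: the stack-based bisection above splits each index range at its middle
+// partition, so every element is swapped O(log(nb_partitions)) times instead
+// of being tested against each partition limit in turn.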
+
+template <int nb_values, class real_number, class Predicate1, class Predicate2>
+inline std::pair<std::vector<int>,std::vector<int>> partition_extra_z(real_number* array, const int size,
+                                                                      const int nb_partitions, Predicate1 partitions_limits,
+                                                                        Predicate2 pdcswap){
+
+    std::vector<int> partitions_size(nb_partitions);
+    std::vector<int> partitions_offset(nb_partitions+1);
+    partition_extra_z<nb_values, real_number, Predicate1, Predicate2>(array, size, nb_partitions,
+                                                         partitions_size.data(), partitions_offset.data(),
+                                                         partitions_limits, pdcswap);
+    return {std::move(partitions_size), std::move(partitions_offset)};
+}
+
+
+template <class NumType = int>
+class IntervalSplitter {
+    const NumType nb_items;
+    const NumType nb_intervals;
+    const NumType my_idx;
+
+    double step_split;
+    NumType offset_mine;
+    NumType size_mine;
+public:
+    IntervalSplitter(const NumType in_nb_items,
+                     const NumType in_nb_intervals,
+                     const NumType in_my_idx)
+        : nb_items(in_nb_items), nb_intervals(in_nb_intervals), my_idx(in_my_idx),
+          step_split(0), offset_mine(0), size_mine(0){
+        if(nb_items <= nb_intervals){
+            step_split = 1;
+            if(my_idx < nb_items){
+                offset_mine = my_idx;
+                size_mine = 1;
+            }
+            else{
+                offset_mine = nb_items;
+                size_mine = 0;
+            }
+        }
+        else{
+            step_split = double(nb_items)/double(nb_intervals);
+            offset_mine = NumType(step_split*double(my_idx));
+            size_mine = (my_idx != nb_intervals-1 ? NumType(step_split*double(my_idx+1)) : nb_items) -offset_mine;
+        }
+    }
+
+    NumType getMySize() const {
+        return size_mine;
+    }
+
+    NumType getMyOffset() const {
+        return offset_mine;
+    }
+
+    NumType getSizeOther(const NumType in_idx_other) const {
+        return IntervalSplitter<NumType>(nb_items, nb_intervals, in_idx_other).getMySize();
+    }
+
+    NumType getOffsetOther(const NumType in_idx_other) const {
+        return IntervalSplitter<NumType>(nb_items, nb_intervals, in_idx_other).getMyOffset();
+    }
+
+    NumType getOwner(const NumType in_item_idx) const {
+        NumType owner = NumType(double(in_item_idx)/step_split);
+        if(owner != nb_intervals-1 && NumType(step_split*double(owner+1)) <= in_item_idx){
+            owner += 1;
+        }
+        assert(owner < nb_intervals);
+        assert(IntervalSplitter(nb_items, nb_intervals, owner).getMyOffset() <= in_item_idx);
+        assert(in_item_idx < IntervalSplitter(nb_items, nb_intervals, owner).getMySize()+IntervalSplitter(nb_items, nb_intervals, owner).getMyOffset());
+        return owner;
+    }
+};
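+// Example (sketch): IntervalSplitter<int>(10, 3, my_idx) splits 10 items over
+// 3 intervals; my_idx == 0 gets offset 0 and size 3, my_idx == 1 offset 3 and
+// size 3, my_idx == 2 offset 6 and size 4 (the last interval absorbs the
+// remainder).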
+
+// http://en.cppreference.com/w/cpp/algorithm/transform
+template<class InputIt, class OutputIt, class UnaryOperation>
+OutputIt transform(InputIt first1, InputIt last1, OutputIt d_first,
+                   UnaryOperation unary_op)
+{
+    while (first1 != last1) {
+        *d_first++ = unary_op(*first1++);
+    }
+    return d_first;
+}
+
+
+template <class NumType>
+void memzero(NumType* array, size_t size){
+    memset(array, 0, size*sizeof(NumType));
+}
+
+template <class NumType>
+void memzero(std::unique_ptr<NumType[]>& array, size_t size){
+    memset(array.get(), 0, size*sizeof(NumType));
+}
+
+
+class fixed_copy {
+    const size_t to_idx;
+    const size_t from_idx;
+    const size_t nb_elements_to_copy;
+
+public:
+    fixed_copy(const size_t in_to_idx, const size_t in_from_idx, const size_t in_nb_elements_to_copy)
+        : to_idx(in_to_idx), from_idx(in_from_idx), nb_elements_to_copy(in_nb_elements_to_copy){
+    }
+
+    fixed_copy(const size_t in_to_idx, const size_t in_nb_elements_to_copy)
+        : fixed_copy(in_to_idx, 0, in_nb_elements_to_copy){
+    }
+
+    fixed_copy(const size_t in_nb_elements_to_copy)
+        : fixed_copy(0, in_nb_elements_to_copy){
+    }
+
+    template <class ItemType>
+    const fixed_copy& copy(ItemType dest[], const ItemType source[]) const {
+        memcpy(&dest[to_idx], &source[from_idx], sizeof(ItemType)*nb_elements_to_copy);
+        return *this;
+    }
+
+    template <class ItemType>
+    const fixed_copy& copy(ItemType dest[], const ItemType source[], const size_t nb_values_per_element) const {
+        memcpy(&dest[to_idx*nb_values_per_element], &source[from_idx*nb_values_per_element], sizeof(ItemType)*nb_elements_to_copy*nb_values_per_element);
+        return *this;
+    }
+
+    template <class ItemType>
+    const fixed_copy& copy(std::unique_ptr<ItemType[]>& dest, const std::unique_ptr<ItemType[]>& source) const {
+        memcpy(&dest[to_idx], &source[from_idx], sizeof(ItemType)*nb_elements_to_copy);
+        return *this;
+    }
+
+    template <class ItemType>
+    const fixed_copy& copy(std::unique_ptr<ItemType[]>& dest, const std::unique_ptr<ItemType[]>& source, const size_t nb_values_per_element) const {
+        memcpy(&dest[to_idx*nb_values_per_element], &source[from_idx*nb_values_per_element], sizeof(ItemType)*nb_elements_to_copy*nb_values_per_element);
+        return *this;
+    }
+};
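+// Usage sketch: copy 4 elements of 3 values each, from index 2 of the source
+// to index 0 of the destination:
+//     fixed_copy(0, 2, 4).copy(dest, source, 3);
+// copies source[6 .. 18) into dest[0 .. 12).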
+
+
+}
+
+#endif
diff --git a/bfps/cpp/particles_base.cpp b/bfps/cpp/particles_base.cpp
index ff0fec32d4f0493814351788ca25081adfb27a12..1410488410a429ff463a1751e86f78cc2157679b 100644
--- a/bfps/cpp/particles_base.cpp
+++ b/bfps/cpp/particles_base.cpp
@@ -29,6 +29,7 @@
 #include <algorithm>
 #include <cassert>
 #include "particles_base.hpp"
+#include "scope_timer.hpp"
 
 template <particle_types particle_type>
 single_particle_state<particle_type>::single_particle_state()
@@ -88,6 +89,7 @@ int get_chunk_offsets(
         std::vector<hsize_t> chnk_dims,
         std::vector<std::vector<hsize_t>> &co)
 {
+    TIMEZONE("get_chunk_offsets");
     std::vector<hsize_t> nchunks(data_dims);
     int total_number_of_chunks = 1;
     for (unsigned i=0; i<nchunks.size(); i++)
@@ -121,6 +123,7 @@ particles_io_base<particle_type>::particles_io_base(
         const hid_t data_file_id,
         MPI_Comm COMM)
 {
+    TIMEZONE("particles_io_base::particles_io_base");
     this->name = std::string(NAME);
     this->traj_skip = TRAJ_SKIP;
     this->comm = COMM;
@@ -233,6 +236,7 @@ void particles_io_base<particle_type>::read_state_chunk(
         const int cindex,
         double *data)
 {
+    TIMEZONE("particles_io_base::read_state_chunk");
     DEBUG_MSG("entered read_state_chunk\n");
     hid_t dset = H5Dopen(this->hdf5_group_id, "state", H5P_DEFAULT);
     hid_t rspace = H5Dget_space(dset);
@@ -267,6 +271,7 @@ void particles_io_base<particle_type>::write_state_chunk(
         const int cindex,
         const double *data)
 {
+    TIMEZONE("particles_io_base::write_state_chunk");
     hid_t dset = H5Dopen(this->hdf5_group_id, "state", H5P_DEFAULT);
     hid_t rspace = H5Dget_space(dset);
     std::vector<hsize_t> mem_dims(this->hdf5_state_chunks);
@@ -300,6 +305,7 @@ void particles_io_base<particle_type>::read_rhs_chunk(
         const int rhsindex,
         double *data)
 {
+    TIMEZONE("particles_io_base::read_rhs_chunk");
     //DEBUG_MSG("entered read_rhs_chunk\n");
     hid_t dset = H5Dopen(this->hdf5_group_id, "rhs", H5P_DEFAULT);
     hid_t rspace = H5Dget_space(dset);
@@ -342,6 +348,7 @@ void particles_io_base<particle_type>::write_rhs_chunk(
         const int rhsindex,
         const double *data)
 {
+    TIMEZONE("particles_io_base::write_rhs_chunk");
     hid_t dset = H5Dopen(this->hdf5_group_id, "rhs", H5P_DEFAULT);
     hid_t rspace = H5Dget_space(dset);
     std::vector<hsize_t> mem_dims(this->hdf5_rhs_chunks);
@@ -379,6 +386,7 @@ void particles_io_base<particle_type>::write_point3D_chunk(
         const int cindex,
         const double *data)
 {
+    TIMEZONE("particles_io_base::write_point3D_chunk");
     hid_t dset = H5Dopen(this->hdf5_group_id, dset_name.c_str(), H5P_DEFAULT);
     hid_t rspace = H5Dget_space(dset);
     std::vector<hsize_t> mem_dims(this->hdf5_state_chunks);
diff --git a/bfps/cpp/rFFTW_distributed_particles.cpp b/bfps/cpp/rFFTW_distributed_particles.cpp
index ab694ab3cc226c4690970cf3959bb2c480207c61..265975f8c817a1b40942e076bd016c2921618bbc 100644
--- a/bfps/cpp/rFFTW_distributed_particles.cpp
+++ b/bfps/cpp/rFFTW_distributed_particles.cpp
@@ -32,10 +32,13 @@
 #include <string>
 #include <sstream>
 #include <set>
+#include <algorithm>
+#include <ctime>
 
 #include "base.hpp"
 #include "rFFTW_distributed_particles.hpp"
 #include "fftw_tools.hpp"
+#include "scope_timer.hpp"
 
 
 extern int myrank, nprocs;
@@ -44,14 +47,15 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rFFTW_distributed_particles(
         const char *NAME,
         const hid_t data_file_id,
-        rFFTW_interpolator<rnumber, interp_neighbours> *FIELD,
+        rFFTW_interpolator<rnumber, interp_neighbours> *VEL,
         const int TRAJ_SKIP,
         const int INTEGRATION_STEPS) : particles_io_base<particle_type>(
             NAME,
             TRAJ_SKIP,
             data_file_id,
-            FIELD->descriptor->comm)
+            VEL->descriptor->comm)
 {
+    TIMEZONE("rFFTW_distributed_particles::rFFTW_distributed_particles");
     /* check that integration_steps has a valid value.
      * If NDEBUG is defined, "assert" doesn't do anything.
      * With NDEBUG defined, and an invalid INTEGRATION_STEPS,
@@ -65,18 +69,21 @@ rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rFFTW_di
      * therefore I prefer to just kill the code at this point,
      * no matter whether or not NDEBUG is present.
      * */
-    if (interp_neighbours*2+2 > FIELD->descriptor->subsizes[0])
+    if (interp_neighbours*2+2 > VEL->descriptor->subsizes[0])
     {
         DEBUG_MSG("parameters incompatible with rFFTW_distributed_particles.\n"
                   "interp kernel size is %d, local_z_size is %d\n",
-                  interp_neighbours*2+2, FIELD->descriptor->subsizes[0]);
-        if (FIELD->descriptor->myrank == 0)
+                  interp_neighbours*2+2, VEL->descriptor->subsizes[0]);
+        if (VEL->descriptor->myrank == 0)
             std::cerr << "parameters incompatible with rFFTW_distributed_particles." << std::endl;
         exit(0);
     }
-    this->vel = FIELD;
+    this->vel = VEL;
     this->rhs.resize(INTEGRATION_STEPS);
     this->integration_steps = INTEGRATION_STEPS;
+    /* the particles are expected to be roughly evenly distributed among
+     * processes, therefore reserving twice the average amount of memory
+     * should be enough.
+     * */
     this->state.reserve(2*this->nparticles / this->nprocs);
     for (unsigned int i=0; i<this->rhs.size(); i++)
         this->rhs[i].reserve(2*this->nparticles / this->nprocs);
@@ -157,6 +164,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sam
         const std::unordered_map<int, std::unordered_set<int>> &dp,
         std::unordered_map<int, single_particle_state<POINT3D>> &y)
 {
+    TIMEZONE("rFFTW_distributed_particles::sample");
     double *yyy;
     double *yy;
     y.clear();
@@ -184,24 +192,35 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sam
             int tindex;
             tindex = 0;
             // can this sorting be done more efficiently?
-            std::set<int> ordered_dp;
+            std::vector<int> ordered_dp;
+            {
+                TIMEZONE("rFFTW_distributed_particles::sample::ordered_dp");
+                ordered_dp.reserve(dp.at(domain_index).size());
             for (auto p: dp.at(domain_index))
-                ordered_dp.insert(p);
+                ordered_dp.push_back(p);
+                std::sort(ordered_dp.begin(), ordered_dp.end());
+            }
 
             for (auto p: ordered_dp)
             {
                 (*field)(x.at(p).data, yy + tindex*3);
                 tindex++;
             }
-            MPI_Allreduce(
+            {
+                TIMEZONE("rFFTW_distributed_particles::sample::MPI_Allreduce");
+                MPI_Allreduce(
                     yy,
                     yyy,
                     3*dp.at(domain_index).size(),
                     MPI_DOUBLE,
                     MPI_SUM,
                     this->domain_comm[domain_index]);
+            }
             tindex = 0;
             for (auto p: ordered_dp)
             {
                 y[p] = yyy + tindex*3;
                 tindex++;
@@ -224,8 +243,10 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::get
         case VELOCITY_TRACER:
             this->sample(this->vel, x, dp, yy);
             y.clear();
-            for (auto &pp: x)
-                y[pp.first] = yy[pp.first].data;
+            y.reserve(yy.size());
+            y.rehash(this->nparticles);
+            for (auto &pp: yy)
+                y[pp.first] = pp.second.data;
             break;
     }
 }
@@ -253,31 +274,38 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
         std::vector<std::unordered_map<int, single_particle_state<particle_type>>> &vals,
         std::unordered_map<int, std::unordered_set<int>> &dp)
 {
+    TIMEZONE("rFFTW_distributed_particles::redistribute");
     //DEBUG_MSG("entered redistribute\n");
     /* get new distribution of particles */
     std::unordered_map<int, std::unordered_set<int>> newdp;
-    this->sort_into_domains(x, newdp);
+    {
+        TIMEZONE("sort_into_domains");
+        this->sort_into_domains(x, newdp);
+    }
     /* take care of particles that are leaving the shared domains */
     int dindex[2] = {-1, 1};
     // for each D of the 2 shared domains
-    for (int di=0; di<2; di++)
-        // for all particles previously in D
-        for (auto p: dp[dindex[di]])
-        {
-            // if the particle is no longer in D
-            if (newdp[dindex[di]].find(p) == newdp[dindex[di]].end())
+    {
+        TIMEZONE("Loop1");
+        for (int di=0; di<2; di++)
+            // for all particles previously in D
+            for (auto p: dp[dindex[di]])
             {
-                // and the particle is not in the local domain
-                if (newdp[0].find(p) == newdp[0].end())
+                // if the particle is no longer in D
+                if (newdp[dindex[di]].find(p) == newdp[dindex[di]].end())
                 {
-                    // remove the particle from the local list
-                    x.erase(p);
-                    for (unsigned int i=0; i<vals.size(); i++)
-                        vals[i].erase(p);
+                    // and the particle is not in the local domain
+                    if (newdp[0].find(p) == newdp[0].end())
+                    {
+                        // remove the particle from the local list
+                        x.erase(p);
+                        for (unsigned int i=0; i<vals.size(); i++)
+                            vals[i].erase(p);
+                    }
+                    // if the particle is in the local domain, do nothing
                 }
-                // if the particle is in the local domain, do nothing
             }
-        }
+    }
     /* take care of particles that are entering the shared domains */
     /* neighbouring rank offsets */
     int ro[2];
@@ -285,16 +313,23 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
     ro[1] = 1;
     /* particles to send, particles to receive */
     std::vector<int> ps[2], pr[2];
+    for (int tcounter = 0; tcounter < 2; tcounter++)
+    {
+        ps[tcounter].reserve(newdp[dindex[tcounter]].size());
+    }
     /* number of particles to send, number of particles to receive */
     int nps[2], npr[2];
     int rsrc, rdst;
     /* get list of id-s to send */
-    for (auto &p: dp[0])
     {
-        for (int di=0; di<2; di++)
+        TIMEZONE("Loop2");
+        for (auto &p: dp[0])
         {
-            if (newdp[dindex[di]].find(p) != newdp[dindex[di]].end())
-                ps[di].push_back(p);
+            for (int di=0; di<2; di++)
+            {
+                if (newdp[dindex[di]].find(p) != newdp[dindex[di]].end())
+                    ps[di].push_back(p);
+            }
         }
     }
     /* prepare data for send recv */
@@ -304,7 +339,8 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
         for (int i=0; i<2; i++)
         {
             rdst = MOD(rsrc+ro[i], this->nprocs);
-            if (this->myrank == rsrc)
+            if (this->myrank == rsrc){
+                TIMEZONE("MPI_Send");
                 MPI_Send(
                         nps+i,
                         1,
@@ -312,7 +348,9 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
                         rdst,
                         2*(rsrc*this->nprocs + rdst)+i,
                         this->comm);
-            if (this->myrank == rdst)
+            }
+            if (this->myrank == rdst){
+                TIMEZONE("MPI_Recv");
                 MPI_Recv(
                         npr+1-i,
                         1,
@@ -321,6 +359,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
                         2*(rsrc*this->nprocs + rdst)+i,
                         this->comm,
                         MPI_STATUS_IGNORE);
+            }
         }
     //DEBUG_MSG("I have to send %d %d particles\n", nps[0], nps[1]);
     //DEBUG_MSG("I have to recv %d %d particles\n", npr[0], npr[1]);
@@ -338,6 +377,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
             rdst = MOD(rsrc+ro[i], this->nprocs);
             if (this->myrank == rsrc && nps[i] > 0)
             {
+                TIMEZONE("this->myrank == rsrc && nps[i] > 0");
                 MPI_Send(
                         &ps[i].front(),
                         nps[i],
@@ -369,6 +409,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
             }
             if (this->myrank == rdst && npr[1-i] > 0)
             {
+                TIMEZONE("this->myrank == rdst && npr[1-i] > 0");
                 MPI_Recv(
                         &pr[1-i].front(),
                         npr[1-i],
@@ -401,8 +442,10 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::red
     delete[] buffer;
     // x has been changed, so newdp is obsolete
     // we need to sort into domains again
-    this->sort_into_domains(x, dp);
-
+    {
+        TIMEZONE("sort_into_domains2");
+        this->sort_into_domains(x, dp);
+    }
 
 #ifndef NDEBUG
     /* check that all particles at x are local */
@@ -425,44 +468,51 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::Ada
         const int nsteps)
 {
     this->get_rhs(this->state, this->domain_particles, this->rhs[0]);
-    for (auto &pp: this->state)
+#define AdamsBashforth_LOOP_PREAMBLE \
+    for (auto &pp: this->state) \
         for (unsigned int i=0; i<state_dimension(particle_type); i++)
-            switch(nsteps)
-            {
-                case 1:
-                    pp.second[i] += this->dt*this->rhs[0][pp.first][i];
-                    break;
-                case 2:
-                    pp.second[i] += this->dt*(3*this->rhs[0][pp.first][i]
-                                            -   this->rhs[1][pp.first][i])/2;
-                    break;
-                case 3:
-                    pp.second[i] += this->dt*(23*this->rhs[0][pp.first][i]
-                                            - 16*this->rhs[1][pp.first][i]
-                                            +  5*this->rhs[2][pp.first][i])/12;
-                    break;
-                case 4:
-                    pp.second[i] += this->dt*(55*this->rhs[0][pp.first][i]
-                                            - 59*this->rhs[1][pp.first][i]
-                                            + 37*this->rhs[2][pp.first][i]
-                                            -  9*this->rhs[3][pp.first][i])/24;
-                    break;
-                case 5:
-                    pp.second[i] += this->dt*(1901*this->rhs[0][pp.first][i]
-                                            - 2774*this->rhs[1][pp.first][i]
-                                            + 2616*this->rhs[2][pp.first][i]
-                                            - 1274*this->rhs[3][pp.first][i]
-                                            +  251*this->rhs[4][pp.first][i])/720;
-                    break;
-                case 6:
-                    pp.second[i] += this->dt*(4277*this->rhs[0][pp.first][i]
-                                            - 7923*this->rhs[1][pp.first][i]
-                                            + 9982*this->rhs[2][pp.first][i]
-                                            - 7298*this->rhs[3][pp.first][i]
-                                            + 2877*this->rhs[4][pp.first][i]
-                                            -  475*this->rhs[5][pp.first][i])/1440;
-                    break;
-            }
+    switch(nsteps)
+    {
+        case 1:
+            AdamsBashforth_LOOP_PREAMBLE
+            pp.second[i] += this->dt*this->rhs[0][pp.first][i];
+            break;
+        case 2:
+            AdamsBashforth_LOOP_PREAMBLE
+            pp.second[i] += this->dt*(3*this->rhs[0][pp.first][i]
+                                    -   this->rhs[1][pp.first][i])/2;
+            break;
+        case 3:
+            AdamsBashforth_LOOP_PREAMBLE
+            pp.second[i] += this->dt*(23*this->rhs[0][pp.first][i]
+                                    - 16*this->rhs[1][pp.first][i]
+                                    +  5*this->rhs[2][pp.first][i])/12;
+            break;
+        case 4:
+            AdamsBashforth_LOOP_PREAMBLE
+            pp.second[i] += this->dt*(55*this->rhs[0][pp.first][i]
+                                    - 59*this->rhs[1][pp.first][i]
+                                    + 37*this->rhs[2][pp.first][i]
+                                    -  9*this->rhs[3][pp.first][i])/24;
+            break;
+        case 5:
+            AdamsBashforth_LOOP_PREAMBLE
+            pp.second[i] += this->dt*(1901*this->rhs[0][pp.first][i]
+                                    - 2774*this->rhs[1][pp.first][i]
+                                    + 2616*this->rhs[2][pp.first][i]
+                                    - 1274*this->rhs[3][pp.first][i]
+                                    +  251*this->rhs[4][pp.first][i])/720;
+            break;
+        case 6:
+            AdamsBashforth_LOOP_PREAMBLE
+            pp.second[i] += this->dt*(4277*this->rhs[0][pp.first][i]
+                                    - 7923*this->rhs[1][pp.first][i]
+                                    + 9982*this->rhs[2][pp.first][i]
+                                    - 7298*this->rhs[3][pp.first][i]
+                                    + 2877*this->rhs[4][pp.first][i]
+                                    -  475*this->rhs[5][pp.first][i])/1440;
+            break;
+    }
+#undef AdamsBashforth_LOOP_PREAMBLE
     this->redistribute(this->state, this->rhs, this->domain_particles);
     this->roll_rhs();
 }
@@ -471,6 +521,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::Ada
 template <particle_types particle_type, class rnumber, int interp_neighbours>
 void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::step()
 {
+    TIMEZONE("rFFTW_distributed_particles::step");
     this->AdamsBashforth((this->iteration < this->integration_steps) ?
                           this->iteration+1 :
                           this->integration_steps);
@@ -483,6 +534,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sor
         const std::unordered_map<int, single_particle_state<particle_type>> &x,
         std::unordered_map<int, std::unordered_set<int>> &dp)
 {
+    TIMEZONE("rFFTW_distributed_particles::sort_into_domains");
     int tmpint1, tmpint2;
     dp.clear();
     dp[-1] = std::unordered_set<int>();
@@ -521,19 +573,25 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::sor
 template <particle_types particle_type, class rnumber, int interp_neighbours>
 void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::read()
 {
+    TIMEZONE("rFFTW_distributed_particles::read");
     double *temp = new double[this->chunk_size*state_dimension(particle_type)];
     int tmpint1, tmpint2;
     for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++)
     {
         //read state
-        if (this->myrank == 0)
+        if (this->myrank == 0){
+            TIMEZONE("read_state_chunk");
             this->read_state_chunk(cindex, temp);
-        MPI_Bcast(
+        }
+        {
+            TIMEZONE("MPI_Bcast");
+            MPI_Bcast(
                 temp,
                 this->chunk_size*state_dimension(particle_type),
                 MPI_DOUBLE,
                 0,
                 this->comm);
+        }
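+        // rank 0 reads each chunk of the state dataset and broadcasts it;
+        // each rank then keeps only the particles it is responsible for,
+        // according to get_rank_info below.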
         for (unsigned int p=0; p<this->chunk_size; p++)
         {
             if (this->vel->get_rank_info(temp[state_dimension(particle_type)*p+2], tmpint1, tmpint2))
@@ -542,17 +600,23 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rea
             }
         }
         //read rhs
-        if (this->iteration > 0)
+        if (this->iteration > 0){
+            TIMEZONE("this->iteration > 0");
             for (int i=0; i<this->integration_steps; i++)
             {
-                if (this->myrank == 0)
+                if (this->myrank == 0){
+                    TIMEZONE("read_rhs_chunk");
                     this->read_rhs_chunk(cindex, i, temp);
-                MPI_Bcast(
+                }
+                {
+                    TIMEZONE("MPI_Bcast");
+                    MPI_Bcast(
                         temp,
                         this->chunk_size*state_dimension(particle_type),
                         MPI_DOUBLE,
                         0,
                         this->comm);
+                }
                 for (unsigned int p=0; p<this->chunk_size; p++)
                 {
                     auto pp = this->state.find(p+cindex*this->chunk_size);
@@ -560,6 +624,7 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::rea
                         this->rhs[i][p+cindex*this->chunk_size] = temp + state_dimension(particle_type)*p;
                 }
             }
+        }
     }
     this->sort_into_domains(this->state, this->domain_particles);
     DEBUG_MSG("%s->state.size = %ld\n", this->name.c_str(), this->state.size());
@@ -575,31 +640,48 @@ void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::wri
         const char *dset_name,
         std::unordered_map<int, single_particle_state<POINT3D>> &y)
 {
-    double *data = new double[this->nparticles*3];
-    double *yy = new double[this->nparticles*3];
-    int pindex = 0;
-    for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++)
+    TIMEZONE("rFFTW_distributed_particles::write");
+    double *data = new double[this->chunk_size*3];
+    double *yy = new double[this->chunk_size*3];
+    //int pindex = 0;
+    for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++)
     {
         std::fill_n(yy, this->chunk_size*3, 0);
-        for (unsigned int p=0; p<this->chunk_size; p++, pindex++)
+        //for (unsigned int p=0; p<this->chunk_size; p++, pindex++)
+        //{
+        //    if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() ||
+        //        this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end())
+        //    {
+        //        std::copy(y[pindex].data,
+        //                  y[pindex].data + 3,
+        //                  yy + p*3);
+        //    }
+        //}
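+        // of the locally known domains, only -1 and 0 (the same two checked
+        // by the old code above) can contain particles whose state is
+        // available on this process; all other entries of "yy" stay zero and
+        // are filled in by the Allreduce below.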
+        for (int s = -1; s <= 0; s++)
+            for (auto &pp: this->domain_particles[s])
+            {
+                if (pp >= int(cindex*this->chunk_size) &&
+                    pp < int((cindex+1)*this->chunk_size))
+                {
+                    std::copy(y[pp].data,
+                              y[pp].data + 3,
+                              yy + (pp-cindex*this->chunk_size)*3);
+                }
+            }
         {
-            if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() ||
-                this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end())
-            {
-                std::copy(y[pindex].data,
-                          y[pindex].data + 3,
-                          yy + p*3);
-            }
-        }
-        MPI_Allreduce(
+            TIMEZONE("MPI_Allreduce");
+            MPI_Allreduce(
                 yy,
                 data,
                 3*this->chunk_size,
                 MPI_DOUBLE,
                 MPI_SUM,
                 this->comm);
-        if (this->myrank == 0)
+        }
+        if (this->myrank == 0){
+            TIMEZONE("write_point3D_chunk");
             this->write_point3D_chunk(dset_name, cindex, data);
+        }
     }
     delete[] yy;
     delete[] data;
@@ -609,59 +691,96 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 void rFFTW_distributed_particles<particle_type, rnumber, interp_neighbours>::write(
         const bool write_rhs)
 {
+    TIMEZONE("rFFTW_distributed_particles::write2");
     double *temp0 = new double[this->chunk_size*state_dimension(particle_type)];
     double *temp1 = new double[this->chunk_size*state_dimension(particle_type)];
-    int pindex = 0;
+    //int pindex = 0;
     for (unsigned int cindex=0; cindex<this->get_number_of_chunks(); cindex++)
     {
         //write state
         std::fill_n(temp0, state_dimension(particle_type)*this->chunk_size, 0);
-        pindex = cindex*this->chunk_size;
-        for (unsigned int p=0; p<this->chunk_size; p++, pindex++)
+        //pindex = cindex*this->chunk_size;
+        //for (unsigned int p=0; p<this->chunk_size; p++, pindex++)
+        //{
+        //    if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() ||
+        //        this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end())
+        //    {
+        //        TIMEZONE("std::copy");
+        //        std::copy(this->state[pindex].data,
+        //                  this->state[pindex].data + state_dimension(particle_type),
+        //                  temp0 + p*state_dimension(particle_type));
+        //    }
+        //}
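+        // same domain traversal as in the point3D write above: copy the
+        // locally available particles into temp0, then sum across ranks.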
+        for (int s = -1; s <= 0; s++)
+            for (auto &pp: this->domain_particles[s])
+            {
+                if (pp >= int(cindex*this->chunk_size) &&
+                    pp < int((cindex+1)*this->chunk_size))
+                {
+                    std::copy(this->state[pp].data,
+                              this->state[pp].data + state_dimension(particle_type),
+                              temp0 + (pp-cindex*this->chunk_size)*state_dimension(particle_type));
+                }
+            }
         {
-            if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() ||
-                this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end())
-            {
-                std::copy(this->state[pindex].data,
-                          this->state[pindex].data + state_dimension(particle_type),
-                          temp0 + p*state_dimension(particle_type));
-            }
+            TIMEZONE("MPI_Allreduce");
+            MPI_Allreduce(
+                    temp0,
+                    temp1,
+                    state_dimension(particle_type)*this->chunk_size,
+                    MPI_DOUBLE,
+                    MPI_SUM,
+                    this->comm);
         }
-        MPI_Allreduce(
-                temp0,
-                temp1,
-                state_dimension(particle_type)*this->chunk_size,
-                MPI_DOUBLE,
-                MPI_SUM,
-                this->comm);
-        if (this->myrank == 0)
+        if (this->myrank == 0){
+            TIMEZONE("write_state_chunk");
             this->write_state_chunk(cindex, temp1);
+        }
         //write rhs
-        if (write_rhs)
+        if (write_rhs){
+            TIMEZONE("write_rhs");
             for (int i=0; i<this->integration_steps; i++)
             {
                 std::fill_n(temp0, state_dimension(particle_type)*this->chunk_size, 0);
-                pindex = cindex*this->chunk_size;
-                for (unsigned int p=0; p<this->chunk_size; p++, pindex++)
+                //pindex = cindex*this->chunk_size;
+                //for (unsigned int p=0; p<this->chunk_size; p++, pindex++)
+                //{
+                //    if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() ||
+                //        this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end())
+                //    {
+                //        TIMEZONE("std::copy");
+                //        std::copy(this->rhs[i][pindex].data,
+                //                  this->rhs[i][pindex].data + state_dimension(particle_type),
+                //                  temp0 + p*state_dimension(particle_type));
+                //    }
+                //}
+                for (int s = -1; s <= 0; s++)
+                    for (auto &pp: this->domain_particles[s])
+                    {
+                        if (pp >= int(cindex*this->chunk_size) &&
+                            pp < int((cindex+1)*this->chunk_size))
+                        {
+                            std::copy(this->rhs[i][pp].data,
+                                      this->rhs[i][pp].data + state_dimension(particle_type),
+                                      temp0 + (pp-cindex*this->chunk_size)*state_dimension(particle_type));
+                        }
+                    }
                 {
-                    if (this->domain_particles[-1].find(pindex) != this->domain_particles[-1].end() ||
-                        this->domain_particles[ 0].find(pindex) != this->domain_particles[ 0].end())
-                    {
-                        std::copy(this->rhs[i][pindex].data,
-                                  this->rhs[i][pindex].data + state_dimension(particle_type),
-                                  temp0 + p*state_dimension(particle_type));
-                    }
-                }
-                MPI_Allreduce(
+                    TIMEZONE("MPI_Allreduce");
+                    MPI_Allreduce(
                         temp0,
                         temp1,
                         state_dimension(particle_type)*this->chunk_size,
                         MPI_DOUBLE,
                         MPI_SUM,
                         this->comm);
-                if (this->myrank == 0)
+                }
+                if (this->myrank == 0){
+                    TIMEZONE("write_rhs_chunk");
                     this->write_rhs_chunk(cindex, i, temp1);
+                }
             }
+        }
     }
     delete[] temp0;
     delete[] temp1;
diff --git a/bfps/cpp/rFFTW_distributed_particles.hpp b/bfps/cpp/rFFTW_distributed_particles.hpp
index e271bbfae56c0d49bf66cebcb5e8e8158f81940b..400411d5f1fd6e597714be494a72272a76e01206 100644
--- a/bfps/cpp/rFFTW_distributed_particles.hpp
+++ b/bfps/cpp/rFFTW_distributed_particles.hpp
@@ -44,12 +44,25 @@ template <particle_types particle_type, class rnumber, int interp_neighbours>
 class rFFTW_distributed_particles: public particles_io_base<particle_type>
 {
     private:
-        std::unordered_map<int, single_particle_state<particle_type>> state;
-        std::vector<std::unordered_map<int, single_particle_state<particle_type>>> rhs;
+        // a "domain" corresponds to a region in 3D real space where a fixed set
+        // of MPI processes are required to participate in the interpolation
+        // formula (i.e. they all contain required information).
+        // we need to know how many processes there are for each of the domains
+        // to which the local process belongs.
         std::unordered_map<int, int> domain_nprocs;
+        // each domain has an associated communicator, and we keep a list of the
+        // communicators to which the local process belongs
         std::unordered_map<int, MPI_Comm> domain_comm;
+        // for each domain, we need a list of the IDs of the particles located
+        // in that domain
         std::unordered_map<int, std::unordered_set<int>> domain_particles;
 
+        // we need the state of each particle the local process knows about
+        // (the map is keyed by particle ID, not by domain)
+        std::unordered_map<int, single_particle_state<particle_type>> state;
+        // we also need the last few values of the right hand side of the
+        // ODE for each such particle, since we use Adams-Bashforth integration
+        std::vector<std::unordered_map<int, single_particle_state<particle_type>>> rhs;
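+        // (rhs[k] maps a particle ID to the right hand side value from k
+        // steps in the past, which is what the Adams-Bashforth update reads
+        // as this->rhs[k][pp.first])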
+
     public:
         int integration_steps;
         // this class only works with rFFTW interpolator
@@ -87,9 +100,24 @@ class rFFTW_distributed_particles: public particles_io_base<particle_type>
                 std::unordered_map<int, single_particle_state<particle_type>> &y);
 
 
+        /* Given a list of particle positions,
+         * figure out which go into which local domain, and construct the
+         * relevant map of ID lists "dp" (for "domain particles").
+         * */
         void sort_into_domains(
                 const std::unordered_map<int, single_particle_state<particle_type>> &x,
                 std::unordered_map<int, std::unordered_set<int>> &dp);
+        /* Suppose the particles are currently badly distributed, and some
+         * arbitrary quantities (stored in "vals") are associated with the
+         * particles, and we need to properly distribute them among processes;
+         * that is what this function does.
+         * In practice it is only used to redistribute the rhs values (and it
+         * automatically redistributes the state x being passed).
+         * Some more comments are present in the .cpp file, but, in brief: the
+         * particles are simply moved from one domain to another.
+         * If it turns out that the new domain contains a process which does not
+         * know about a particle, that information is sent from the closest process.
+         * */
         void redistribute(
                 std::unordered_map<int, single_particle_state<particle_type>> &x,
                 std::vector<std::unordered_map<int, single_particle_state<particle_type>>> &vals,
diff --git a/bfps/cpp/rFFTW_interpolator.cpp b/bfps/cpp/rFFTW_interpolator.cpp
index bffae44f5986f9873a231442e92cba6cf005d3a4..55388e4e6800b86ed71291508b74e4595b24845c 100644
--- a/bfps/cpp/rFFTW_interpolator.cpp
+++ b/bfps/cpp/rFFTW_interpolator.cpp
@@ -28,15 +28,15 @@
 
 #include <cmath>
 #include "rFFTW_interpolator.hpp"
+#include "scope_timer.hpp"
 
 template <class rnumber, int interp_neighbours>
 rFFTW_interpolator<rnumber, interp_neighbours>::rFFTW_interpolator(
         fluid_solver_base<rnumber> *fs,
         base_polynomial_values BETA_POLYS,
-        rnumber *FIELD) : interpolator_base<rnumber, interp_neighbours>(fs, BETA_POLYS)
+        rnumber *FIELD_POINTER) : interpolator_base<rnumber, interp_neighbours>(fs, BETA_POLYS)
 {
-    this->field_size = 2*fs->cd->local_size;
-    this->field = FIELD;
+    this->field = FIELD_POINTER;
 
 
     // generate compute array
@@ -48,6 +48,24 @@ rFFTW_interpolator<rnumber, interp_neighbours>::rFFTW_interpolator(
         this->compute[((iz + this->descriptor->sizes[0]) % this->descriptor->sizes[0])] = true;
 }
 
+template <class rnumber, int interp_neighbours>
+rFFTW_interpolator<rnumber, interp_neighbours>::rFFTW_interpolator(
+        vorticity_equation<rnumber, FFTW> *fs,
+        base_polynomial_values BETA_POLYS,
+        rnumber *FIELD_POINTER) : interpolator_base<rnumber, interp_neighbours>(fs, BETA_POLYS)
+{
+//    this->field = FIELD_POINTER;
+//
+//
+//    // generate compute array
+//    this->compute = new bool[this->descriptor->sizes[0]];
+//    std::fill_n(this->compute, this->descriptor->sizes[0], false);
+//    for (int iz = this->descriptor->starts[0]-interp_neighbours-1;
+//            iz <= this->descriptor->starts[0]+this->descriptor->subsizes[0]+interp_neighbours;
+//            iz++)
+//        this->compute[((iz + this->descriptor->sizes[0]) % this->descriptor->sizes[0])] = true;
+}
+
 template <class rnumber, int interp_neighbours>
 rFFTW_interpolator<rnumber, interp_neighbours>::~rFFTW_interpolator()
 {
@@ -80,6 +98,7 @@ void rFFTW_interpolator<rnumber, interp_neighbours>::sample(
         double *__restrict__ y,
         const int *deriv)
 {
+    TIMEZONE("rFFTW_interpolator::sample");
     /* get grid coordinates */
     int *xg = new int[3*nparticles];
     double *xx = new double[3*nparticles];
@@ -109,7 +128,14 @@ void rFFTW_interpolator<rnumber, interp_neighbours>::operator()(
         double *dest,
         const int *deriv)
 {
+    TIMEZONE("rFFTW_interpolator::operator()");
     double bx[interp_neighbours*2+2], by[interp_neighbours*2+2], bz[interp_neighbours*2+2];
+    /* please note that the polynomials in z are computed for all the different
+     * iz values, independently of whether or not "myrank" will perform the
+     * computation for all the different iz slices.
+     * I don't know how big a deal this really is, but it is something that we can
+     * optimize.
+     * */
     if (deriv == NULL)
     {
         this->compute_beta(0, xx[0], bx);
@@ -124,17 +150,30 @@ void rFFTW_interpolator<rnumber, interp_neighbours>::operator()(
     }
     std::fill_n(dest, 3, 0);
     ptrdiff_t bigiz, bigiy, bigix;
+    // loop over the 2*interp_neighbours + 2 z slices
     for (int iz = -interp_neighbours; iz <= interp_neighbours+1; iz++)
     {
+        // bigiz is the z index of the cell containing the particle
+        // this->descriptor->sizes[0] is added before taking the modulo
+        // because we want to be sure that "bigiz" is a positive number.
+        // I'm no longer sure why I don't use the MOD function here.
         bigiz = ptrdiff_t(((xg[2]+iz) + this->descriptor->sizes[0]) % this->descriptor->sizes[0]);
+        // once we know bigiz, we know whether "myrank" has the relevant slice.
+        // if not, go to next value of bigiz
         if (this->descriptor->myrank == this->descriptor->rank[bigiz])
         {
             for (int iy = -interp_neighbours; iy <= interp_neighbours+1; iy++)
             {
+                // bigiy is the y index of the cell
+                // since we have all the y indices in myrank, we can safely use the
+                // modulo value
                 bigiy = ptrdiff_t(MOD(xg[1]+iy, this->descriptor->sizes[1]));
                 for (int ix = -interp_neighbours; ix <= interp_neighbours+1; ix++)
                 {
+                    // bigix is the x index of the cell
                     bigix = ptrdiff_t(MOD(xg[0]+ix, this->descriptor->sizes[2]));
+                    // here we create the index to the current grid node
+                    // note the removal of local_z_start from bigiz.
                     ptrdiff_t tindex = (((bigiz-this->descriptor->starts[0])*this->descriptor->sizes[1] +
                                          bigiy)*(this->descriptor->sizes[2]+2) +
                                          bigix)*3;
diff --git a/bfps/cpp/rFFTW_interpolator.hpp b/bfps/cpp/rFFTW_interpolator.hpp
index 795257d2744e432d9c346b93848cadfbd8cc85dc..5088be8b2f3094fd96332af0c923d7cc905e4f3f 100644
--- a/bfps/cpp/rFFTW_interpolator.hpp
+++ b/bfps/cpp/rFFTW_interpolator.hpp
@@ -27,6 +27,7 @@
 #include "field_descriptor.hpp"
 #include "fftw_tools.hpp"
 #include "fluid_solver_base.hpp"
+#include "vorticity_equation.hpp"
 #include "interpolator_base.hpp"
 
 #ifndef RFFTW_INTERPOLATOR
@@ -38,41 +39,74 @@ class rFFTW_interpolator:public interpolator_base<rnumber, interp_neighbours>
 {
     public:
         using interpolator_base<rnumber, interp_neighbours>::operator();
-        /* size of field to interpolate */
-        ptrdiff_t field_size;
 
-        /* pointers to fields that are to be interpolated
+        /* pointer to the field that has to be interpolated
+         * The reason this is a member variable is that I want this class
+         * to be consistent with the "interpolator" class, where a member
+         * variable is absolutely required (since that class uses padding).
          * */
         rnumber *field;
 
-        /* compute[iz] is true if .
+        /* "compute" is an array: compute[iz] says whether or not the current
+         * MPI process is involved in the interpolation formula for a particle
+         * located in cell "iz".
+         * It is mostly used in the formula itself.
+         * This translates into the following condition:
          * local_zstart - neighbours <= iz <= local_zend + 1 + neighbours
+         * I think it's cleaner to keep things in an array, especially since
+         * "local_zend" is shorthand for another arithmetic operation anyway.
          * */
         bool *compute;
 
+
+        /* Constructors */
         rFFTW_interpolator(
                 fluid_solver_base<rnumber> *FSOLVER,
                 base_polynomial_values BETA_POLYS,
                 rnumber *FIELD_DATA);
+
+        /* this constructor is empty; I just needed it for a quick hack of
+         * the "vorticity_equation" class.
+         * It should be removed soon.
+         * */
+        rFFTW_interpolator(
+                vorticity_equation<rnumber, FFTW> *FSOLVER,
+                base_polynomial_values BETA_POLYS,
+                rnumber *FIELD_DATA);
         ~rFFTW_interpolator();
 
-        /* does not destroy input */
+        /* This method is provided for consistency with "interpolator", and it
+         * does not destroy input */
         inline int read_rFFTW(const void *src)
         {
             this->field = (rnumber*)src;
             return EXIT_SUCCESS;
         }
 
+        /* This is used when "compute" is not enough.
+         * For a given z location, it gives the outermost ranks that are relevant
+         * for the interpolation formula.
+         * */
         bool get_rank_info(double z, int &maxz_rank, int &minz_rank);
 
-        /* interpolate field at an array of locations */
+        /* interpolate field at an array of locations.
+         * After interpolation is performed, call Allreduce for "y", over
+         * this->descriptor->comm --- generally MPI_COMM_WORLD.
+         * This is useful for the simple "particles" class, where particle
+         * information is synchronized across all processes.
+         * */
         void sample(
                 const int nparticles,
                 const int pdimension,
                 const double *__restrict__ x,
                 double *__restrict__ y,
                 const int *deriv = NULL);
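+        /* A minimal sketch of the calling pattern described above (the
+         * variable names are hypothetical, not part of this class):
+         *
+         *     std::fill_n(y, 3*nparticles, 0);
+         *     interpolator->sample(nparticles, 3, x, y);
+         *     MPI_Allreduce(MPI_IN_PLACE, y, 3*nparticles,
+         *                   MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+         * */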
-        /* interpolate 1 point */
+        /* interpolate 1 point.
+         * Result is kept local.
+         * This is used in the "rFFTW_distributed_particles" class, with the
+         * result being synchronized across the relevant "local particle
+         * communicator".
+         * */
         void operator()(
                 const int *__restrict__ xg,
                 const double *__restrict__ xx,
diff --git a/bfps/cpp/scope_timer.cpp b/bfps/cpp/scope_timer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..61ddd89583fe8d53cee328c4267df603e128d417
--- /dev/null
+++ b/bfps/cpp/scope_timer.cpp
@@ -0,0 +1,8 @@
+
+
+#include "scope_timer.hpp"
+
+
+#ifdef USE_TIMINGOUTPUT
+EventManager global_timer_manager("BFPS", std::cout);
+#endif
diff --git a/bfps/cpp/scope_timer.hpp b/bfps/cpp/scope_timer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e513e8e6e47a14d69fc0c695894fc2114a9b6058
--- /dev/null
+++ b/bfps/cpp/scope_timer.hpp
@@ -0,0 +1,821 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+#ifndef SCOPE_TIMER_HPP
+#define SCOPE_TIMER_HPP
+
+#include <memory>
+#include <iostream>
+#include <vector>
+#include <stack>
+#include <string>
+#include <limits>
+#include <cassert>
+#include <sstream>
+#include <unordered_map>
+#include <mpi.h>
+#include <cstring>
+#include <stdexcept>
+#include <omp.h>
+#include <iomanip>
+#include <fstream>
+
+#include "base.hpp"
+#include "bfps_timer.hpp"
+
+//< Forward declaration, so that ScopeEvent can be a friend of EventManager
+class ScopeEvent;
+
+class EventManager {
+protected:
+
+    class CoreEvent {
+     protected:
+      //< Name of the event (from the user)
+      const std::string m_name;
+      //< Previous events (stack of parents)
+      std::stack<CoreEvent*> m_parentStack;
+      //< Current event children
+      std::vector<CoreEvent*> m_children;
+
+      //< Total execution time
+      double m_totalTime;
+      //< Minimum execution time
+      double m_minTime;
+      //< Maximum execution time
+      double m_maxTime;
+      //< Number of occurrence for this event
+      int m_occurrence;
+      //< Number of occurrence that are tasks for this event
+      int m_nbTasks;
+      //< Children lock
+      omp_lock_t m_childrenLock;
+      //< Min/max update lock
+      omp_lock_t m_updateLock;
+
+     public:
+      /** Create a core-event from the name and the current stack */
+      CoreEvent(const std::string& inName,
+                const std::stack<CoreEvent*>& inParentStack)
+          : m_name(inName),
+            m_parentStack(inParentStack),
+            m_totalTime(0),
+            m_minTime(std::numeric_limits<double>::max()),
+            m_maxTime(std::numeric_limits<double>::lowest()), // not min(), which is the smallest positive double
+            m_occurrence(0),
+            m_nbTasks(0) {
+        omp_init_lock(&m_childrenLock);
+        omp_init_lock(&m_updateLock);
+      }
+
+      ~CoreEvent() {
+        omp_destroy_lock(&m_childrenLock);
+        omp_destroy_lock(&m_updateLock);
+      }
+
+      /** Add a record */
+      void addRecord(const double inDuration, const bool isTask) {
+  #pragma omp atomic update
+        m_totalTime += inDuration;
+  #pragma omp atomic update
+        m_occurrence += 1;
+  #pragma omp flush  // (m_minTime, m_maxTime)
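+        // cheap unlocked test first; the rare min/max updates are serialized
+        // by m_updateLock and re-evaluated under the lock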
+        if (inDuration < m_minTime || m_maxTime < inDuration) {
+          omp_set_lock(&m_updateLock);
+          m_minTime = std::min(m_minTime, inDuration);
+          m_maxTime = std::max(m_maxTime, inDuration);
+          omp_unset_lock(&m_updateLock);
+        }
+        if (isTask) {
+  #pragma omp atomic update
+          m_nbTasks += 1;
+        }
+      }
+
+      const std::stack<CoreEvent*>& getParents() const { return m_parentStack; }
+
+      std::stack<CoreEvent*>& getParents() { return m_parentStack; }
+
+      void addChild(CoreEvent* inChild) {
+        omp_set_lock(&m_childrenLock);
+        m_children.push_back(inChild);
+        omp_unset_lock(&m_childrenLock);
+      }
+
+      //! Must not be called during parallel execution
+      const std::vector<CoreEvent*>& getChildren() const {
+        assert(omp_in_parallel() == 0);
+        return m_children;
+      }
+
+      const std::string& getName() const { return m_name; }
+
+      double getMin() const { return m_minTime; }
+
+      double getMax() const { return m_maxTime; }
+
+      int getOccurrence() const { return m_occurrence; }
+
+      double getAverage() const {
+        return m_totalTime / static_cast<double>(m_occurrence);
+      }
+
+      double getDuration() const { return m_totalTime; }
+
+      int getNbTasks() const { return m_nbTasks; }
+    };
+
+    ///////////////////////////////////////////////////////////////
+
+    //< The main node
+    std::unique_ptr<CoreEvent> m_root;
+    //< Output stream to print out
+    std::ostream& m_outputStream;
+
+    //< Current stacks; there is one stack of stacks per thread
+    std::vector<std::stack<std::stack<CoreEvent*>>> m_currentEventsStackPerThread;
+    //< All recorded events (that will then be deleted at the end)
+    std::unordered_multimap<std::string, CoreEvent*> m_records;
+    //< Lock for m_records
+    omp_lock_t m_recordsLock;
+
+    /** Find an event from its name. If no such event exists,
+     * the function creates one. If an event with the same name exists
+     * but with a different parent stack, a new one is created.
+     * The returned event is pushed onto the current stack.
+     */
+    CoreEvent* getEvent(const std::string& inName,
+                        const std::string& inUniqueKey) {
+        const std::string completeName = inName + inUniqueKey;
+        CoreEvent* foundEvent = nullptr;
+
+        omp_set_lock(&m_recordsLock);
+        // find all events with this name
+        auto range = m_records.equal_range(completeName);
+        for (auto iter = range.first; iter != range.second; ++iter) {
+          // events are equal if same name and same parents
+          if ((*iter).second->getParents() ==
+              m_currentEventsStackPerThread[omp_get_thread_num()].top()) {
+            foundEvent = (*iter).second;
+            break;
+          }
+        }
+
+        // Keep the lock to ensure that no two threads create the same event
+
+        if (!foundEvent) {
+          // create this event
+          foundEvent = new CoreEvent(
+              inName, m_currentEventsStackPerThread[omp_get_thread_num()].top());
+          m_currentEventsStackPerThread[omp_get_thread_num()].top().top()->addChild(
+              foundEvent);
+          m_records.insert({completeName, foundEvent});
+        }
+        omp_unset_lock(&m_recordsLock);
+
+        m_currentEventsStackPerThread[omp_get_thread_num()].top().push(foundEvent);
+        return foundEvent;
+    }
+
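+    /** Variant of getEvent used for OpenMP tasks: the parent stack captured
+     * at task-creation time (see TIMEZONE_OMP_INIT_PRETASK below) is replayed
+     * here, so that the event is attributed to the spawning scope rather than
+     * to whatever the executing thread happens to be timing.
+     */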
+    CoreEvent* getEventFromContext(const std::string& inName,
+                                   const std::string& inUniqueKey,
+                                   const std::stack<CoreEvent*>& inParentStack) {
+      m_currentEventsStackPerThread[omp_get_thread_num()].push(inParentStack);
+      return getEvent(inName, inUniqueKey);
+    }
+
+    /** Pop current event */
+    void popEvent(const CoreEvent* eventToRemove) {
+        assert(m_currentEventsStackPerThread[omp_get_thread_num()].top().size() > 1);
+        // Comparing address is cheaper
+        if (m_currentEventsStackPerThread[omp_get_thread_num()].top().top() !=
+            eventToRemove) {
+          throw std::runtime_error(
+              "You must end events (ScopeEvent/TIMEZONE) in order.\n"
+              "Please make sure that you only ask the last event to finish.");
+        }
+        m_currentEventsStackPerThread[omp_get_thread_num()].top().pop();
+    }
+
+    /** Pop current context */
+    void popContext(const CoreEvent* eventToRemove) {
+      assert(m_currentEventsStackPerThread[omp_get_thread_num()].size() > 1);
+      assert(m_currentEventsStackPerThread[omp_get_thread_num()].top().size() > 1);
+      // Comparing address is cheaper
+      if (m_currentEventsStackPerThread[omp_get_thread_num()].top().top() !=
+          eventToRemove) {
+        throw std::runtime_error(
+            "You must end events (ScopeEvent/TIMEZONE) in order.\n"
+            "Please make sure that you only ask the last event to finish.");
+      }
+      m_currentEventsStackPerThread[omp_get_thread_num()].pop();
+    }
+
+public:
+    /** Create an event manager */
+    EventManager(const std::string& inAppName, std::ostream& inOutputStream)
+        : m_root(new CoreEvent(inAppName, std::stack<CoreEvent*>())),
+          m_outputStream(inOutputStream),
+          m_currentEventsStackPerThread(1) {
+      m_currentEventsStackPerThread[0].emplace();
+      m_currentEventsStackPerThread[0].top().push(m_root.get());
+      omp_init_lock(&m_recordsLock);
+    }
+
+    ~EventManager() throw() {
+        assert(m_currentEventsStackPerThread[0].size() == 1);
+
+        assert(m_currentEventsStackPerThread[0].top().size() == 1);
+
+        omp_destroy_lock(&m_recordsLock);
+
+        for (auto event : m_records) {
+          delete event.second;
+        }
+    }
+
+    void startParallelRegion(const int inNbThreads) {
+      m_currentEventsStackPerThread.resize(1);
+      m_currentEventsStackPerThread.resize(inNbThreads,
+                                           m_currentEventsStackPerThread[0]);
+    }
+
+    void showDistributed(const MPI_Comm inComm) const {
+        int myRank, nbProcess;
+        int retMpi = MPI_Comm_rank( inComm, &myRank);
+        variable_used_only_in_assert(retMpi);
+        assert(retMpi == MPI_SUCCESS);
+        retMpi = MPI_Comm_size( inComm, &nbProcess);
+        assert(retMpi == MPI_SUCCESS);
+
+        if((&m_outputStream == &std::cout || &m_outputStream == &std::clog) && myRank != nbProcess-1){
+            // Print in reverse order
+            char tmp;
+            retMpi = MPI_Recv(&tmp, 1, MPI_BYTE, myRank+1, 99, inComm, MPI_STATUS_IGNORE);
+            assert(retMpi == MPI_SUCCESS);
+        }
+        m_outputStream.flush();
+
+        std::stack<std::pair<int, const CoreEvent*>> events;
+
+        for (int idx = static_cast<int>(m_root->getChildren().size()) - 1; idx >= 0; --idx) {
+            events.push({0, m_root->getChildren()[idx]});
+        }
+
+        m_outputStream << "[TIMING-" <<  myRank<< "] Local times.\n";
+        m_outputStream << "[TIMING-" <<  myRank<< "] :" << m_root->getName() << "\n";
+
+        while (events.size()) {
+            const std::pair<int, const CoreEvent*> eventToShow =
+                    events.top();
+            events.pop();
+
+            m_outputStream << "[TIMING-" <<  myRank<< "] ";
+
+            int offsetTab = eventToShow.first;
+            while (offsetTab--) {
+                m_outputStream << "\t";
+            }
+            m_outputStream << "@" << eventToShow.second->getName() << " = " << eventToShow.second->getDuration() << "s";
+            if (eventToShow.second->getOccurrence() != 1) {
+                m_outputStream << " (Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax()
+                             << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = "
+                             << eventToShow.second->getOccurrence() << ")";
+            }
+
+            m_outputStream << "\n";
+            for (int idx =
+                 static_cast<int>(eventToShow.second->getChildren().size()) - 1;
+                 idx >= 0; --idx) {
+                events.push(
+                {eventToShow.first + 1, eventToShow.second->getChildren()[idx]});
+            }
+        }
+        m_outputStream.flush();
+
+        if((&m_outputStream == &std::cout || &m_outputStream == &std::clog) && myRank != 0){
+            // Print in reverse order
+            char tmp;
+            retMpi = MPI_Send(&tmp, 1, MPI_BYTE, myRank-1, 99, inComm);
+            assert(retMpi == MPI_SUCCESS);
+        }
+    }
+
+    void show(const MPI_Comm inComm, const bool onlyP0 = true) const {
+        int myRank, nbProcess;
+        int retMpi = MPI_Comm_rank( inComm, &myRank);
+        variable_used_only_in_assert(retMpi);
+        assert(retMpi == MPI_SUCCESS);
+        retMpi = MPI_Comm_size( inComm, &nbProcess);
+        assert(retMpi == MPI_SUCCESS);
+
+        if(onlyP0 && myRank != 0){
+            return;
+        }
+
+        std::stringstream myResults;
+
+        std::stack<std::pair<int, const CoreEvent*>> events;
+
+        for (int idx = static_cast<int>(m_root->getChildren().size()) - 1; idx >= 0; --idx) {
+            events.push({0, m_root->getChildren()[idx]});
+        }
+
+        myResults << "[TIMING-" <<  myRank<< "] Local times.\n";
+        myResults << "[TIMING-" <<  myRank<< "] :" << m_root->getName() << "\n";
+
+        while (events.size()) {
+            const std::pair<int, const CoreEvent*> eventToShow =
+                    events.top();
+            events.pop();
+
+            myResults << "[TIMING-" <<  myRank<< "] ";
+
+            int offsetTab = eventToShow.first;
+            while (offsetTab--) {
+                myResults << "\t";
+            }
+            myResults << "@" << eventToShow.second->getName() << " = " << eventToShow.second->getDuration() << "s";
+            if (eventToShow.second->getOccurrence() != 1) {
+                myResults << " (Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax()
+                             << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = "
+                             << eventToShow.second->getOccurrence() << ")";
+            }
+
+            myResults << "\n";
+            for (int idx =
+                 static_cast<int>(eventToShow.second->getChildren().size()) - 1;
+                 idx >= 0; --idx) {
+                events.push(
+                {eventToShow.first + 1, eventToShow.second->getChildren()[idx]});
+            }
+        }
+
+        if(myRank != 0){
+            const std::string strOutput = myResults.str();
+            int sizeOutput = strOutput.length();
+            retMpi = MPI_Send(&sizeOutput, 1, MPI_INT, 0, 99, inComm);
+            assert(retMpi == MPI_SUCCESS);
+            retMpi = MPI_Send((void*)strOutput.data(), sizeOutput, MPI_CHAR, 0, 100, inComm);
+            assert(retMpi == MPI_SUCCESS);
+        }
+        else{
+            if(onlyP0 == false){
+                std::vector<char> buffer;
+                for(int idxProc = nbProcess-1 ; idxProc > 0 ; --idxProc){
+                    int sizeRecv;
+                    retMpi = MPI_Recv(&sizeRecv, 1, MPI_INT, idxProc, 99, inComm, MPI_STATUS_IGNORE);
+                    assert(retMpi == MPI_SUCCESS);
+                    buffer.resize(sizeRecv+1);
+                    retMpi = MPI_Recv(buffer.data(), sizeRecv, MPI_CHAR, idxProc, 100, inComm, MPI_STATUS_IGNORE);
+                    assert(retMpi == MPI_SUCCESS);
+                    buffer[sizeRecv]='\0';
+                    m_outputStream << buffer.data();
+                }
+            }
+            m_outputStream << myResults.str();
+            m_outputStream.flush();
+        }
+    }
+
+    void showMpi(const MPI_Comm inComm) const {
+        struct SerializedEvent {
+            char path[512];
+            char name[128];
+            double totalTime;
+            double minTime;
+            double maxTime;
+            int occurrence;
+        };
+
+        // Convert my events into sendable objects
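+        // (fixed-size POD records can be shipped to rank 0 with the plain
+        // MPI_BYTE gathers below; the "path" string encodes the parent stack,
+        // so matching events from different ranks can be merged by path+name)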
+
+        std::vector<SerializedEvent> myEvents;
+        myEvents.reserve(m_records.size());
+
+        for(const auto& event : m_records){ // value_type has a const key, so const auto& avoids a per-iteration copy
+            myEvents.emplace_back();
+            SerializedEvent& current_event = myEvents.back();
+
+            current_event.totalTime = event.second->getDuration();
+            current_event.minTime = event.second->getMin();
+            current_event.maxTime = event.second->getMax();
+            current_event.occurrence = event.second->getOccurrence();
+
+            strncpy(current_event.name, event.second->getName().c_str(), 128);
+            current_event.name[127] = '\0'; // strncpy does not null-terminate on truncation
+            std::stringstream path;
+            std::stack<CoreEvent*> parents = event.second->getParents();
+            while(parents.size()){
+                path << parents.top()->getName() << " << ";
+                parents.pop();
+            }
+
+            strncpy(current_event.path, path.str().c_str(), 512);
+            current_event.path[511] = '\0'; // ensure termination for long paths
+        }
+
+        // Send to process 0
+        int myRank, nbProcess;
+        int retMpi = MPI_Comm_rank( inComm, &myRank);
+        variable_used_only_in_assert(retMpi);
+        assert(retMpi == MPI_SUCCESS);
+        retMpi = MPI_Comm_size( inComm, &nbProcess);
+        assert(retMpi == MPI_SUCCESS);
+        std::unique_ptr<int[]> nbEventsPerProc;
+        if(myRank == 0){
+            nbEventsPerProc.reset(new int[nbProcess]);
+        }
+        const int myNbEvents = myEvents.size();
+        retMpi = MPI_Gather(const_cast<int*>(&myNbEvents), 1, MPI_INT,
+                       nbEventsPerProc.get(), 1, MPI_INT,
+                       0, inComm);
+        assert(retMpi == MPI_SUCCESS);
+        // Process 0 merge and print results
+        std::unique_ptr<int[]> dipls;
+        std::unique_ptr<SerializedEvent[]> allEvents;
+        std::unique_ptr<int[]> nbEventsPerProcByte;
+        std::unique_ptr<int[]> diplsByte;
+        if(myRank == 0){
+            dipls.reset(new int[nbProcess+1]);
+            diplsByte.reset(new int[nbProcess+1]);
+            nbEventsPerProcByte.reset(new int[nbProcess]);
+            dipls[0] = 0;
+            diplsByte[0] = 0;
+            for(int idx = 1 ; idx <= nbProcess ; ++idx){
+                dipls[idx] = dipls[idx-1] + nbEventsPerProc[idx-1];
+                diplsByte[idx] = dipls[idx] * sizeof(SerializedEvent);
+                nbEventsPerProcByte[idx-1] = nbEventsPerProc[idx-1] * sizeof(SerializedEvent);
+            }
+            allEvents.reset(new SerializedEvent[dipls[nbProcess]]);
+        }
+
+        retMpi = MPI_Gatherv(myEvents.data(), myNbEvents * sizeof(SerializedEvent), MPI_BYTE,
+                    allEvents.get(), nbEventsPerProcByte.get(), diplsByte.get(),
+                    MPI_BYTE, 0, inComm);
+        assert(retMpi == MPI_SUCCESS);
+
+        if(myRank == 0){
+            struct GlobalEvent {
+                char path[512];
+                char name[128];
+                double totalTime;
+                double minTime;
+                double maxTime;
+                int occurrence;
+                int nbProcess;
+                double minTimeProcess;
+                double maxTimeProcess;
+            };
+
+            std::unordered_map<std::string, GlobalEvent> mapEvents;
+            for(int idxEvent = 0 ; idxEvent < dipls[nbProcess] ; ++idxEvent){
+                const std::string key = std::string(allEvents[idxEvent].path) + std::string(allEvents[idxEvent].name);
+                if(mapEvents.find(key) == mapEvents.end()){
+                    GlobalEvent& newEvent = mapEvents[key];
+                    strncpy(newEvent.path, allEvents[idxEvent].path, 512);
+                    strncpy(newEvent.name, allEvents[idxEvent].name, 128);
+                    newEvent.totalTime = allEvents[idxEvent].totalTime;
+                    newEvent.minTime = allEvents[idxEvent].minTime;
+                    newEvent.maxTime = allEvents[idxEvent].maxTime;
+                    newEvent.occurrence = allEvents[idxEvent].occurrence;
+                    newEvent.nbProcess = 1;
+                    newEvent.minTimeProcess = allEvents[idxEvent].totalTime;
+                    newEvent.maxTimeProcess = allEvents[idxEvent].totalTime;
+                }
+                else{
+                    GlobalEvent& newEvent = mapEvents[key];
+                    assert(strcmp(newEvent.path, allEvents[idxEvent].path) == 0);
+                    assert(strcmp(newEvent.name, allEvents[idxEvent].name) == 0);
+                    newEvent.totalTime += allEvents[idxEvent].totalTime;
+                    newEvent.minTime = std::min(newEvent.minTime, allEvents[idxEvent].minTime);
+                    newEvent.maxTime = std::max(newEvent.maxTime, allEvents[idxEvent].maxTime);
+                    newEvent.occurrence += allEvents[idxEvent].occurrence;
+                    newEvent.nbProcess += 1;
+                    newEvent.minTimeProcess = std::min(newEvent.minTimeProcess,
+                                                       allEvents[idxEvent].totalTime);
+                    newEvent.maxTimeProcess = std::max(newEvent.maxTimeProcess,
+                                                       allEvents[idxEvent].totalTime);
+                }
+            }
+
+            m_outputStream << "[MPI-TIMING] Mpi times.\n";
+            for(const auto& iter : mapEvents){
+                const GlobalEvent& gevent = iter.second;
+                m_outputStream << "[MPI-TIMING] @" << gevent.name << "\n";
+                m_outputStream << "[MPI-TIMING] Stack => " << gevent.path << "\n";
+                m_outputStream << "[MPI-TIMING] \t Done by " << gevent.nbProcess << " processes\n";
+                m_outputStream << "[MPI-TIMING] \t Total time for all " << gevent.totalTime
+                          << "s (average per process " << gevent.totalTime/gevent.nbProcess << "s)\n";
+                m_outputStream << "[MPI-TIMING] \t Min time for a process " << gevent.minTimeProcess
+                          << "s Max time for a process " << gevent.maxTimeProcess << "s\n";
+                m_outputStream << "[MPI-TIMING] \t The same call has been done " << gevent.occurrence
+                          << " times by all process (duration min " << gevent.minTime << "s max " << gevent.maxTime << "s avg "
+                          << gevent.totalTime/gevent.occurrence << "s)\n";
+            }
+        }
+        m_outputStream.flush();
+    }
+
+    void showHtml(const MPI_Comm inComm, const bool onlyP0 = true) const {
+            int myRank, nbProcess;
+            int retMpi = MPI_Comm_rank( inComm, &myRank);
+            assert(retMpi == MPI_SUCCESS);
+            variable_used_only_in_assert(retMpi);
+            retMpi = MPI_Comm_size( inComm, &nbProcess);
+            assert(retMpi == MPI_SUCCESS);
+
+            if(onlyP0 && myRank != 0){
+                return;
+            }
+
+            std::stringstream myResults;
+
+            std::stack<std::pair<int, const CoreEvent*>> events;
+
+            for (int idx = static_cast<int>(m_root->getChildren().size()) - 1; idx >= 0; --idx) {
+                events.push({0, m_root->getChildren()[idx]});
+            }
+
+            myResults << "<h1>Process : " << myRank << "</h1>\n";
+
+            double totalDuration = 0;
+            for (int idx =
+                 static_cast<int>(m_root->getChildren().size()) - 1;
+                 idx >= 0; --idx) {
+                totalDuration += m_root->getChildren()[idx]->getDuration();
+            }
+
+            myResults << "<h2> " << m_root->getName() << " (" << totalDuration << "s)</h2>\n";
+            myResults << "<ul>\n";
+            int idxBox = myRank*100000;
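+            // the collapsible tree relies on the checkbox CSS trick defined
+            // in the <style> block below; idxBox keeps checkbox ids unique
+            // across ranks by giving each rank its own block of 100000 ids.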
+
+            while (events.size()) {
+                const std::pair<int, const CoreEvent*> eventToShow =
+                        events.top();
+                events.pop();
+
+                if(eventToShow.first == -1){
+                    myResults << "</ul>\n";
+                    myResults << "</li>\n";
+                }
+                else if(eventToShow.second->getChildren().size() == 0){
+                    myResults << "<li>&#9679; <span title=\"";
+                    if (eventToShow.second->getOccurrence() != 1) {
+                        myResults << "Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax()
+                                     << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = "
+                                     << eventToShow.second->getOccurrence();
+                    }
+                    myResults << "\">" << eventToShow.second->getName();
+                    const double percentage =  100*eventToShow.second->getDuration()/totalDuration;
+                    if( percentage < 0.001 ){
+                        myResults << " (< 0.001% -- " ;
+                    }
+                    else{
+                        myResults << " (" << std::fixed << std::setprecision(3) << percentage << "% -- " ;
+                    }
+                    if(eventToShow.second->getParents().size()){
+                        const double percentageParent = 100*eventToShow.second->getDuration()/eventToShow.second->getParents().top()->getDuration();
+                        myResults << "[" << std::fixed << std::setprecision(3) << percentageParent << "%] -- " ;
+                    }
+                    myResults << eventToShow.second->getDuration() <<"s)</span></li>\n";
+                }
+                else{
+                    myResults << "<li><input type=\"checkbox\" id=\"c" << idxBox << "\" />\n";
+                    myResults << "  <i class=\"fa fa-angle-double-right\">&rarr; </i>\n";
+                    myResults << "  <i class=\"fa fa-angle-double-down\">&darr; </i>\n";
+                    myResults << "  <label for=\"c" << idxBox++ << "\"><span title=\"";
+                    if (eventToShow.second->getOccurrence() != 1) {
+                        myResults << "Min = " << eventToShow.second->getMin() << "s ; Max = " << eventToShow.second->getMax()
+                                     << "s ; Average = " << eventToShow.second->getAverage() << "s ; Occurrence = "
+                                     << eventToShow.second->getOccurrence();
+                    }
+                    myResults << "\">" << eventToShow.second->getName();
+                    const double percentage =  100*eventToShow.second->getDuration()/totalDuration;
+                    if( percentage < 0.001 ){
+                        myResults << " (< 0.001% -- " ;
+                    }
+                    else{
+                        myResults << " (" << std::fixed << std::setprecision(3) << percentage << "% -- " ;
+                    }
+                    if(eventToShow.second->getParents().size()){
+                        const double percentageParent = 100*eventToShow.second->getDuration()/eventToShow.second->getParents().top()->getDuration();
+                        myResults << "[" << std::fixed << std::setprecision(3) << percentageParent << "%] -- " ;
+                    }
+                    myResults << eventToShow.second->getDuration() <<"s)</span></label>\n";
+                    myResults << "<ul>\n";
+                    events.push({-1, nullptr});
+
+                    for (int idx =
+                         static_cast<int>(eventToShow.second->getChildren().size()) - 1;
+                         idx >= 0; --idx) {
+                        events.push(
+                        {eventToShow.first + 1, eventToShow.second->getChildren()[idx]});
+                    }
+                }
+            }
+
+            myResults << "</ul>\n";
+
+            if(myRank != 0){
+                const std::string strOutput = myResults.str();
+                int sizeOutput = strOutput.length();
+                retMpi = MPI_Send(&sizeOutput, 1, MPI_INT, 0, 99, inComm);
+                assert(retMpi == MPI_SUCCESS);
+                retMpi = MPI_Send((void*)strOutput.data(), sizeOutput, MPI_CHAR, 0, 100, inComm);
+                assert(retMpi == MPI_SUCCESS);
+            }
+            else{
+                const std::string htmlOutput = (getenv("HTMLOUTPUT")?getenv("HTMLOUTPUT"):"timings.html");
+
+                std::cout << "Timing output html set to : " << htmlOutput << std::endl;
+
+                std::ofstream htmlfile(htmlOutput);
+
+                htmlfile << "<html>\
+                            <head>\
+                            <style>\
+                            input {\
+                              display: none;\
+                            }\
+                            input ~ ul {\
+                             display: none;\
+                            }\
+                            input:checked ~ ul {\
+                             display: block;\
+                            }\
+                            input ~ .fa-angle-double-down {\
+                              display: none;\
+                            }\
+                            input:checked ~ .fa-angle-double-right {\
+                              display: none;\
+                            }\
+                            input:checked ~ .fa-angle-double-down {\
+                              display: inline;\
+                            }\
+                            li {\
+                              display: block;\
+                              font-family: 'Arial';\
+                              font-size: 15px;\
+                              padding: 0.2em;\
+                              border: 1px solid transparent;\
+                            }\
+                            li:hover {\
+                              border: 1px solid grey;\
+                              border-radius: 3px;\
+                              background-color: lightgrey;\
+                            }\
+                            span:hover {\
+                                color: blue;\
+                            }\
+                            </style>\
+                            </head>\
+                            <body>";
+
+                if(onlyP0 == false){
+                    std::vector<char> buffer;
+                    for(int idxProc = nbProcess-1 ; idxProc > 0 ; --idxProc){
+                        int sizeRecv;
+                        retMpi = MPI_Recv(&sizeRecv, 1, MPI_INT, idxProc, 99, inComm, MPI_STATUS_IGNORE);
+                        assert(retMpi == MPI_SUCCESS);
+                        buffer.resize(sizeRecv+1);
+                        retMpi = MPI_Recv(buffer.data(), sizeRecv, MPI_CHAR, idxProc, 100, inComm, MPI_STATUS_IGNORE);
+                        assert(retMpi == MPI_SUCCESS);
+                        buffer[sizeRecv]='\0';
+                        htmlfile << buffer.data();
+                    }
+                }
+                htmlfile << myResults.str();
+                htmlfile << "</body>\
+                            </html>";
+            }
+        }
+
+
+    std::stack<CoreEvent*> getCurrentThreadEvent() const {
+      return m_currentEventsStackPerThread[omp_get_thread_num()].top();
+    }
+
+    friend ScopeEvent;
+};
+
+///////////////////////////////////////////////////////////////
+
+/** A scope event should be used
+ * to record the duration of a part of the code
+ * (section, scope, etc.).
+ * The timer is stopped automatically when the object is destroyed
+ * (i.e. when the scope it times is left).
+ * The object cannot be copied/moved to ensure coherency in the
+ * events hierarchy.
+ */
+class ScopeEvent {
+protected:
+    //< The manager to refer to
+    EventManager& m_manager;
+    //< The core event
+    EventManager::CoreEvent* m_event;
+    //< Time to get elapsed time
+    bfps_timer m_timer;
+    //< Is true if it has been created for task
+    bool m_isTask;
+
+public:
+    ScopeEvent(const std::string& inName, EventManager& inManager,
+               const std::string& inUniqueKey)
+        : m_manager(inManager),
+          m_event(inManager.getEvent(inName, inUniqueKey)),
+          m_isTask(false) {
+      m_timer.start();
+    }
+
+    ScopeEvent(const std::string& inName, EventManager& inManager,
+               const std::string& inUniqueKey,
+               const std::stack<EventManager::CoreEvent*>& inParentStack)
+        : m_manager(inManager),
+          m_event(
+              inManager.getEventFromContext(inName, inUniqueKey, inParentStack)),
+          m_isTask(true) {
+      m_timer.start();
+    }
+
+    ~ScopeEvent() {
+      m_event->addRecord(m_timer.stopAndGetElapsed(), m_isTask);
+      if (m_isTask == false) {
+        m_manager.popEvent(m_event);
+      } else {
+        m_manager.popContext(m_event);
+      }
+    }
+
+    ScopeEvent(const ScopeEvent&) = delete;
+    ScopeEvent& operator=(const ScopeEvent&) = delete;
+    ScopeEvent(ScopeEvent&&) = delete;
+    ScopeEvent& operator=(ScopeEvent&&) = delete;
+};
+
+#define ScopeEventUniqueKey_Core_To_Str_Ext(X) #X
+#define ScopeEventUniqueKey_Core_To_Str(X) \
+    ScopeEventUniqueKey_Core_To_Str_Ext(X)
+#define ScopeEventUniqueKey __FILE__ ScopeEventUniqueKey_Core_To_Str(__LINE__)
+
+#define ScopeEventMultiRefKey std::string("-- multiref event --")
+
+#ifdef USE_TIMINGOUTPUT
+
+extern EventManager global_timer_manager;
+
+#define TIMEZONE_Core_Merge(x, y) x##y
+#define TIMEZONE_Core_Pre_Merge(x, y) TIMEZONE_Core_Merge(x, y)
+
+#define TIMEZONE(NAME)                                                      \
+  ScopeEvent TIMEZONE_Core_Pre_Merge(____TIMEZONE_AUTO_ID, __LINE__)( \
+      NAME, global_timer_manager, ScopeEventUniqueKey);
+#define TIMEZONE_MULTI_REF(NAME)                                            \
+  ScopeEvent TIMEZONE_Core_Pre_Merge(____TIMEZONE_AUTO_ID, __LINE__)( \
+      NAME, global_timer_manager, ScopeEventMultiRefKey);
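+
+// Usage sketch (illustrative only): with USE_TIMINGOUTPUT defined, a
+// TIMEZONE statement at the top of a scope records the time spent in
+// that scope under the given name, e.g.
+//
+//     void compute_something()
+//     {
+//         TIMEZONE("compute_something");
+//         // ... work is timed until scope exit ...
+//     }
+//
+// TIMEZONE_MULTI_REF behaves the same, but aggregates all call sites
+// that use the same name into a single event, instead of one event per
+// file/line pair.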
+
+#define TIMEZONE_OMP_INIT_PRETASK(VARNAME)                         \
+  auto VARNAME##core = global_timer_manager.getCurrentThreadEvent(); \
+  auto VARNAME = &VARNAME##core;
+#define TIMEZONE_OMP_TASK(NAME, VARNAME)                                    \
+  ScopeEvent TIMEZONE_Core_Pre_Merge(____TIMEZONE_AUTO_ID, __LINE__)( \
+      NAME, global_timer_manager, ScopeEventUniqueKey, *VARNAME);
+#define TIMEZONE_OMP_PRAGMA_TASK_KEY(VARNAME) \
+  shared(global_timer_manager) firstprivate(VARNAME)
+
+#define TIMEZONE_OMP_INIT_PREPARALLEL(NBTHREADS) \
+  global_timer_manager.startParallelRegion(NBTHREADS);
+
+#else
+
+#define TIMEZONE(NAME)
+#define TIMEZONE_MULTI_REF(NAME)
+#define TIMEZONE_OMP_INIT_PRETASK(VARNAME)
+#define TIMEZONE_OMP_TASK(NAME, VARNAME)
+#define TIMEZONE_OMP_PRAGMA_TASK_KEY(VARNAME)
+#define TIMEZONE_OMP_INIT_PREPARALLEL(NBTHREADS)
+
+#endif
+
+
+#endif
diff --git a/bfps/cpp/shared_array.hpp b/bfps/cpp/shared_array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1951e2f9838ccf37367d859206453d3db91e8e19
--- /dev/null
+++ b/bfps/cpp/shared_array.hpp
@@ -0,0 +1,110 @@
+#ifndef SHAREDARRAY_HPP
+#define SHAREDARRAY_HPP
+
+#include <omp.h>
+#include <cassert>
+#include <functional>
+#include <iostream>
+
+// Cannot be used by different parallel sections at the same time
+template <class ValueType>
+class shared_array{
+    int currentNbThreads;
+    ValueType** __restrict__ values;
+    size_t dim;
+
+    std::function<void(ValueType*)> initFunc;
+
+    bool hasBeenMerged;
+
+public:
+    shared_array(const size_t inDim)
+            : currentNbThreads(omp_get_max_threads()),
+              values(nullptr), dim(inDim), hasBeenMerged(false){
+        values = new ValueType*[currentNbThreads];
+        values[0] = new ValueType[dim];
+        for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){
+            values[idxThread] = nullptr;
+        }
+    }
+
+    shared_array(const size_t inDim, std::function<void(ValueType*)> inInitFunc)
+            : shared_array(inDim){
+        setInitFunction(inInitFunc);
+    }
+
+    ~shared_array(){
+        for(int idxThread = 0 ; idxThread < currentNbThreads ; ++idxThread){
+            delete[] values[idxThread];
+        }
+        delete[] values;
+    }
+
+    ValueType* getMasterData(){
+        return values[0];
+    }
+
+    const ValueType* getMasterData() const{
+        return values[0];
+    }
+
+    void merge(){
+        ValueType* __restrict__ dest = values[0];
+        for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){
+            if(values[idxThread]){
+                const ValueType* __restrict__ src = values[idxThread];
+                for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){
+                    dest[idxVal] += src[idxVal];
+                }
+            }
+        }
+        hasBeenMerged = true;
+    }
+    
+    template <class Func>
+    void merge(Func func){
+        ValueType* __restrict__ dest = values[0];
+        for(int idxThread = 1 ; idxThread < currentNbThreads ; ++idxThread){
+            if(values[idxThread]){
+                const ValueType* __restrict__ src = values[idxThread];
+                for( size_t idxVal = 0 ; idxVal < dim ; ++idxVal){
+                    dest[idxVal] = func(idxVal, dest[idxVal], src[idxVal]);
+                }
+            }
+        }
+        hasBeenMerged = true;
+    }
+
+    void mergeParallel(){
+        merge(); // parallel merge not implemented yet, fall back to serial merge
+    }
+
+    template <class Func>
+    void mergeParallel(Func func){
+        merge(func); // parallel merge not implemented yet, fall back to serial merge
+    }
+
+    void setInitFunction(std::function<void(ValueType*)> inInitFunc){
+        initFunc = inInitFunc;
+        initFunc(values[0]);
+    }
+
+    ValueType* getMine(){
+        assert(omp_get_thread_num() < currentNbThreads);
+
+        if(values[omp_get_thread_num()] == nullptr){
+            ValueType* myValue = new ValueType[dim];
+            if(initFunc){
+                initFunc(myValue);
+            }
+
+            values[omp_get_thread_num()] = myValue;
+            return myValue;
+        }
+
+        return values[omp_get_thread_num()];
+    }
+};
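+
+// Usage sketch (illustrative only): each thread accumulates into its own
+// copy inside a parallel region, and the copies are reduced into the
+// master copy afterwards:
+//
+//     shared_array<double> acc(
+//             dim,
+//             [&](double* data){std::fill_n(data, dim, 0.0);});
+//     #pragma omp parallel
+//     {
+//         double* mine = acc.getMine();
+//         // ... accumulate into mine[0 .. dim-1] ...
+//     }
+//     acc.merge(); // sum per-thread copies into acc.getMasterData()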
+
+#endif
diff --git a/bfps/cpp/vorticity_equation.cpp b/bfps/cpp/vorticity_equation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a84e8a30aceefa9943c982b46389a3245aba2b34
--- /dev/null
+++ b/bfps/cpp/vorticity_equation.cpp
@@ -0,0 +1,716 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+
+
+#define NDEBUG
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include "fftw_tools.hpp"
+#include "vorticity_equation.hpp"
+#include "scope_timer.hpp"
+
+
+
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::impose_zero_modes()
+{
+    TIMEZONE("vorticity_equation::impose_zero_modes");
+    this->u->impose_zero_mode();
+    this->v[0]->impose_zero_mode();
+    this->v[1]->impose_zero_mode();
+    this->v[2]->impose_zero_mode();
+}
+
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::update_checkpoint()
+{
+    std::string fname = this->get_current_fname();
+    if (this->kk->layout->myrank == 0)
+    {
+        bool file_exists = false;
+        {
+            struct stat file_buffer;
+            file_exists = (stat(fname.c_str(), &file_buffer) == 0);
+        }
+        if (file_exists)
+        {
+            // check how many fields there are in the checkpoint file
+            // increment checkpoint if needed
+            hsize_t fields_stored;
+            hid_t fid, group_id;
+            fid = H5Fopen(fname.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
+            group_id = H5Gopen(fid, "vorticity/complex", H5P_DEFAULT);
+            H5Gget_num_objs(
+                    group_id,
+                    &fields_stored);
+            bool dset_exists = H5Lexists(
+                    group_id,
+                    std::to_string(this->iteration).c_str(),
+                    H5P_DEFAULT);
+            H5Gclose(group_id);
+            H5Fclose(fid);
+            if ((fields_stored >= this->checkpoints_per_file) &&
+                !dset_exists)
+                this->checkpoint++;
+        }
+        else
+        {
+            // create file, create fields_stored dset
+            hid_t fid = H5Fcreate(
+                    fname.c_str(),
+                    H5F_ACC_EXCL,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT);
+            hid_t gg = H5Gcreate(
+                    fid,
+                    "vorticity",
+                    H5P_DEFAULT,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT);
+            hid_t ggg = H5Gcreate(
+                    gg,
+                    "complex",
+                    H5P_DEFAULT,
+                    H5P_DEFAULT,
+                    H5P_DEFAULT);
+            H5Gclose(ggg);
+            H5Gclose(gg);
+            H5Fclose(fid);
+        }
+    }
+    MPI_Bcast(&this->checkpoint, 1, MPI_INT, 0, this->kk->layout->comm);
+}
+
+template <class rnumber,
+          field_backend be>
+vorticity_equation<rnumber, be>::vorticity_equation(
+        const char *NAME,
+        int nx,
+        int ny,
+        int nz,
+        double DKX,
+        double DKY,
+        double DKZ,
+        unsigned FFTW_PLAN_RIGOR)
+{
+    TIMEZONE("vorticity_equation::vorticity_equation");
+    /* initialize name and basic stuff */
+    strncpy(this->name, NAME, 256);
+    this->name[255] = '\0';
+    this->iteration = 0;
+    this->checkpoint = 0;
+
+    /* initialize fields */
+    this->cvorticity = new field<rnumber, be, THREE>(
+            nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR);
+    this->rvorticity = new field<rnumber, be, THREE>(
+            nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR);
+    this->v[1] = new field<rnumber, be, THREE>(
+            nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR);
+    this->v[2] = new field<rnumber, be, THREE>(
+            nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR);
+    this->v[0] = this->cvorticity;
+    this->v[3] = this->cvorticity;
+
+    this->cvelocity = new field<rnumber, be, THREE>(
+            nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR);
+    this->rvelocity = new field<rnumber, be, THREE>(
+            nx, ny, nz, MPI_COMM_WORLD, FFTW_PLAN_RIGOR);
+    this->u = this->cvelocity;
+
+    /* initialize kspace */
+    this->kk = new kspace<be, SMOOTH>(
+            this->cvorticity->clayout, DKX, DKY, DKZ);
+
+    /* ``physical'' parameters etc, initialized here just in case */
+
+    this->nu = 0.1;
+    this->fmode = 1;
+    this->famplitude = 1.0;
+    this->fk0  = 2.0;
+    this->fk1 = 4.0;
+}
+
+template <class rnumber,
+          field_backend be>
+vorticity_equation<rnumber, be>::~vorticity_equation()
+{
+    TIMEZONE("vorticity_equation::~vorticity_equation");
+    delete this->kk;
+    delete this->cvorticity;
+    delete this->rvorticity;
+    delete this->v[1];
+    delete this->v[2];
+    delete this->cvelocity;
+    delete this->rvelocity;
+}
+
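+/* Compute vorticity from velocity, in Fourier space:
+ *   omega_hat = i k x u_hat,
+ * with modes beyond the dealiasing cutoff set to zero.
+ * */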
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::compute_vorticity()
+{
+    TIMEZONE("vorticity_equation::compute_vorticity");
+    this->cvorticity->real_space_representation = false;
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            this->cvorticity->cval(cindex,0,0) = -(this->kk->ky[yindex]*this->u->cval(cindex,2,1) - this->kk->kz[zindex]*this->u->cval(cindex,1,1));
+            this->cvorticity->cval(cindex,0,1) =  (this->kk->ky[yindex]*this->u->cval(cindex,2,0) - this->kk->kz[zindex]*this->u->cval(cindex,1,0));
+            this->cvorticity->cval(cindex,1,0) = -(this->kk->kz[zindex]*this->u->cval(cindex,0,1) - this->kk->kx[xindex]*this->u->cval(cindex,2,1));
+            this->cvorticity->cval(cindex,1,1) =  (this->kk->kz[zindex]*this->u->cval(cindex,0,0) - this->kk->kx[xindex]*this->u->cval(cindex,2,0));
+            this->cvorticity->cval(cindex,2,0) = -(this->kk->kx[xindex]*this->u->cval(cindex,1,1) - this->kk->ky[yindex]*this->u->cval(cindex,0,1));
+            this->cvorticity->cval(cindex,2,1) =  (this->kk->kx[xindex]*this->u->cval(cindex,1,0) - this->kk->ky[yindex]*this->u->cval(cindex,0,0));
+            //ptrdiff_t tindex = 3*cindex;
+            //this->cvorticity->get_cdata()[tindex+0][0] = -(this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][1] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][1]);
+            //this->cvorticity->get_cdata()[tindex+1][0] = -(this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][1] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][1]);
+            //this->cvorticity->get_cdata()[tindex+2][0] = -(this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][1] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][1]);
+            //this->cvorticity->get_cdata()[tindex+0][1] =  (this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][0] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][0]);
+            //this->cvorticity->get_cdata()[tindex+1][1] =  (this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][0] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][0]);
+            //this->cvorticity->get_cdata()[tindex+2][1] =  (this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][0] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][0]);
+        }
+        else
+            std::fill_n((rnumber*)(this->cvorticity->get_cdata()+3*cindex), 6, 0.0);
+    }
+    );
+    this->cvorticity->symmetrize();
+}
+
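+/* Invert the curl in Fourier space (Biot-Savart):
+ *   u_hat = i k x omega_hat / k^2,
+ * which holds because the vorticity is divergence-free; the k=0 mode
+ * and modes beyond the dealiasing cutoff are set to zero.
+ * */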
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::compute_velocity(field<rnumber, be, THREE> *vorticity)
+{
+    TIMEZONE("vorticity_equation::compute_velocity");
+    this->u->real_space_representation = false;
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2 && k2 > 0)
+        {
+            this->u->cval(cindex,0,0) = -(this->kk->ky[yindex]*vorticity->cval(cindex,2,1) - this->kk->kz[zindex]*vorticity->cval(cindex,1,1)) / k2;
+            this->u->cval(cindex,0,1) =  (this->kk->ky[yindex]*vorticity->cval(cindex,2,0) - this->kk->kz[zindex]*vorticity->cval(cindex,1,0)) / k2;
+            this->u->cval(cindex,1,0) = -(this->kk->kz[zindex]*vorticity->cval(cindex,0,1) - this->kk->kx[xindex]*vorticity->cval(cindex,2,1)) / k2;
+            this->u->cval(cindex,1,1) =  (this->kk->kz[zindex]*vorticity->cval(cindex,0,0) - this->kk->kx[xindex]*vorticity->cval(cindex,2,0)) / k2;
+            this->u->cval(cindex,2,0) = -(this->kk->kx[xindex]*vorticity->cval(cindex,1,1) - this->kk->ky[yindex]*vorticity->cval(cindex,0,1)) / k2;
+            this->u->cval(cindex,2,1) =  (this->kk->kx[xindex]*vorticity->cval(cindex,1,0) - this->kk->ky[yindex]*vorticity->cval(cindex,0,0)) / k2;
+            //ptrdiff_t tindex = 3*cindex;
+            //this->u->get_cdata()[tindex+0][0] = -(this->kk->ky[yindex]*vorticity->get_cdata()[tindex+2][1] - this->kk->kz[zindex]*vorticity->get_cdata()[tindex+1][1]) / k2;
+            //this->u->get_cdata()[tindex+0][1] =  (this->kk->ky[yindex]*vorticity->get_cdata()[tindex+2][0] - this->kk->kz[zindex]*vorticity->get_cdata()[tindex+1][0]) / k2;
+            //this->u->get_cdata()[tindex+1][0] = -(this->kk->kz[zindex]*vorticity->get_cdata()[tindex+0][1] - this->kk->kx[xindex]*vorticity->get_cdata()[tindex+2][1]) / k2;
+            //this->u->get_cdata()[tindex+1][1] =  (this->kk->kz[zindex]*vorticity->get_cdata()[tindex+0][0] - this->kk->kx[xindex]*vorticity->get_cdata()[tindex+2][0]) / k2;
+            //this->u->get_cdata()[tindex+2][0] = -(this->kk->kx[xindex]*vorticity->get_cdata()[tindex+1][1] - this->kk->ky[yindex]*vorticity->get_cdata()[tindex+0][1]) / k2;
+            //this->u->get_cdata()[tindex+2][1] =  (this->kk->kx[xindex]*vorticity->get_cdata()[tindex+1][0] - this->kk->ky[yindex]*vorticity->get_cdata()[tindex+0][0]) / k2;
+        }
+        else
+            std::fill_n((rnumber*)(this->u->get_cdata()+3*cindex), 6, 0.0);
+    }
+    );
+    this->u->symmetrize();
+}
+
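+/* Add the forcing term to dst.
+ * "none":       no forcing;
+ * "Kolmogorov": force a single Fourier mode (fmode and its conjugate)
+ *               with amplitude famplitude;
+ * "linear":     add famplitude*vorticity for wavenumbers in the band
+ *               fk0 <= |k| <= fk1.
+ * */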
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::add_forcing(
+        field<rnumber, be, THREE> *dst,
+        field<rnumber, be, THREE> *vort_field,
+        rnumber factor)
+{
+    TIMEZONE("vorticity_equation::add_forcing");
+    if (strcmp(this->forcing_type, "none") == 0)
+        return;
+    if (strcmp(this->forcing_type, "Kolmogorov") == 0)
+    {
+        ptrdiff_t cindex;
+        if (this->cvorticity->clayout->myrank == this->cvorticity->clayout->rank[0][this->fmode])
+        {
+            cindex = ((this->fmode - this->cvorticity->clayout->starts[0]) * this->cvorticity->clayout->sizes[1])*this->cvorticity->clayout->sizes[2];
+            dst->cval(cindex,2, 0) -= this->famplitude*factor/2;
+            //dst->get_cdata()[cindex*3+2][0] -= this->famplitude*factor/2;
+        }
+        if (this->cvorticity->clayout->myrank == this->cvorticity->clayout->rank[0][this->cvorticity->clayout->sizes[0] - this->fmode])
+        {
+            cindex = ((this->cvorticity->clayout->sizes[0] - this->fmode - this->cvorticity->clayout->starts[0]) * this->cvorticity->clayout->sizes[1])*this->cvorticity->clayout->sizes[2];
+            dst->cval(cindex, 2, 0) -= this->famplitude*factor/2;
+            //dst->get_cdata()[cindex*3+2][0] -= this->famplitude*factor/2;
+        }
+        return;
+    }
+    if (strcmp(this->forcing_type, "linear") == 0)
+    {
+        this->kk->CLOOP(
+                    [&](ptrdiff_t cindex,
+                        ptrdiff_t xindex,
+                        ptrdiff_t yindex,
+                        ptrdiff_t zindex){
+            double knorm = sqrt(this->kk->kx[xindex]*this->kk->kx[xindex] +
+                                this->kk->ky[yindex]*this->kk->ky[yindex] +
+                                this->kk->kz[zindex]*this->kk->kz[zindex]);
+            if ((this->fk0 <= knorm) &&
+                    (this->fk1 >= knorm))
+                for (int c=0; c<3; c++)
+                    for (int i=0; i<2; i++)
+                        dst->cval(cindex,c,i) += this->famplitude*vort_field->cval(cindex,c,i)*factor;
+                        //dst->get_cdata()[cindex*3+c][i] += this->famplitude*vort_field->get_cdata()[cindex*3+c][i]*factor;
+        }
+        );
+        return;
+    }
+}
+
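+/* Compute the nonlinear term of the vorticity equation,
+ * curl(u x omega) plus forcing, for the vorticity stored in v[src].
+ * The cross product is formed in real space, transformed back to
+ * Fourier space, dealiased, curled, and projected divergence-free.
+ * The result overwrites the velocity field this->u.
+ * */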
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::omega_nonlin(
+        int src)
+{
+    DEBUG_MSG("vorticity_equation::omega_nonlin(%d)\n", src);
+    assert(src >= 0 && src < 3);
+    this->compute_velocity(this->v[src]);
+    /* get fields from Fourier space to real space */
+    this->u->ift();
+    this->rvorticity->real_space_representation = false;
+    *this->rvorticity = this->v[src]->get_cdata();
+    this->rvorticity->ift();
+    /* compute cross product $u \times \omega$, and normalize */
+    this->u->RLOOP(
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+        //ptrdiff_t tindex = 3*rindex;
+        rnumber tmp[3];
+        for (int cc=0; cc<3; cc++)
+            tmp[cc] = (this->u->rval(rindex,(cc+1)%3)*this->rvorticity->rval(rindex,(cc+2)%3) -
+                       this->u->rval(rindex,(cc+2)%3)*this->rvorticity->rval(rindex,(cc+1)%3));
+            //tmp[cc][0] = (this->u->get_rdata()[tindex+(cc+1)%3]*this->rvorticity->get_rdata()[tindex+(cc+2)%3] -
+            //              this->u->get_rdata()[tindex+(cc+2)%3]*this->rvorticity->get_rdata()[tindex+(cc+1)%3]);
+        for (int cc=0; cc<3; cc++)
+            this->u->rval(rindex,cc) = tmp[cc] / this->u->npoints;
+            //this->u->get_rdata()[(3*rindex)+cc] = tmp[cc][0] / this->u->npoints;
+    }
+    );
+    /* go back to Fourier space */
+    //this->clean_up_real_space(this->ru, 3);
+    this->u->dft();
+    this->kk->template dealias<rnumber, THREE>(this->u->get_cdata());
+    /* $\imath k \times Fourier(u \times \omega)$ */
+    this->kk->CLOOP(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+        rnumber tmp[3][2];
+        {
+            tmp[0][0] = -(this->kk->ky[yindex]*this->u->cval(cindex,2,1) - this->kk->kz[zindex]*this->u->cval(cindex,1,1));
+            tmp[1][0] = -(this->kk->kz[zindex]*this->u->cval(cindex,0,1) - this->kk->kx[xindex]*this->u->cval(cindex,2,1));
+            tmp[2][0] = -(this->kk->kx[xindex]*this->u->cval(cindex,1,1) - this->kk->ky[yindex]*this->u->cval(cindex,0,1));
+            tmp[0][1] =  (this->kk->ky[yindex]*this->u->cval(cindex,2,0) - this->kk->kz[zindex]*this->u->cval(cindex,1,0));
+            tmp[1][1] =  (this->kk->kz[zindex]*this->u->cval(cindex,0,0) - this->kk->kx[xindex]*this->u->cval(cindex,2,0));
+            tmp[2][1] =  (this->kk->kx[xindex]*this->u->cval(cindex,1,0) - this->kk->ky[yindex]*this->u->cval(cindex,0,0));
+        }
+        //ptrdiff_t tindex = 3*cindex;
+        //{
+        //    tmp[0][0] = -(this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][1] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][1]);
+        //    tmp[1][0] = -(this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][1] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][1]);
+        //    tmp[2][0] = -(this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][1] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][1]);
+        //    tmp[0][1] =  (this->kk->ky[yindex]*this->u->get_cdata()[tindex+2][0] - this->kk->kz[zindex]*this->u->get_cdata()[tindex+1][0]);
+        //    tmp[1][1] =  (this->kk->kz[zindex]*this->u->get_cdata()[tindex+0][0] - this->kk->kx[xindex]*this->u->get_cdata()[tindex+2][0]);
+        //    tmp[2][1] =  (this->kk->kx[xindex]*this->u->get_cdata()[tindex+1][0] - this->kk->ky[yindex]*this->u->get_cdata()[tindex+0][0]);
+        //}
+        for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+            this->u->cval(cindex, cc, i) = tmp[cc][i];
+            //this->u->get_cdata()[3*cindex+cc][i] = tmp[cc][i];
+    }
+    );
+    this->add_forcing(this->u, this->v[src], 1.0);
+    this->kk->template force_divfree<rnumber>(this->u->get_cdata());
+}
+
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::step(double dt)
+{
+    DEBUG_MSG("vorticity_equation::step\n");
+    TIMEZONE("vorticity_equation::step");
+    *this->v[1] = 0.0;
+    this->omega_nonlin(0);
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            double factor0;
+            factor0 = exp(-this->nu * k2 * dt);
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->v[1]->cval(cindex,cc,i) = (
+                        this->v[0]->cval(cindex,cc,i) +
+                        dt*this->u->cval(cindex,cc,i))*factor0;
+                //this->v[1]->get_cdata()[3*cindex+cc][i] = (
+                //        this->v[0]->get_cdata()[3*cindex+cc][i] +
+                //        dt*this->u->get_cdata()[3*cindex+cc][i])*factor0;
+        }
+    }
+    );
+
+    this->omega_nonlin(1);
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            double factor0, factor1;
+            factor0 = exp(-this->nu * k2 * dt/2);
+            factor1 = exp( this->nu * k2 * dt/2);
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->v[2]->cval(cindex, cc, i) = (
+                        3*this->v[0]->cval(cindex,cc,i)*factor0 +
+                        ( this->v[1]->cval(cindex,cc,i) +
+                         dt*this->u->cval(cindex,cc,i))*factor1)*0.25;
+                //this->v[2]->get_cdata()[3*cindex+cc][i] = (
+                //        3*this->v[0]->get_cdata()[3*cindex+cc][i]*factor0 +
+                //        (this->v[1]->get_cdata()[3*cindex+cc][i] +
+                //         dt*this->u->get_cdata()[3*cindex+cc][i])*factor1)*0.25;
+        }
+    }
+    );
+
+    this->omega_nonlin(2);
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            double factor0;
+            factor0 = exp(-this->nu * k2 * dt * 0.5);
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->v[3]->cval(cindex,cc,i) = (
+                        this->v[0]->cval(cindex,cc,i)*factor0 +
+                        2*(this->v[2]->cval(cindex,cc,i) +
+                           dt*this->u->cval(cindex,cc,i)))*factor0/3;
+                //this->v[3]->get_cdata()[3*cindex+cc][i] = (
+                //        this->v[0]->get_cdata()[3*cindex+cc][i]*factor0 +
+                //        2*(this->v[2]->get_cdata()[3*cindex+cc][i] +
+                //           dt*this->u->get_cdata()[3*cindex+cc][i]))*factor0/3;
+        }
+    }
+    );
+
+    this->kk->template force_divfree<rnumber>(this->cvorticity->get_cdata());
+    this->cvorticity->symmetrize();
+    this->iteration++;
+}
+
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::compute_pressure(field<rnumber, be, ONE> *pressure)
+{
+    TIMEZONE("vorticity_equation::compute_pressure");
+    /* assume velocity is already in real space representation */
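+    /* In Fourier space the pressure Poisson equation reads
+     *   k^2 p_hat = -k_i k_j (u_i u_j)_hat,
+     * and it is solved here in two passes: first the diagonal terms of
+     * u_i u_j (11, 22, 33), then the off-diagonal ones (12, 23, 31),
+     * which enter twice by symmetry. */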
+
+    this->v[1]->real_space_representation = true;
+    /* diagonal terms 11 22 33 */
+    this->v[1]->RLOOP (
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+        //ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->v[1]->rval(rindex,cc) = this->u->rval(rindex,cc)*this->u->rval(rindex,cc);
+            //this->v[1]->get_rdata()[tindex+cc] = this->u->get_rdata()[tindex+cc]*this->u->get_rdata()[tindex+cc];
+        }
+        );
+    //this->clean_up_real_space(this->rv[1], 3);
+    this->v[1]->dft();
+    this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata());
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2 && k2 > 0)
+        {
+            ptrdiff_t tindex = 3*cindex;
+            for (int i=0; i<2; i++)
+            {
+                pressure->get_cdata()[cindex][i] = \
+                    -(this->kk->kx[xindex]*this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][i] +
+                      this->kk->ky[yindex]*this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][i] +
+                      this->kk->kz[zindex]*this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][i]);
+            }
+        }
+        else
+            std::fill_n((rnumber*)(pressure->get_cdata()+cindex), 2, 0.0);
+    }
+    );
+    /* off-diagonal terms 12 23 31 */
+    this->v[1]->real_space_representation = true;
+    this->v[1]->RLOOP (
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+        //ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->v[1]->rval(rindex,cc) = this->u->rval(rindex,cc)*this->u->rval(rindex,(cc+1)%3);
+            //this->v[1]->get_rdata()[tindex+cc] = this->u->get_rdata()[tindex+cc]*this->u->get_rdata()[tindex+(cc+1)%3];
+    }
+    );
+    //this->clean_up_real_space(this->rv[1], 3);
+    this->v[1]->dft();
+    this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata());
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2 && k2 > 0)
+        {
+            ptrdiff_t tindex = 3*cindex;
+            for (int i=0; i<2; i++)
+            {
+                pressure->get_cdata()[cindex][i] -= \
+                    2*(this->kk->kx[xindex]*this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+0][i] +
+                       this->kk->ky[yindex]*this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+1][i] +
+                       this->kk->kz[zindex]*this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+2][i]);
+                pressure->get_cdata()[cindex][i] /= pressure->npoints*k2;
+            }
+        }
+    }
+    );
+}
+
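+/* Lagrangian acceleration Du/Dt = -grad(p) + nu laplacian(u) + f,
+ * assembled in Fourier space; the forcing contribution is only included
+ * for the "linear" forcing type.
+ * */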
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::compute_Lagrangian_acceleration(
+        field<rnumber, be, THREE> *acceleration)
+{
+    field<rnumber, be, ONE> *pressure = new field<rnumber, be, ONE>(
+            this->cvelocity->rlayout->sizes[2],
+            this->cvelocity->rlayout->sizes[1],
+            this->cvelocity->rlayout->sizes[0],
+            this->cvelocity->rlayout->comm,
+            this->cvelocity->fftw_plan_rigor);
+    this->compute_velocity(this->cvorticity);
+    this->cvelocity->ift();
+    this->compute_pressure(pressure);
+    this->compute_velocity(this->cvorticity);
+    acceleration->real_space_representation = false;
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            ptrdiff_t tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    acceleration->get_cdata()[tindex+cc][i] = \
+                        - this->nu*k2*this->cvelocity->get_cdata()[tindex+cc][i];
+            if (strcmp(this->forcing_type, "linear") == 0)
+            {
+                double knorm = sqrt(k2);
+                if ((this->fk0 <= knorm) &&
+                        (this->fk1 >= knorm))
+                    for (int c=0; c<3; c++)
+                        for (int i=0; i<2; i++)
+                            acceleration->get_cdata()[tindex+c][i] += \
+                                this->famplitude*this->cvelocity->get_cdata()[tindex+c][i];
+            }
+            acceleration->get_cdata()[tindex+0][0] += this->kk->kx[xindex]*pressure->get_cdata()[cindex][1];
+            acceleration->get_cdata()[tindex+1][0] += this->kk->ky[yindex]*pressure->get_cdata()[cindex][1];
+            acceleration->get_cdata()[tindex+2][0] += this->kk->kz[zindex]*pressure->get_cdata()[cindex][1];
+            acceleration->get_cdata()[tindex+0][1] -= this->kk->kx[xindex]*pressure->get_cdata()[cindex][0];
+            acceleration->get_cdata()[tindex+1][1] -= this->kk->ky[yindex]*pressure->get_cdata()[cindex][0];
+            acceleration->get_cdata()[tindex+2][1] -= this->kk->kz[zindex]*pressure->get_cdata()[cindex][0];
+        }
+        });
+    delete pressure;
+}
+
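+/* Eulerian acceleration du/dt = P[-div(u u) + nu laplacian(u) + f],
+ * with P the solenoidal projection (force_divfree, applied at the end);
+ * as above, the forcing contribution is only included for the "linear"
+ * forcing type.
+ * */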
+template <class rnumber,
+          field_backend be>
+void vorticity_equation<rnumber, be>::compute_Eulerian_acceleration(
+        field<rnumber, be, THREE> *acceleration)
+{
+    this->compute_velocity(this->cvorticity);
+    acceleration->real_space_representation = false;
+    /* put in linear terms */
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            ptrdiff_t tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    acceleration->get_cdata()[tindex+cc][i] = \
+                        - this->nu*k2*this->cvelocity->get_cdata()[tindex+cc][i];
+            if (strcmp(this->forcing_type, "linear") == 0)
+            {
+                double knorm = sqrt(k2);
+                if ((this->fk0 <= knorm) &&
+                        (this->fk1 >= knorm))
+                {
+                    for (int c=0; c<3; c++)
+                        for (int i=0; i<2; i++)
+                            acceleration->get_cdata()[tindex+c][i] += \
+                                this->famplitude*this->cvelocity->get_cdata()[tindex+c][i];
+                }
+            }
+        }
+    }
+    );
+    this->cvelocity->ift();
+    /* compute uu */
+    /* 11 22 33 */
+    this->v[1]->real_space_representation = true;
+    this->cvelocity->RLOOP (
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+        //ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->v[1]->rval(rindex,cc) = \
+                this->cvelocity->rval(rindex,cc)*this->cvelocity->rval(rindex,cc) / this->cvelocity->npoints;
+            //this->v[1]->get_rdata()[tindex+cc] = this->cvelocity->get_rdata()[tindex+cc]*this->cvelocity->get_rdata()[tindex+cc] / this->cvelocity->npoints;
+    }
+    );
+    this->v[1]->dft();
+    this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata());
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            ptrdiff_t tindex = 3*cindex;
+            acceleration->get_cdata()[tindex+0][0] +=
+                    this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][1];
+            acceleration->get_cdata()[tindex+0][1] +=
+                   -this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][0];
+            acceleration->get_cdata()[tindex+1][0] +=
+                    this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][1];
+            acceleration->get_cdata()[tindex+1][1] +=
+                   -this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][0];
+            acceleration->get_cdata()[tindex+2][0] +=
+                    this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][1];
+            acceleration->get_cdata()[tindex+2][1] +=
+                   -this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][0];
+        }
+    }
+    );
+    /* 12 23 31 */
+    this->v[1]->real_space_representation = true;
+    this->cvelocity->RLOOP (
+                [&](ptrdiff_t rindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex){
+        //ptrdiff_t tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->v[1]->rval(rindex,cc) = \
+                this->cvelocity->rval(rindex,cc)*this->cvelocity->rval(rindex,(cc+1)%3) / this->cvelocity->npoints;
+            //this->v[1]->get_rdata()[tindex+cc] = this->cvelocity->get_rdata()[tindex+cc]*this->cvelocity->get_rdata()[tindex+(cc+1)%3] / this->cvelocity->npoints;
+    }
+    );
+    this->v[1]->dft();
+    this->kk->template dealias<rnumber, THREE>(this->v[1]->get_cdata());
+    this->kk->CLOOP_K2(
+                [&](ptrdiff_t cindex,
+                    ptrdiff_t xindex,
+                    ptrdiff_t yindex,
+                    ptrdiff_t zindex,
+                    double k2){
+        if (k2 <= this->kk->kM2)
+        {
+            ptrdiff_t tindex = 3*cindex;
+            acceleration->get_cdata()[tindex+0][0] +=
+                    (this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+0][1] +
+                     this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][1]);
+            acceleration->get_cdata()[tindex+0][1] +=
+                  - (this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+0][0] +
+                     this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+2][0]);
+            acceleration->get_cdata()[tindex+1][0] +=
+                    (this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+1][1] +
+                     this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][1]);
+            acceleration->get_cdata()[tindex+1][1] +=
+                  - (this->kk->kz[zindex]*this->v[1]->get_cdata()[tindex+1][0] +
+                     this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+0][0]);
+            acceleration->get_cdata()[tindex+2][0] +=
+                    (this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+2][1] +
+                     this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][1]);
+            acceleration->get_cdata()[tindex+2][1] +=
+                  - (this->kk->kx[xindex]*this->v[1]->get_cdata()[tindex+2][0] +
+                     this->kk->ky[yindex]*this->v[1]->get_cdata()[tindex+1][0]);
+        }
+    }
+    );
+    if (this->kk->layout->myrank == this->kk->layout->rank[0][0])
+        std::fill_n((rnumber*)(acceleration->get_cdata()), 6, 0.0);
+    this->kk->template force_divfree<rnumber>(acceleration->get_cdata());
+}
+
+
+/*****************************************************************************/
+
+
+
+
+/*****************************************************************************/
+/* finally, force generation of code for single and double precision      */
+template class vorticity_equation<float, FFTW>;
+template class vorticity_equation<double, FFTW>;
+/*****************************************************************************/
+
diff --git a/bfps/cpp/vorticity_equation.hpp b/bfps/cpp/vorticity_equation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..60d566ed9f149c5a5e4848a2b4640c7378b05e98
--- /dev/null
+++ b/bfps/cpp/vorticity_equation.hpp
@@ -0,0 +1,137 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+
+#include "field.hpp"
+#include "field_descriptor.hpp"
+
+#ifndef VORTICITY_EQUATION
+
+#define VORTICITY_EQUATION
+
+extern int myrank, nprocs;
+
+
+/* container for field descriptor, fields themselves, parameters, etc
+ * This particular class is only meant as a stepping stone to a proper solver
+ * that only uses the field class (and related layout and kspace classes), and
+ * HDF5 for I/O.
+ * */
+
+template <typename rnumber,
+          field_backend be>
+class vorticity_equation
+{
+    public:
+        /* name */
+        char name[256];
+
+        /* iteration */
+        int iteration;
+        int checkpoint;
+        int checkpoints_per_file;
+
+        /* fields */
+        field<rnumber, be, THREE> *cvorticity, *cvelocity;
+        field<rnumber, be, THREE> *rvorticity, *rvelocity;
+        kspace<be, SMOOTH> *kk;
+
+
+        /* short names for velocity, and 4 vorticity fields */
+        field<rnumber, be, THREE> *u, *v[4];
+
+        /* physical parameters */
+        double nu;
+        int fmode;         // for Kolmogorov flow
+        double famplitude; // both for Kflow and band forcing
+        double fk0, fk1;   // for band forcing
+        char forcing_type[128];
+
+        /* constructor, destructor */
+        vorticity_equation(
+                const char *NAME,
+                int nx,
+                int ny,
+                int nz,
+                double DKX = 1.0,
+                double DKY = 1.0,
+                double DKZ = 1.0,
+                unsigned FFTW_PLAN_RIGOR = FFTW_MEASURE);
+        ~vorticity_equation(void);
+
+        /* solver essential methods */
+        void omega_nonlin(int src);
+        void step(double dt);
+        void impose_zero_modes(void);
+        void add_forcing(field<rnumber, be, THREE> *dst,
+                         field<rnumber, be, THREE> *src_vorticity,
+                         rnumber factor);
+        void compute_vorticity(void);
+        void compute_velocity(field<rnumber, be, THREE> *vorticity);
+
+        /* I/O stuff */
+        inline std::string get_current_fname()
+        {
+            return (
+                    std::string(this->name) +
+                    std::string("_checkpoint_") +
+                    std::to_string(this->checkpoint) +
+                    std::string(".h5"));
+        }
+        void update_checkpoint(void);
+        inline void io_checkpoint(bool read = true)
+        {
+            assert(!this->cvorticity->real_space_representation);
+            if (!read)
+                this->update_checkpoint();
+            std::string fname = this->get_current_fname();
+            this->cvorticity->io(
+                    fname,
+                    "vorticity",
+                    this->iteration,
+                    read);
+            if (read)
+            {
+                #if (__GNUC__ <= 4 && __GNUC_MINOR__ <= 7)
+                    this->kk->low_pass<rnumber, THREE>(this->cvorticity->get_cdata(), this->kk->kM);
+                    this->kk->force_divfree<rnumber>(this->cvorticity->get_cdata());
+                #else
+                    this->kk->template low_pass<rnumber, THREE>(this->cvorticity->get_cdata(), this->kk->kM);
+                    this->kk->template force_divfree<rnumber>(this->cvorticity->get_cdata());
+                #endif
+            }
+        }
+
+        /* statistics and general postprocessing */
+        void compute_pressure(field<rnumber, be, ONE> *pressure);
+        void compute_Eulerian_acceleration(field<rnumber, be, THREE> *acceleration);
+        void compute_Lagrangian_acceleration(field<rnumber, be, THREE> *acceleration);
+};
+
+#endif//VORTICITY_EQUATION
+
diff --git a/bfps/tools.py b/bfps/tools.py
index ff5d365aa979fd0c98b9ab64fe8a2a5404f05474..69756ec648409ab52d57930d26b1ab1ca8b942c1 100644
--- a/bfps/tools.py
+++ b/bfps/tools.py
@@ -28,6 +28,36 @@ import sys
 import math
 import numpy as np
 
+import h5py
+
+def create_alloc_early_dataset(
+        data_file,
+        dset_name,
+        dset_shape,
+        dset_maxshape,
+        dset_chunks,
+        # maybe something more general can be used here
+        dset_dtype = h5py.h5t.IEEE_F64LE):
+    # create the dataspace.
+    space_id = h5py.h5s.create_simple(
+            dset_shape,
+            dset_maxshape)
+    # create the dataset creation property list.
+    dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE)
+    # set the allocation time to "early".
+    dcpl.set_alloc_time(h5py.h5d.ALLOC_TIME_EARLY)
+    dcpl.set_chunk(dset_chunks)
+    # and now create dataset
+    if sys.version_info[0] == 3:
+        dset_name = dset_name.encode()
+    return h5py.h5d.create(
+            data_file.id,
+            dset_name,
+            dset_dtype,
+            space_id,
+            dcpl,
+            h5py.h5p.DEFAULT)
+
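+# Usage sketch for create_alloc_early_dataset (names and shapes are made
+# up for the example):
+#
+#     with h5py.File('example.h5', 'a') as data_file:
+#         create_alloc_early_dataset(
+#                 data_file,
+#                 'moments',
+#                 (16, 10),                  # initial shape
+#                 (h5py.h5s.UNLIMITED, 10),  # maximum shape
+#                 (1, 10))                   # chunk shape
+#
+# "Early" allocation makes HDF5 reserve the space when the dataset is
+# created, rather than when data is first written to it.
+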
 def generate_data_3D_uniform(
         n0, n1, n2,
         dtype = np.complex128,
diff --git a/documentation/_static/overview.rst b/documentation/_static/overview.rst
index 607cfcc4774cbfd583240d2e7f9bad3cc766af80..afe7a753666e6ea5911ce1266d0803aa25ea5c45 100644
--- a/documentation/_static/overview.rst
+++ b/documentation/_static/overview.rst
@@ -2,6 +2,65 @@
 Overview and Tutorial
 =====================
 
+----------------
+General comments
+----------------
+
+The purpose of this code is to run pseudo-spectral DNS of turbulence,
+and integrate particle trajectories in the resulting fields.
+In brief, the main aim of the code is to simplify the launching of
+compute jobs and postprocessing, up to and including the generation of
+publication-ready figures.
+
+For research, people routinely write code from scratch because research
+goals change to a point where modifying the previous code is too
+expensive.
+With bfps, the desire is to identify core functionality that should be
+implemented in a library.
+The core library can then be used by many problem-specific codes.
+
+In this sense, the structuring of the code-base is non-standard.
+The core functionality is implemented in C++ (classes for working with
+fields or sets of particles), while a python wrapper is used for
+generating "main" programmes to be linked against the core library.
+The core library uses MPI for parallelization, and the python wrapper
+compiles this core library when being installed.
+The compilation environment can be configured for different
+machines as required.
+
+Python3 "wrapper"
+-----------------
+
+In principle, users of the code should only need to use python3 for
+launching jobs and postprocessing data.
+While python2 compatibility should not be too hard to maintain, it is
+cumbersome in practice, because the code makes extensive use of strings
+for `HDF5` I/O.
+
+Classes defined in the python package can be used to generate executable
+codes, to compile and launch them, and then to access and postprocess
+the data.
+Obviously, postprocessing methods can be optimized with C extensions or
+otherwise, as needed.
+
+Code generation is quite straightforward, with C++ code snippets handled
+as strings in the python code, such that they can be combined in
+different ways.
+
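+For illustration, a sketch of what this looks like (the names here are
+made up for the example, they are not part of the actual API):
+
+.. code:: python
+
+    cpp_lines = [
+            'field<float, FFTW, THREE> *vorticity;',
+            'vorticity = new field<float, FFTW, THREE>(',
+            '        nx, ny, nz, MPI_COMM_WORLD, FFTW_MEASURE);']
+    main_src = main_template.format(
+            definitions = '\n'.join(cpp_lines))
+    with open('main.cpp', 'w') as outfile:
+        outfile.write(main_src)
+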
+Once a "main" file has been written, it is compiled and linked against
+the core library.
+Depending on machine-specific settings, the code can then be launched
+directly, or job scripts appropriate for queueing systems are generated
+and submitted.
+
+C++ core library
+----------------
+
+A small set of base classes is implemented.
+
+[ some details to be added here ]
+
 ---------
 Equations
 ---------
diff --git a/documentation/figs/interpolation.py b/documentation/figs/interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..302efcc157971b8b0407bb76bd3e7be6437f1206
--- /dev/null
+++ b/documentation/figs/interpolation.py
@@ -0,0 +1,52 @@
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import math
+
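+# Sketch of the interpolation problem for slab-decomposed fields: each
+# process p owns a slab of `slab` z-planes, and interpolating at a point
+# with a kernel of n neighbours requires a (2n+1)^3 cube of grid values,
+# which may extend into slabs owned by neighbouring processes.
+# The figure is saved as interp_problem.pdf.
+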
+def main():
+    slab = 2
+    nproc = 5
+    f = plt.figure(figsize = (6, 4.5))
+    a = f.add_subplot(111)
+    for p in range(nproc):
+        color = plt.get_cmap('plasma')(p*1./nproc)
+        a.add_patch(
+                mpatches.Rectangle(
+                        [0, p*slab],
+                        slab*(nproc+2)-1, 1,
+                        color = color,
+                        alpha = .2))
+        a.text(-.5, p*slab+.5, '$p_{0}$'.format(p),
+               verticalalignment = 'center')
+        for y in range((nproc+2)*slab):
+            a.plot([y, y],
+                   range(p*slab, (p+1)*slab),
+                   marker = '.',
+                   linestyle = 'none',
+                   color = color)
+    for X, Y in [(9.9, 6.3),
+                 (3.3, 3.7)]:
+        a.plot([X], [Y],
+               color = 'black',
+               marker = 'x')
+        for n in [1, 2]:
+            a.add_patch(
+                    mpatches.Rectangle(
+                            [math.floor(X-n), math.floor(Y-n)],
+                            2*n+1, 2*n+1,
+                            color = 'green',
+                            alpha = .2))
+            a.text(math.floor(X)+.5, math.floor(Y - n)-.3,
+                   '$n = {0}$'.format(n),
+                   horizontalalignment = 'center')
+    a.set_ylim(bottom = -1, top = 10)
+    a.set_xlim(left = -1)
+    a.set_ylabel('$z$')
+    a.set_xlabel('$x,y$')
+    a.set_aspect('equal')
+    f.tight_layout()
+    f.savefig('interp_problem.pdf')
+    return None
+
+if __name__ == '__main__':
+    main()
+
diff --git a/done.txt b/done.txt
deleted file mode 100644
index 2064592cc9dd7a6e278c9980770882e636b8a2be..0000000000000000000000000000000000000000
--- a/done.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-x 2015-12-04 make code py3 compatible                                @python3
-x 2015-12-23 decide on versioning system                                           +merge0
-x 2015-12-24 move get_grid coords to interpolator                    @optimization +v1.0
-x 2015-12-25 get rid of temporal interpolation                       @optimization +v1.0
-x 2015-12-26 call interpolation only when needed                     @optimization +v1.0
-x 2015-12-26 clean up tox files, make sure all tests run             @tests        +v1.0
-x 2016-01-03 check divfree function
-x 2016-01-03 compute kMeta(t) as well
-x 2016-01-03 split library into core and extra                                       @optimization +v1.0
-x 2016-01-07 FFTW interpolator doesn't need its own field                            @optimization +v1.0 +particle_api
-x 2016-01-08 simplify tracer/field addition mechanism                                @design +v1.0 +particle_api
-x 2016-01-08 add stat choice parameter to add_particles                              @design +v1.0 +particle_api
-x 2016-01-15 particle output is broken when niter_part != 1                          @bugfix
-x 2016-01-19 clean up machine_settings mess                                          @design @documentation +v2.0
-x 2016-01-24 clear delimitation of public API                                        @documentation +v1.0
-x 2016-01-24 document coordinate conventions                                         @documentation +v1.0
-x 2016-01-24 move parameters from _fluid_particle_base to NavierStokes etc           @design
-x 2016-01-29 install_info should be renamed to bfps_info in data file
-x 2016-02-01 tweak HDF5 settings                                                     @optimization @HDF5 +I/O
-x 2016-03-02 code overview                                                           @documentation
-x 2016-04-29 use HDF5 io for fields                                                  @design @HDF5 +I/O
diff --git a/examples/NS0SliceParticles.py b/examples/NS0SliceParticles.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c089405988a1c6eef6a1c7649e11c7a4a6edcaa
--- /dev/null
+++ b/examples/NS0SliceParticles.py
@@ -0,0 +1,126 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+import os
+import sys
+import bfps
+import numpy as np
+
+class NS0SliceParticles(bfps.NavierStokes):
+    """
+        Example of how bfps is envisioned to be used.
+        The standard NavierStokes class is inherited, and new functionality
+        is added on top.
+        In particular, this class will run a DNS with particles starting on
+        a square grid in the z=0 slice of the field.
+    """
+    standard_names = ['NS0SP',
+                      'NS0SP-single',
+                      'NS0SP-double']
+    def __init__(
+            self,
+            name = 'NS0SliceParticles-v' + bfps.__version__,
+            **kwargs):
+        bfps.NavierStokes.__init__(
+                self,
+                name = name,
+                **kwargs)
+        return None
+    def specific_parser_arguments(
+            self,
+            parser):
+        bfps.NavierStokes.specific_parser_arguments(self, parser)
+        parser.add_argument(
+                '--pcloudX',
+                type = float,
+                dest = 'pcloudX',
+                default = 0.0)
+        parser.add_argument(
+                '--pcloudY',
+                type = float,
+                dest = 'pcloudY',
+                default = 0.0)
+        return None
+    def launch_jobs(
+            self,
+            opt = None):
+        if not os.path.exists(os.path.join(self.work_dir, self.simname + '.h5')):
+            particle_initial_condition = None
+            if self.parameters['nparticles'] > 0:
+                # the extra dimension of 1 is because I want
+                # a single chunk of particles.
+                particle_initial_condition = np.zeros(
+                        (1,
+                         self.parameters['nparticles'],
+                         self.parameters['nparticles'],
+                         3),
+                        dtype = np.float64)
+                xvals = (opt.pcloudX +
+                         np.linspace(-opt.particle_cloud_size/2,
+                                      opt.particle_cloud_size/2,
+                                      self.parameters['nparticles']))
+                yvals = (opt.pcloudY +
+                         np.linspace(-opt.particle_cloud_size/2,
+                                      opt.particle_cloud_size/2,
+                                      self.parameters['nparticles']))
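+                # broadcasting fills the N x N grid: x varies along the last
+                # particle axis, y along the middle one, and z stays at the
+                # slice value of 0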
+                particle_initial_condition[..., 0] = xvals[None, None, :]
+                particle_initial_condition[..., 1] = yvals[None, :, None]
+            self.write_par(
+                    particle_ic = particle_initial_condition)
+            if self.parameters['nparticles'] > 0:
+                data = self.generate_tracer_state(
+                        species = 0,
+                        rseed = opt.particle_rand_seed,
+                        data = particle_initial_condition)
+            init_condition_file = os.path.join(
+                    self.work_dir,
+                    self.simname + '_cvorticity_i{0:0>5x}'.format(0))
+            if not os.path.exists(init_condition_file):
+                if len(opt.src_simname) > 0:
+                    src_file = os.path.join(
+                            os.path.realpath(opt.src_work_dir),
+                            opt.src_simname + '_cvorticity_i{0:0>5x}'.format(opt.src_iteration))
+                    os.symlink(src_file, init_condition_file)
+                else:
+                    self.generate_vector_field(
+                            write_to_file = True,
+                            spectra_slope = 2.0,
+                            amplitude = 0.05)
+        self.run(
+                ncpu = opt.ncpu,
+                njobs = opt.njobs,
+                hours = opt.minutes // 60,
+                minutes = opt.minutes % 60)
+        return None
+
+def main():
+    c = NS0SliceParticles()
+    c.launch(args = sys.argv[1:])
+    return None
+
+if __name__ == '__main__':
+    main()
+
diff --git a/examples/NSBufferedParticles.py b/examples/NSBufferedParticles.py
new file mode 100644
index 0000000000000000000000000000000000000000..34906576d62e2b2cac68f2d6c261129b23d667b7
--- /dev/null
+++ b/examples/NSBufferedParticles.py
@@ -0,0 +1,51 @@
+import bfps
+import argparse
+import sys
+
+class NSBufferedParticles(bfps.NavierStokes):
+    """
+        Another example.
+        This class behaves identically to NavierStokes, except that it uses a
+        buffered interpolator and the corresponding distributed_particles class.
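+
+        A hypothetical invocation (all flags inherited from the standard
+        NavierStokes command line):
+
+            python NSBufferedParticles.py -n 64 --nparticles 100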
+    """
+    standard_names = ['NSBP',
+                      'NSBP-single',
+                      'NSBP-double']
+    def launch(
+            self,
+            args = [],
+            noparticles = False,
+            **kwargs):
+        self.name = 'NSBufferedParticles-v' + bfps.__version__
+        opt = self.prepare_launch(args = args)
+        self.fill_up_fluid_code()
+        if noparticles:
+            opt.nparticles = 0
+        elif type(opt.nparticles) == int:
+            if opt.nparticles > 0:
+                self.name += '-particles'
+                self.add_3D_rFFTW_field(
+                        name = 'rFFTW_acc')
+                self.add_interpolator(
+                        name = 'cubic_spline',
+                        neighbours = opt.neighbours,
+                        smoothness = opt.smoothness,
+                        class_name = 'interpolator')
+                self.add_particles(
+                        integration_steps = [4],
+                        interpolator = 'cubic_spline',
+                        acc_name = 'rFFTW_acc',
+                        class_name = 'distributed_particles')
+        self.finalize_code()
+        self.launch_jobs(opt = opt)
+        return None
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(prog = 'NSBufferedParticles')
+    parser.add_argument(
+            '-v', '--version',
+            action = 'version',
+            version = '%(prog)s ' + bfps.__version__)
+    c = NSBufferedParticles(fluid_precision = 'single')
+    c.launch(args = sys.argv[1:])
+
diff --git a/examples/NavierStokesDB.py b/examples/NavierStokesDB.py
new file mode 100644
index 0000000000000000000000000000000000000000..d099ad308e8fa47aea08275bc80694da796465b2
--- /dev/null
+++ b/examples/NavierStokesDB.py
@@ -0,0 +1,112 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+import os
+import h5py
+import bfps
+
+class NavierStokesDB(bfps.NavierStokes):
+    """
+        Example of how bfps is envisioned to be used.
+        Standard NavierStokes class is inherited, and then new functionality
+        added on top.
+        In particular, this class will generate an HDF5 file containing a 5D
+        array representing the time history of the velocity field.
+        Snapshots are saved every "niter_stat" iterations.
+
+        No effort was spent on optimizing the HDF5 file access, since the
+        code has only been used for a small DNS of 72^3 so far.
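+
+        A minimal sketch of reading the stored snapshots back (assuming the
+        default simname "test"; the dataset layout is the one created in
+        write_par below):
+
+            import h5py
+            df = h5py.File('test_field_database.h5', 'r')
+            # dataset shape is (number of snapshots, nz, ny, nx, 3)
+            snapshot = df['rvelocity'][5]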
+    """
+    standard_names = ['NSDB',
+                      'NSDB-single',
+                      'NSDB-double']
+    def __init__(
+            self,
+            name = 'NavierStokesDataBase-v' + bfps.__version__,
+            **kwargs):
+        bfps.NavierStokes.__init__(
+                self,
+                name = name,
+                **kwargs)
+        self.file_datasets_grow += """
+                {
+                    if (myrank == 0)
+                    {
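+                        // grow the database dataset so it can hold the
+                        // snapshots of the upcoming niter_todo iterations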
+                        hid_t database_file;
+                        char dbfname[256];
+                        sprintf(dbfname, "%s_field_database.h5", simname);
+                        database_file = H5Fopen(dbfname, H5F_ACC_RDWR, H5P_DEFAULT);
+                        hid_t dset = H5Dopen(database_file, "rvelocity", H5P_DEFAULT);
+                        grow_single_dataset(dset, niter_todo/niter_stat);
+                        H5Dclose(dset);
+                        H5Fclose(database_file);
+                    }
+                }
+                """
+        self.stat_src += """
+                {
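+                    // compute velocity from the spectral vorticity, copy it,
+                    // transform to real space and append it to the database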
+                    fs->compute_velocity(fs->cvorticity);
+                    *tmp_vec_field = fs->cvelocity;
+                    tmp_vec_field->ift();
+                    char dbfname[256];
+                    sprintf(dbfname, "%s_field_database.h5", simname);
+                    tmp_vec_field->io(dbfname, "rvelocity", fs->iteration / niter_stat, false);
+                }
+                """
+        return None
+    def get_database_file_name(self):
+        return os.path.join(self.work_dir, self.simname + '_field_database.h5')
+    def get_database_file(self):
+        return h5py.File(self.get_database_file_name(), 'r')
+    def write_par(
+            self,
+            iter0 = 0,
+            **kwargs):
+        bfps.NavierStokes.write_par(
+                self,
+                iter0 = iter0,
+                **kwargs)
+        with h5py.File(self.get_database_file_name(), 'a') as ofile:
+            ofile.create_dataset(
+                    'rvelocity',
+                    (1,
+                     self.parameters['nz'],
+                     self.parameters['ny'],
+                     self.parameters['nx'],
+                     3),
+                    chunks = (1,
+                              self.parameters['nz'],
+                              self.parameters['ny'],
+                              self.parameters['nx'],
+                              3),
+                    maxshape = (None,
+                                self.parameters['nz'],
+                                self.parameters['ny'],
+                                self.parameters['nx'],
+                                3),
+                    dtype = self.rtype)
+        return None
+
diff --git a/machine_settings_py.py b/machine_settings_py.py
index 22123e391aa14151e2f1d4b4c8c0b5c8d6a1c435..787f1d5a10b9b0b260b42a1da18d35e67c56dacc 100644
--- a/machine_settings_py.py
+++ b/machine_settings_py.py
@@ -37,6 +37,7 @@ import os
 
 hostname = os.getenv('HOSTNAME')
 
+compiler = 'g++'
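+# compiler is used by setup.py's compile_library command; depending on the
+# machine, an MPI compiler wrapper (e.g. mpicxx) may be needed instead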
 extra_compile_args = ['-Wall', '-O2', '-g', '-mtune=native', '-ffast-math', '-std=c++11']
 extra_libraries = ['hdf5']
 include_dirs = []
diff --git a/setup.py b/setup.py
index c9bbc9c1d956d4d74d6344e19d1d220b1ff12b0b..e1d85b38a95a4a47186e74c44a1e4aeb52098da2 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@ if not os.path.exists(os.path.join(bfpsfolder, 'host_information.py')):
     shutil.copyfile('./machine_settings_py.py', os.path.join(bfpsfolder, 'machine_settings.py'))
 sys.path.insert(0, bfpsfolder)
 # import stuff required for compilation of static library
-from machine_settings import include_dirs, library_dirs, extra_compile_args, extra_libraries
+from machine_settings import compiler, include_dirs, library_dirs, extra_compile_args, extra_libraries
 
 
 ### package versioning
@@ -88,7 +88,10 @@ print('This is bfps version ' + VERSION)
 
 
 ### lists of files and MANIFEST.in
-src_file_list = ['field',
+src_file_list = ['vorticity_equation',
+                 'field',
+                 'kspace',
+                 'field_layout',
                  'field_descriptor',
                  'rFFTW_distributed_particles',
                  'distributed_particles',
@@ -107,11 +110,34 @@ src_file_list = ['field',
                  'spline_n4',
                  'spline_n5',
                  'spline_n6',
-                 'Lagrange_polys']
+                 'Lagrange_polys',
+                 'scope_timer']
+
+particle_headers = [
+        'cpp/particles/abstract_particles_distr.hpp',
+        'cpp/particles/abstract_particles_input.hpp',
+        'cpp/particles/abstract_particles_output.hpp',
+        'cpp/particles/abstract_particles_system.hpp',
+        'cpp/particles/alltoall_exchanger.hpp',
+        'cpp/particles/field_accessor.hpp',
+        'cpp/particles/particles_adams_bashforth.hpp',
+        'cpp/particles/particles_field_computer.hpp',
+        'cpp/particles/particles_input_hdf5.hpp',
+        'cpp/particles/particles_interp_spline.hpp',
+        'cpp/particles/particles_output_hdf5.hpp',
+        'cpp/particles/particles_output_mpiio.hpp',
+        'cpp/particles/particles_system_builder.hpp',
+        'cpp/particles/particles_system.hpp',
+        'cpp/particles/particles_utils.hpp']
 
 header_list = (['cpp/base.hpp'] +
+               ['cpp/fftw_interface.hpp'] +
+               ['cpp/bfps_timer.hpp'] +
+               ['cpp/omputils.hpp'] +
+               ['cpp/shared_array.hpp'] +
                ['cpp/' + fname + '.hpp'
-                for fname in src_file_list])
+                for fname in src_file_list] +
+               particle_headers)
 
 with open('MANIFEST.in', 'w') as manifest_in_file:
     for fname in (['bfps/cpp/' + ff + '.cpp' for ff in src_file_list] +
@@ -121,77 +147,86 @@ with open('MANIFEST.in', 'w') as manifest_in_file:
 
 
 ### libraries
-libraries = ['fftw3_mpi',
-             'fftw3',
-             'fftw3f_mpi',
-             'fftw3f']
-libraries += extra_libraries
-
-
-
-### save compiling information
-pickle.dump(
-        {'include_dirs' : include_dirs,
-         'library_dirs' : library_dirs,
-         'extra_compile_args' : extra_compile_args,
-         'libraries' : libraries,
-         'install_date' : now,
-         'VERSION' : VERSION,
-         'git_revision' : git_revision},
-        open('bfps/install_info.pickle', 'wb'),
-        protocol = 2)
-
-
-
-def compile_bfps_library():
-    if not os.path.isdir('obj'):
-        os.makedirs('obj')
-        need_to_compile = True
-    else:
-        ofile = 'bfps/libbfps.a'
-        libtime = datetime.datetime.fromtimestamp(os.path.getctime(ofile))
-        latest = libtime
-        for fname in header_list:
-            latest = max(latest,
-                         datetime.datetime.fromtimestamp(os.path.getctime('bfps/' + fname)))
-        need_to_compile = (latest > libtime)
-    for fname in src_file_list:
-        ifile = 'bfps/cpp/' + fname + '.cpp'
-        ofile = 'obj/' + fname + '.o'
-        if not os.path.exists(ofile):
-            need_to_compile_file = True
-        else:
-            need_to_compile_file = (need_to_compile or
-                                    (datetime.datetime.fromtimestamp(os.path.getctime(ofile)) <
-                                     datetime.datetime.fromtimestamp(os.path.getctime(ifile))))
-        if need_to_compile_file:
-            command_strings = ['g++', '-c']
-            command_strings += ['bfps/cpp/' + fname + '.cpp']
-            command_strings += ['-o', 'obj/' + fname + '.o']
-            command_strings += extra_compile_args
-            command_strings += ['-I' + idir for idir in include_dirs]
-            command_strings.append('-Ibfps/cpp/')
-            print(' '.join(command_strings))
-            assert(subprocess.call(command_strings) == 0)
-    command_strings = ['ar', 'rvs', 'bfps/libbfps.a']
-    command_strings += ['obj/' + fname + '.o' for fname in src_file_list]
-    print(' '.join(command_strings))
-    assert(subprocess.call(command_strings) == 0)
-    return None
-
-from distutils.command.build import build as DistutilsBuild
-from distutils.command.install import install as DistutilsInstall
-
-class CustomBuild(DistutilsBuild):
+libraries = extra_libraries
+
+
+import distutils.cmd
+
+class CompileLibCommand(distutils.cmd.Command):
+    description = 'Compile bfps library.'
+    user_options = [
+            ('timing-output=', None, 'Toggle timing output.'),
+            ('fftw-estimate=', None, 'Use FFTW ESTIMATE.'),
+            ('disable-fftw-omp=', None, 'Turn off FFTW OpenMP.'),
+            ]
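+    # options are parsed as 0/1 integers, e.g.:
+    #     python setup.py compile_library --timing-output=1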
+    def initialize_options(self):
+        self.timing_output = 0
+        self.fftw_estimate = 0
+        self.disable_fftw_omp = 0
+        return None
+    def finalize_options(self):
+        self.timing_output = (int(self.timing_output) == 1)
+        self.fftw_estimate = (int(self.fftw_estimate) == 1)
+        self.disable_fftw_omp = (int(self.disable_fftw_omp) == 1)
+        return None
     def run(self):
-        compile_bfps_library()
-        DistutilsBuild.run(self)
-
-# this custom install leads to a broken installation. no idea why...
-class CustomInstall(DistutilsInstall):
-    def run(self):
-        compile_bfps_library()
-        DistutilsInstall.run(self)
+        need_to_compile = False
+        if not os.path.isdir('obj'):
+            os.makedirs('obj')
+            need_to_compile = True
+        if not os.path.isfile('bfps/libbfps.a'):
+            need_to_compile = True
+        else:
+            ofile = 'bfps/libbfps.a'
+            libtime = datetime.datetime.fromtimestamp(os.path.getctime(ofile))
+            latest = libtime
+            for fname in header_list:
+                latest = max(latest,
+                             datetime.datetime.fromtimestamp(os.path.getctime('bfps/' + fname)))
+            # recompile everything if any header is newer than the library
+            need_to_compile = need_to_compile or (latest > libtime)
+        # copy, so that repeated runs do not append flags to the imported list
+        eca = list(extra_compile_args)
+        eca += ['-fPIC']
+        if self.timing_output:
+            eca += ['-DUSE_TIMINGOUTPUT']
+        if self.fftw_estimate:
+            eca += ['-DUSE_FFTWESTIMATE']
+        if self.disable_fftw_omp:
+            eca += ['-DNO_FFTWOMP']
+        for fname in src_file_list:
+            ifile = 'bfps/cpp/' + fname + '.cpp'
+            ofile = 'obj/' + fname + '.o'
+            if not os.path.exists(ofile):
+                need_to_compile_file = True
+            else:
+                need_to_compile_file = (need_to_compile or
+                                        (datetime.datetime.fromtimestamp(os.path.getctime(ofile)) <
+                                         datetime.datetime.fromtimestamp(os.path.getctime(ifile))))
+            if need_to_compile_file:
+                command_strings = [compiler, '-c']
+                command_strings += ['bfps/cpp/' + fname + '.cpp']
+                command_strings += ['-o', 'obj/' + fname + '.o']
+                command_strings += eca
+                command_strings += ['-I' + idir for idir in include_dirs]
+                command_strings.append('-Ibfps/cpp/')
+                print(' '.join(command_strings))
+                subprocess.check_call(command_strings)
+        command_strings = ['ar', 'rvs', 'bfps/libbfps.a']
+        command_strings += ['obj/' + fname + '.o' for fname in src_file_list]
+        print(' '.join(command_strings))
+        subprocess.check_call(command_strings)
+
+        ### save compiling information
+        pickle.dump(
+                {'include_dirs' : include_dirs,
+                 'library_dirs' : library_dirs,
+                 'compiler'     : compiler,
+                 'extra_compile_args' : eca,
+                 'libraries' : libraries,
+                 'install_date' : now,
+                 'VERSION' : VERSION,
+                 'git_revision' : git_revision},
+                open('bfps/install_info.pickle', 'wb'),
+                protocol = 2)
+        return None
 
 from setuptools import setup
 
@@ -199,7 +234,7 @@ setup(
         name = 'bfps',
         packages = ['bfps'],
         install_requires = ['numpy>=1.8', 'h5py>=2.2.1'],
-        cmdclass={'build' : CustomBuild},
+        cmdclass={'compile_library' : CompileLibCommand},
         package_data = {'bfps': header_list + ['libbfps.a',
                                                'install_info.pickle']},
         entry_points = {
diff --git a/tests/test_field_class.py b/tests/test_field_class.py
index fc52f419a5ab2dd7a5231676c41b9d586d497080..110d9be685ef42d4ed231a3a3c723ac34e3d916d 100644
--- a/tests/test_field_class.py
+++ b/tests/test_field_class.py
@@ -32,32 +32,37 @@ class TestField(_fluid_particle_base):
         self.fluid_includes += '#include "fftw_tools.hpp"\n'
         self.fluid_includes += '#include "field.hpp"\n'
         self.fluid_variables += ('field<' + self.C_dtype + ', FFTW, ONE> *f;\n' +
+                                 'field<' + self.C_dtype + ', FFTW, THREE> *v;\n' +
                                  'kspace<FFTW, SMOOTH> *kk;\n')
         self.fluid_start += """
                 //begincpp
                 f = new field<{0}, FFTW, ONE>(
                         nx, ny, nz, MPI_COMM_WORLD);
+                v = new field<{0}, FFTW, THREE>(
+                        nx, ny, nz, MPI_COMM_WORLD);
                 kk = new kspace<FFTW, SMOOTH>(
                         f->clayout, 1., 1., 1.);
                 // read rdata
-                f->io("field.h5", "rdata", 0, true);
+                f->real_space_representation = true;
+                f->io("field.h5", "scal", 0, true);
                 // go to fourier space, write into cdata_tmp
                 f->dft();
-                f->io("field.h5", "cdata_tmp", 0, false);
+                f->io("field.h5", "scal_tmp", 0, false);
                 f->ift();
-                f->io("field.h5", "rdata", 0, false);
-                f->io("field.h5", "cdata", 0, true);
+                f->io("field.h5", "scal", 0, false);
+                f->real_space_representation = false;
+                f->io("field.h5", "scal", 0, true);
                 hid_t gg;
                 if (f->myrank == 0)
                     gg = H5Fopen("field.h5", H5F_ACC_RDWR, H5P_DEFAULT);
                 kk->cospectrum<float, ONE>(
-                        f->get_rdata(),
-                        f->get_rdata(),
+                        f->get_cdata(),
+                        f->get_cdata(),
                         gg,
                         "scal",
                         0);
                 f->ift();
-                f->io("field.h5", "rdata_tmp", 0, false);
+                f->io("field.h5", "scal_tmp", 0, false);
                 std::vector<double> me;
                 me.resize(1);
                 me[0] = 30;
@@ -66,11 +71,15 @@ class TestField(_fluid_particle_base):
                         0, me);
                 if (f->myrank == 0)
                     H5Fclose(gg);
+                v->real_space_representation = false;
+                v->io("field.h5", "vec", 0, true);
+                v->io("field.h5", "vec_tmp", 0, false);
                 //endcpp
                 """.format(self.C_dtype)
         self.fluid_end += """
                 //begincpp
                 delete f;
+                delete v;
                 //endcpp
                 """
         return None
@@ -92,7 +101,7 @@ class TestField(_fluid_particle_base):
         return None
 
 def main():
-    n = 128
+    n = 32
     kdata = pyfftw.n_byte_align_empty(
             (n, n, n//2 + 1),
             pyfftw.simd_alignment,
@@ -116,10 +125,10 @@ def main():
     tf.parameters['ny'] = n
     tf.parameters['nz'] = n
     f = h5py.File('field.h5', 'w')
-    f['cdata'] = cdata.reshape((1,) + cdata.shape)
-    f['cdata_tmp'] = np.zeros(shape=(1,) + cdata.shape).astype(cdata.dtype)
-    f['rdata'] = rdata.reshape((1,) + rdata.shape)
-    f['rdata_tmp'] = np.zeros(shape=(1,) + rdata.shape).astype(rdata.dtype)
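+    # the field class now expects datasets named
+    # "<field>/<real|complex>/<iteration>" (see the io calls above)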
+    f['scal/complex/0'] = cdata
+    f['scal/real/0'] = rdata
+    f['vec/complex/0'] = np.array([cdata, cdata, cdata]).reshape(cdata.shape + (3,))
+    f['vec/real/0'] = np.array([rdata, rdata, rdata]).reshape(rdata.shape + (3,))
     f['moments/scal'] = np.zeros(shape = (1, 10)).astype(np.float)
     f['histograms/scal'] = np.zeros(shape = (1, 64)).astype(np.float)
     kspace = tf.get_kspace()
@@ -133,35 +142,60 @@ def main():
              '--ncpu', '2'])
 
     f = h5py.File('field.h5', 'r')
-    err0 = np.max(np.abs(f['rdata_tmp'][0] - rdata)) / np.mean(np.abs(rdata))
-    err1 = np.max(np.abs(f['rdata'][0]/(n**3) - rdata)) / np.mean(np.abs(rdata))
-    err2 = np.max(np.abs(f['cdata_tmp'][0]/(n**3) - cdata)) / np.mean(np.abs(cdata))
-    print(err0, err1, err2)
-    assert(err0 < 1e-5)
-    assert(err1 < 1e-5)
-    assert(err2 < 1e-4)
-    ### compare
-    #fig = plt.figure(figsize=(12, 6))
-    #a = fig.add_subplot(121)
-    #a.set_axis_off()
-    #a.imshow(rdata[0, :, :], interpolation = 'none')
-    #a = fig.add_subplot(122)
-    #a.set_axis_off()
-    #a.imshow(f['rdata_tmp'][0, 0, :, :], interpolation = 'none')
+    #err0 = np.max(np.abs(f['scal_tmp/real/0'].value - rdata)) / np.mean(np.abs(rdata))
+    #err1 = np.max(np.abs(f['scal/real/0'].value/(n**3) - rdata)) / np.mean(np.abs(rdata))
+    #err2 = np.max(np.abs(f['scal_tmp/complex/0'].value/(n**3) - cdata)) / np.mean(np.abs(cdata))
+    #print(err0, err1, err2)
+    #assert(err0 < 1e-5)
+    #assert(err1 < 1e-5)
+    #assert(err2 < 1e-4)
+    ## compare
+    fig = plt.figure(figsize=(18, 6))
+    a = fig.add_subplot(131)
+    a.set_axis_off()
+    v0 = f['vec/complex/0'][:, :, 0, 0]
+    v1 = f['vec_tmp/complex/0'][:, :, 0, 0]
+    a.imshow(np.log(np.abs(v0 - v1)),
+             interpolation = 'none')
+    a = fig.add_subplot(132)
+    a.set_axis_off()
+    a.imshow(np.log(np.abs(v0)),
+             interpolation = 'none')
+    a = fig.add_subplot(133)
+    a.set_axis_off()
+    a.imshow(np.log(np.abs(v1)),
+             interpolation = 'none')
+    fig.tight_layout()
+    fig.savefig('tst_fields.pdf')
+    fig = plt.figure(figsize=(18, 6))
+    a = fig.add_subplot(131)
+    a.set_axis_off()
+    v0 = f['scal/complex/0'][:, :, 0]
+    v1 = f['scal_tmp/complex/0'][:, :, 0]
+    a.imshow(np.log(np.abs(v0 - v1)),
+             interpolation = 'none')
+    a = fig.add_subplot(132)
+    a.set_axis_off()
+    a.imshow(np.log(np.abs(v0)),
+             interpolation = 'none')
+    a = fig.add_subplot(133)
+    a.set_axis_off()
+    a.imshow(np.log(np.abs(v1)),
+             interpolation = 'none')
+    fig.tight_layout()
+    fig.savefig('tst_sfields.pdf')
+    # look at moments and histogram
+    #print('moments are ', f['moments/scal'][0])
+    #fig = plt.figure(figsize=(6,6))
+    #a = fig.add_subplot(211)
+    #a.plot(f['histograms/scal'][0])
+    #a.set_yscale('log')
+    #a = fig.add_subplot(212)
+    #a.plot(f['spectra/scal'][0])
+    #a.set_xscale('log')
+    #a.set_yscale('log')
     #fig.tight_layout()
     #fig.savefig('tst.pdf')
-    # look at moments and histogram
-    print('moments are ', f['moments/scal'][0])
-    fig = plt.figure(figsize=(6,6))
-    a = fig.add_subplot(211)
-    a.plot(f['histograms/scal'][0])
-    a.set_yscale('log')
-    a = fig.add_subplot(212)
-    a.plot(f['spectra/scal'][0])
-    a.set_xscale('log')
-    a.set_yscale('log')
-    fig.tight_layout()
-    fig.savefig('tst.pdf')
     return None
 
 if __name__ == '__main__':
diff --git a/tests/test_io.py b/tests/test_io.py
index ce825c808785266c5199149aac5a4ab481ffedc2..624d357b0950eb8c3ae18c1f4a9ae7f47f45b0f8 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -54,6 +54,6 @@ if __name__ == '__main__':
     c = test_io(work_dir = opt.work_dir + '/io')
     c.write_src()
     c.write_par()
-    c.set_host_info({'type' : 'pc'})
-    c.run(ncpu = opt.ncpu)
+    c.set_host_info(bfps.host_info)
+    c.run(opt.ncpu, 1)
 
diff --git a/tests/test_io_00.py b/tests/test_io_00.py
new file mode 100644
index 0000000000000000000000000000000000000000..f558cb8c6fc87be0518a7f63b4fadb0f06acd293
--- /dev/null
+++ b/tests/test_io_00.py
@@ -0,0 +1,37 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+from test_io import *
+
+if __name__ == '__main__':
+    opt = parser.parse_args(
+            ['-n', '32',
+             '--ncpu', '2'] +
+            sys.argv[1:])
+    print('about to create test_io object')
+    c = test_io(work_dir = opt.work_dir + '/io')
+    print('congratulations, test_io object was created')
+
diff --git a/tests/test_io_01_write.py b/tests/test_io_01_write.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3876da168d55cc3c44b86f08fde653b61aa4301
--- /dev/null
+++ b/tests/test_io_01_write.py
@@ -0,0 +1,37 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+from test_io import *
+
+if __name__ == '__main__':
+    opt = parser.parse_args(
+            ['-n', '32',
+             '--ncpu', '2'] +
+            sys.argv[1:])
+    c = test_io(work_dir = opt.work_dir + '/io')
+    c.write_src()
+    c.write_par()
+
diff --git a/tests/test_io_02_compile.py b/tests/test_io_02_compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..5db5cba3520a5c9b28015d5099e4afb7ecd9ebf3
--- /dev/null
+++ b/tests/test_io_02_compile.py
@@ -0,0 +1,39 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+from test_io import *
+
+if __name__ == '__main__':
+    opt = parser.parse_args(
+            ['-n', '32',
+             '--ncpu', '2'] +
+            sys.argv[1:])
+    c = test_io(work_dir = opt.work_dir + '/io')
+    c.write_src()
+    c.write_par()
+    c.set_host_info(bfps.host_info)
+    c.compile_code()
+
diff --git a/tests/test_io_03_run.py b/tests/test_io_03_run.py
new file mode 100644
index 0000000000000000000000000000000000000000..a789ac66fd99d8e5525ce69b1e861f609d969212
--- /dev/null
+++ b/tests/test_io_03_run.py
@@ -0,0 +1,39 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+from test_io import *
+
+if __name__ == '__main__':
+    opt = parser.parse_args(
+            ['-n', '32',
+             '--ncpu', '2'] +
+            sys.argv[1:])
+    c = test_io(work_dir = opt.work_dir + '/io')
+    c.write_src()
+    c.write_par()
+    c.set_host_info(bfps.host_info)
+    c.run()
+
diff --git a/tests/test_vorticity_equation.py b/tests/test_vorticity_equation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec50531df29e82c1ff767ab3d292bef0aac66c4c
--- /dev/null
+++ b/tests/test_vorticity_equation.py
@@ -0,0 +1,101 @@
+#######################################################################
+#                                                                     #
+#  Copyright 2015 Max Planck Institute                                #
+#                 for Dynamics and Self-Organization                  #
+#                                                                     #
+#  This file is part of bfps.                                         #
+#                                                                     #
+#  bfps is free software: you can redistribute it and/or modify       #
+#  it under the terms of the GNU General Public License as published  #
+#  by the Free Software Foundation, either version 3 of the License,  #
+#  or (at your option) any later version.                             #
+#                                                                     #
+#  bfps is distributed in the hope that it will be useful,            #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of     #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      #
+#  GNU General Public License for more details.                       #
+#                                                                     #
+#  You should have received a copy of the GNU General Public License  #
+#  along with bfps.  If not, see <http://www.gnu.org/licenses/>       #
+#                                                                     #
+# Contact: Cristian.Lalescu@ds.mpg.de                                 #
+#                                                                     #
+#######################################################################
+
+
+
+import sys
+import os
+import numpy as np
+import h5py
+import argparse
+
+import bfps
+import bfps.tools
+
+from bfps_addons import NSReader
+import matplotlib.pyplot as plt
+
+def main():
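+    # run a reference DNS with the standard NavierStokes (fluid_solver) class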
+    c = bfps.NavierStokes()
+    c.launch(
+            ['-n', '72',
+             '--simname', 'fluid_solver',
+             '--ncpu', '4',
+             '--niter_todo', '256',
+             '--niter_out', '256',
+             '--niter_stat', '1',
+             '--wd', './'] +
+            sys.argv[1:])
+    data = c.read_cfield(iteration = 0)
+    f = h5py.File('vorticity_equation_cvorticity_i00000.h5', 'w')
+    f['vorticity/complex/0'] = data
+    f.close()
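+    # feed the same initial vorticity field to the new NSVorticityEquation
+    # solver, with identical parameters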
+    c = bfps.NSVorticityEquation()
+    c.launch(
+            ['-n', '72',
+             '--simname', 'vorticity_equation',
+             '--ncpu', '4',
+             '--niter_todo', '256',
+             '--niter_out', '256',
+             '--niter_stat', '1',
+             '--wd', './'] +
+            sys.argv[1:])
+    c0 = NSReader(simname = 'fluid_solver')
+    c1 = NSReader(simname = 'vorticity_equation')
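+    # compare the statistics of the two runs: the vorticity/velocity moments
+    # and the enstrophy spectra should agree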
+    df0 = c0.get_data_file()
+    df1 = c1.get_data_file()
+    f = plt.figure(figsize=(6,10))
+    a = f.add_subplot(211)
+    a.plot(df0['statistics/moments/vorticity'][:, 2, 3],
+           color = 'blue',
+           marker = '.')
+    a.plot(df1['statistics/moments/vorticity'][:, 2, 3],
+           color = 'red',
+           marker = '.')
+    a = f.add_subplot(212)
+    a.plot(df0['statistics/moments/velocity'][:, 2, 3],
+           color = 'blue',
+           marker = '.')
+    a.plot(df1['statistics/moments/velocity'][:, 2, 3],
+           color = 'red',
+           marker = '.')
+    f.tight_layout()
+    f.savefig('figs/moments.pdf')
+    f = plt.figure(figsize = (6, 10))
+    a = f.add_subplot(111)
+    a.plot(c0.statistics['enstrophy(t, k)'][0])
+    a.plot(c1.statistics['enstrophy(t, k)'][0])
+    a.set_yscale('log')
+    f.tight_layout()
+    f.savefig('figs/spectra.pdf')
+    f = h5py.File('vorticity_equation_cvorticity_i00000.h5', 'r')
+    #print(c0.statistics['enstrophy(t, k)'][0])
+    #print(c1.statistics['enstrophy(t, k)'][0])
+    c0.do_plots()
+    c1.do_plots()
+    return None
+
+if __name__ == '__main__':
+    main()
+
diff --git a/todo.txt b/todo.txt
deleted file mode 100644
index 0b5cafdefaf49739269dd49c19b14ffcd680b86f..0000000000000000000000000000000000000000
--- a/todo.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-(B) compute z polynomials only when needed                                  @optimization
-(B) use argparse subcommands instead of required argument                   @design
-(B) read https://www.xsede.org/documents/271087/369161/ExtScale-Koziol.pdf  @optimization @HDF5 +I/O
-(B) set up mechanism for adding in new PDEs                                 @design +v2.0 +alternate_algorithms
-(B) use less memory                                                         @optimization
-(B) move stat I/O to cpp lib                                                @design @HDF5
-(C) test involving hydrodynamic similarity                                  @tests
-(C) tests should use launch instead of get_parser                           @design @tests
-(D) executable should be compiled in a tmp folder
-(D) generalize interpolation comparison test                                @tests
-(D) generate separate lib(s) with extra classes                             @tests +alternate_algorithms
-(D) test anisotropic grids                                                  @tests
-(D) test non-cubic domains                                                  @tests
-(D) tests should not overwrite other tests (tox_full)                       @tests
-(E) add u-equation algorithm for testing purposes                           @tests +alternate_algorithms
-(E) pure python DNS addon: pros and cons                                    @tests +alternate_algorithms
-(F) add switch to turn off simulation