diff --git a/bfps/cpp/base.hpp b/bfps/cpp/base.hpp
index ee2d74d5b751451e9bb34600a0e2b09891a73d1f..6ed24ab7781031f1a750e97183f3ac2b8e76ffc6 100644
--- a/bfps/cpp/base.hpp
+++ b/bfps/cpp/base.hpp
@@ -42,6 +42,9 @@ inline int MOD(int a, int n)
     return ((a%n) + n) % n;
 }
 
+/////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
+
 #ifdef OMPI_MPI_H
 
 #define BFPS_MPICXX_DOUBLE_COMPLEX MPI_DOUBLE_COMPLEX
@@ -52,6 +55,37 @@ inline int MOD(int a, int n)
 
 #endif//OMPI_MPI_H
 
+template <class realtype>
+class mpi_real_type;
+
+template <>
+class mpi_real_type<float>
+{
+public:
+    static constexpr MPI_Datatype real(){
+        return MPI_FLOAT;
+    }
+
+    static constexpr MPI_Datatype complex(){
+        return MPI_COMPLEX;
+    }
+};
+
+template <>
+class mpi_real_type<double>
+{
+public:
+    static constexpr MPI_Datatype real(){
+        return MPI_DOUBLE;
+    }
+
+    static constexpr MPI_Datatype complex(){
+        return BFPS_MPICXX_DOUBLE_COMPLEX;
+    }
+};
+
+/////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
 
 #ifndef NDEBUG
 
diff --git a/bfps/cpp/fftw_interface.hpp b/bfps/cpp/fftw_interface.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec318b3b4b2f1c2888b4a0c07a3bfd50892026ec
--- /dev/null
+++ b/bfps/cpp/fftw_interface.hpp
@@ -0,0 +1,163 @@
+/**********************************************************************
+*                                                                     *
+*  Copyright 2015 Max Planck Institute                                *
+*                 for Dynamics and Self-Organization                  *
+*                                                                     *
+*  This file is part of bfps.                                         *
+*                                                                     *
+*  bfps is free software: you can redistribute it and/or modify       *
+*  it under the terms of the GNU General Public License as published  *
+*  by the Free Software Foundation, either version 3 of the License,  *
+*  or (at your option) any later version.                             *
+*                                                                     *
+*  bfps is distributed in the hope that it will be useful,            *
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of     *
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the      *
+*  GNU General Public License for more details.                       *
+*                                                                     *
+*  You should have received a copy of the GNU General Public License  *
+*  along with bfps.  If not, see <http://www.gnu.org/licenses/>       *
+*                                                                     *
+* Contact: Cristian.Lalescu@ds.mpg.de                                 *
+*                                                                     *
+**********************************************************************/
+
+#ifndef FFTW_INTERFACE_HPP
+#define FFTW_INTERFACE_HPP
+
+#include <fftw3-mpi.h>
+
+template <class realtype>
+class fftw_interface;
+
+template <>
+class fftw_interface<float>
+{
+public:
+    using real = float;
+    using complex = fftwf_complex;
+    using plan = fftwf_plan;
+    using iodim = fftwf_iodim;
+
+    static complex* alloc_complex(const size_t in_size){
+        return fftwf_alloc_complex(in_size);
+    }
+
+    static real* alloc_real(const size_t in_size){
+        return fftwf_alloc_real(in_size);
+    }
+
+    static void free(void* ptr){
+        fftwf_free(ptr);
+    }
+
+    static void execute(plan in_plan){
+        fftwf_execute(in_plan);
+    }
+
+    static void destroy_plan(plan in_plan){
+        fftwf_destroy_plan(in_plan);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_transpose(Params ... params){
+        return fftwf_mpi_plan_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_transpose(Params ... params){
+        return fftwf_mpi_plan_many_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_r2r(Params ... params){
+        return fftwf_plan_guru_r2r(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_dft(Params ... params){
+        return fftwf_plan_guru_dft(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_c2r(Params ... params){
+        return fftwf_mpi_plan_many_dft_c2r(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_r2c(Params ... params){
+        return fftwf_mpi_plan_many_dft_r2c(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_dft_c2r_3d(Params ... params){
+        return fftwf_mpi_plan_dft_c2r_3d(params...);
+    }
+};
+
+template <>
+class fftw_interface<double>
+{
+public:
+    using real = double;
+    using complex = fftw_complex;
+    using plan = fftw_plan;
+    using iodim = fftw_iodim;
+
+    static complex* alloc_complex(const size_t in_size){
+        return fftw_alloc_complex(in_size);
+    }
+
+    static real* alloc_real(const size_t in_size){
+        return fftw_alloc_real(in_size);
+    }
+
+    static void free(void* ptr){
+        fftw_free(ptr);
+    }
+
+    static void execute(plan in_plan){
+        fftw_execute(in_plan);
+    }
+
+    static void destroy_plan(plan in_plan){
+        fftw_destroy_plan(in_plan);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_transpose(Params ... params){
+        return fftw_mpi_plan_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_transpose(Params ... params){
+        return fftw_mpi_plan_many_transpose(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_r2r(Params ... params){
+        return fftw_plan_guru_r2r(params...);
+    }
+
+    template <class ... Params>
+    static plan plan_guru_dft(Params ... params){
+        return fftw_plan_guru_dft(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_c2r(Params ... params){
+        return fftw_mpi_plan_many_dft_c2r(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_many_dft_r2c(Params ... params){
+        return fftw_mpi_plan_many_dft_r2c(params...);
+    }
+
+    template <class ... Params>
+    static plan mpi_plan_dft_c2r_3d(Params ... params){
+        return fftw_mpi_plan_dft_c2r_3d(params...);
+    }
+};
+
+#endif // FFTW_INTERFACE_HPP
diff --git a/bfps/cpp/fftw_tools.cpp b/bfps/cpp/fftw_tools.cpp
index f6eacbf1dfe2dfe31e603e9239c42d4639327d3d..61e03d292f81aed1fa4b2dfcab880fb7105b676e 100644
--- a/bfps/cpp/fftw_tools.cpp
+++ b/bfps/cpp/fftw_tools.cpp
@@ -27,6 +27,7 @@
 #include <iostream>
 #include "base.hpp"
 #include "fftw_tools.hpp"
+#include "fftw_interface.hpp"
 
 #define NDEBUG
 
@@ -51,150 +52,171 @@ int clip_zero_padding(
     return EXIT_SUCCESS;
 }
 
+template
+int clip_zero_padding<float>(
+        field_descriptor<float> *f,
+        float *a,
+        int howmany);
 
+template
+int clip_zero_padding<double>(
+        field_descriptor<double> *f,
+        double *a,
+        int howmany);
+
+
+
+template <class rnumber>
+int copy_complex_array(
+        field_descriptor<rnumber> *fi,
+        rnumber (*ai)[2],
+field_descriptor<rnumber> *fo,
+rnumber (*ao)[2],
+int howmany)
+{
+    DEBUG_MSG("entered copy_complex_array\n");
+    typename fftw_interface<rnumber>::complex *buffer;
+    buffer = fftw_interface<rnumber>::alloc_complex(fi->slice_size*howmany);
+
+    int min_fast_dim;
+    min_fast_dim =
+            (fi->sizes[2] > fo->sizes[2]) ?
+                fo->sizes[2] : fi->sizes[2];
 
-#define TOOLS_IMPLEMENTATION(FFTW, R, MPI_RNUM, MPI_CNUM) \
-template <> \
-int copy_complex_array<R>( \
-        field_descriptor<R> *fi, \
-        R (*ai)[2], \
-        field_descriptor<R> *fo, \
-        R (*ao)[2], \
-        int howmany) \
-{ \
-    DEBUG_MSG("entered copy_complex_array\n"); \
-    FFTW(complex) *buffer; \
-    buffer = FFTW(alloc_complex)(fi->slice_size*howmany); \
- \
-    int min_fast_dim; \
-    min_fast_dim = \
-        (fi->sizes[2] > fo->sizes[2]) ? \
-         fo->sizes[2] : fi->sizes[2]; \
- \
     /* clean up destination, in case we're padding with zeros
-       (even if only for one dimension) */ \
-    std::fill_n((R*)ao, fo->local_size*2, 0.0); \
- \
-    int64_t ii0, ii1; \
-    int64_t oi0, oi1; \
-    int64_t delta1, delta0; \
-    int irank, orank; \
-    delta0 = (fo->sizes[0] - fi->sizes[0]); \
-    delta1 = (fo->sizes[1] - fi->sizes[1]); \
-    for (ii0=0; ii0 < fi->sizes[0]; ii0++) \
-    { \
-        if (ii0 <= fi->sizes[0]/2) \
-        { \
-            oi0 = ii0; \
-            if (oi0 > fo->sizes[0]/2) \
-                continue; \
-        } \
-        else \
-        { \
-            oi0 = ii0 + delta0; \
-            if ((oi0 < 0) || ((fo->sizes[0] - oi0) >= fo->sizes[0]/2)) \
-                continue; \
-        } \
-        irank = fi->rank[ii0]; \
-        orank = fo->rank[oi0]; \
-        if ((irank == orank) && \
-            (irank == fi->myrank)) \
-        { \
-            std::copy( \
-                    (R*)(ai + (ii0 - fi->starts[0]    )*fi->slice_size), \
-                    (R*)(ai + (ii0 - fi->starts[0] + 1)*fi->slice_size), \
-                    (R*)buffer); \
-        } \
-        else \
-        { \
-            if (fi->myrank == irank) \
-            { \
-                MPI_Send( \
-                        (void*)(ai + (ii0-fi->starts[0])*fi->slice_size), \
-                        fi->slice_size, \
-                        MPI_CNUM, \
-                        orank, \
-                        ii0, \
-                        fi->comm); \
-            } \
-            if (fi->myrank == orank) \
-            { \
-                MPI_Recv( \
-                        (void*)(buffer), \
-                        fi->slice_size, \
-                        MPI_CNUM, \
-                        irank, \
-                        ii0, \
-                        fi->comm, \
-                        MPI_STATUS_IGNORE); \
-            } \
-        } \
-        if (fi->myrank == orank) \
-        { \
-            for (ii1 = 0; ii1 < fi->sizes[1]; ii1++) \
-            { \
-                if (ii1 <= fi->sizes[1]/2) \
-                { \
-                    oi1 = ii1; \
-                    if (oi1 > fo->sizes[1]/2) \
-                        continue; \
-                } \
-                else \
-                { \
-                    oi1 = ii1 + delta1; \
-                    if ((oi1 < 0) || ((fo->sizes[1] - oi1) >= fo->sizes[1]/2)) \
-                        continue; \
-                } \
-                std::copy( \
-                        (R*)(buffer + (ii1*fi->sizes[2]*howmany)), \
-                        (R*)(buffer + (ii1*fi->sizes[2] + min_fast_dim)*howmany), \
-                        (R*)(ao + \
-                                 ((oi0 - fo->starts[0])*fo->sizes[1] + \
-                                  oi1)*fo->sizes[2]*howmany)); \
-            } \
-        } \
-    } \
-    fftw_free(buffer); \
-    MPI_Barrier(fi->comm); \
- \
-    DEBUG_MSG("exiting copy_complex_array\n"); \
-    return EXIT_SUCCESS; \
-} \
- \
-template <> \
-int get_descriptors_3D<R>( \
-        int n0, int n1, int n2, \
-        field_descriptor<R> **fr, \
-        field_descriptor<R> **fc) \
-{ \
-    int ntmp[3]; \
-    ntmp[0] = n0; \
-    ntmp[1] = n1; \
-    ntmp[2] = n2; \
-    *fr = new field_descriptor<R>(3, ntmp, MPI_RNUM, MPI_COMM_WORLD); \
-    ntmp[0] = n0; \
-    ntmp[1] = n1; \
-    ntmp[2] = n2/2+1; \
-    *fc = new field_descriptor<R>(3, ntmp, MPI_CNUM, MPI_COMM_WORLD); \
-    return EXIT_SUCCESS; \
-} \
- \
-template \
-int clip_zero_padding<R>( \
-        field_descriptor<R> *f, \
-        R *a, \
-        int howmany); \
-
-
-
-TOOLS_IMPLEMENTATION(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-TOOLS_IMPLEMENTATION(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
+       (even if only for one dimension) */
+    std::fill_n((rnumber*)ao, fo->local_size*2, 0.0);
+
+    int64_t ii0, ii1;
+    int64_t oi0, oi1;
+    int64_t delta1, delta0;
+    int irank, orank;
+    delta0 = (fo->sizes[0] - fi->sizes[0]);
+    delta1 = (fo->sizes[1] - fi->sizes[1]);
+    for (ii0=0; ii0 < fi->sizes[0]; ii0++)
+    {
+        if (ii0 <= fi->sizes[0]/2)
+        {
+            oi0 = ii0;
+            if (oi0 > fo->sizes[0]/2)
+                continue;
+        }
+        else
+        {
+            oi0 = ii0 + delta0;
+            if ((oi0 < 0) || ((fo->sizes[0] - oi0) >= fo->sizes[0]/2))
+                continue;
+        }
+        irank = fi->rank[ii0];
+        orank = fo->rank[oi0];
+        if ((irank == orank) &&
+                (irank == fi->myrank))
+        {
+            std::copy(
+                        (rnumber*)(ai + (ii0 - fi->starts[0]    )*fi->slice_size),
+                    (rnumber*)(ai + (ii0 - fi->starts[0] + 1)*fi->slice_size),
+                    (rnumber*)buffer);
+        }
+        else
+        {
+            if (fi->myrank == irank)
+            {
+                MPI_Send(
+                            (void*)(ai + (ii0-fi->starts[0])*fi->slice_size),
+                        fi->slice_size,
+                        mpi_real_type<rnumber>::complex(),
+                        orank,
+                        ii0,
+                        fi->comm);
+            }
+            if (fi->myrank == orank)
+            {
+                MPI_Recv(
+                            (void*)(buffer),
+                            fi->slice_size,
+                            mpi_real_type<rnumber>::complex(),
+                            irank,
+                            ii0,
+                            fi->comm,
+                            MPI_STATUS_IGNORE);
+            }
+        }
+        if (fi->myrank == orank)
+        {
+            for (ii1 = 0; ii1 < fi->sizes[1]; ii1++)
+            {
+                if (ii1 <= fi->sizes[1]/2)
+                {
+                    oi1 = ii1;
+                    if (oi1 > fo->sizes[1]/2)
+                        continue;
+                }
+                else
+                {
+                    oi1 = ii1 + delta1;
+                    if ((oi1 < 0) || ((fo->sizes[1] - oi1) >= fo->sizes[1]/2))
+                        continue;
+                }
+                std::copy(
+                            (rnumber*)(buffer + (ii1*fi->sizes[2]*howmany)),
+                        (rnumber*)(buffer + (ii1*fi->sizes[2] + min_fast_dim)*howmany),
+                        (rnumber*)(ao +
+                                   ((oi0 - fo->starts[0])*fo->sizes[1] +
+                        oi1)*fo->sizes[2]*howmany));
+            }
+        }
+    }
+    fftw_interface<rnumber>::free(buffer);
+    MPI_Barrier(fi->comm);
+
+    DEBUG_MSG("exiting copy_complex_array\n");
+    return EXIT_SUCCESS;
+}
+
+template
+int copy_complex_array<float>(
+        field_descriptor<float> *fi,
+        float (*ai)[2],
+        field_descriptor<float> *fo,
+        float (*ao)[2],
+        int howmany);
+
+template
+int copy_complex_array<double>(
+        field_descriptor<double> *fi,
+        double (*ai)[2],
+        field_descriptor<double> *fo,
+        double (*ao)[2],
+        int howmany);
+
+
+template <class rnumber>
+int get_descriptors_3D(
+        int n0, int n1, int n2,
+        field_descriptor<rnumber> **fr,
+        field_descriptor<rnumber> **fc)
+{
+    int ntmp[3];
+    ntmp[0] = n0;
+    ntmp[1] = n1;
+    ntmp[2] = n2;
+    *fr = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), MPI_COMM_WORLD);
+    ntmp[0] = n0;
+    ntmp[1] = n1;
+    ntmp[2] = n2/2+1;
+    *fc = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::complex(), MPI_COMM_WORLD);
+    return EXIT_SUCCESS;
+}
+
+template
+int get_descriptors_3D<float>(
+        int n0, int n1, int n2,
+        field_descriptor<float> **fr,
+        field_descriptor<float> **fc);
+
+template
+int get_descriptors_3D<double>(
+        int n0, int n1, int n2,
+        field_descriptor<double> **fr,
+        field_descriptor<double> **fc);
 
diff --git a/bfps/cpp/field_descriptor.cpp b/bfps/cpp/field_descriptor.cpp
index b5025835903a37ea5384cb4102c716f527aabfe5..8e96ba64eb6fb8fd4e4c4adee31dee19c2dd3fd4 100644
--- a/bfps/cpp/field_descriptor.cpp
+++ b/bfps/cpp/field_descriptor.cpp
@@ -31,476 +31,461 @@
 #include <iostream>
 #include "base.hpp"
 #include "field_descriptor.hpp"
+#include "fftw_interface.hpp"
 
 
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
-#define CLASS_IMPLEMENTATION(FFTW, R, MPI_RNUM, MPI_CNUM) \
-    \
-template<> \
-field_descriptor<R>::field_descriptor( \
-        int ndims, \
-        int *n, \
-        MPI_Datatype element_type, \
-        MPI_Comm COMM_TO_USE) \
-{ \
-    DEBUG_MSG("entered field_descriptor::field_descriptor\n"); \
-    this->comm = COMM_TO_USE; \
-    MPI_Comm_rank(this->comm, &this->myrank); \
-    MPI_Comm_size(this->comm, &this->nprocs); \
-    this->ndims = ndims; \
-    this->sizes    = new int[ndims]; \
-    this->subsizes = new int[ndims]; \
-    this->starts   = new int[ndims]; \
-    int tsizes   [ndims]; \
-    int tsubsizes[ndims]; \
-    int tstarts  [ndims]; \
-    ptrdiff_t *nfftw = new ptrdiff_t[ndims]; \
-    ptrdiff_t local_n0, local_0_start; \
-    for (int i = 0; i < this->ndims; i++) \
-        nfftw[i] = n[i]; \
-    this->local_size = fftw_mpi_local_size_many( \
-            this->ndims, \
-            nfftw, \
-            1, \
-            FFTW_MPI_DEFAULT_BLOCK, \
-            this->comm, \
-            &local_n0, \
-            &local_0_start); \
-    this->sizes[0] = n[0]; \
-    this->subsizes[0] = (int)local_n0; \
-    this->starts[0] = (int)local_0_start; \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "first subsizes[0] = %d %d %d\n", \
-            this->subsizes[0], \
-            tsubsizes[0], \
-            (int)local_n0); \
-    tsizes[0] = n[0]; \
-    tsubsizes[0] = (int)local_n0; \
-    tstarts[0] = (int)local_0_start; \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "second subsizes[0] = %d %d %d\n", \
-            this->subsizes[0], \
-            tsubsizes[0], \
-            (int)local_n0); \
-    this->mpi_dtype = element_type; \
-    this->slice_size = 1; \
-    this->full_size = this->sizes[0]; \
-    for (int i = 1; i < this->ndims; i++) \
-    { \
-        this->sizes[i] = n[i]; \
-        this->subsizes[i] = n[i]; \
-        this->starts[i] = 0; \
-        this->slice_size *= this->subsizes[i]; \
-        this->full_size *= this->sizes[i]; \
-        tsizes[i] = this->sizes[i]; \
-        tsubsizes[i] = this->subsizes[i]; \
-        tstarts[i] = this->starts[i]; \
-    } \
-    tsizes[ndims-1] *= sizeof(R); \
-    tsubsizes[ndims-1] *= sizeof(R); \
-    tstarts[ndims-1] *= sizeof(R); \
-    if (this->mpi_dtype == MPI_CNUM) \
-    { \
-        tsizes[ndims-1] *= 2; \
-        tsubsizes[ndims-1] *= 2; \
-        tstarts[ndims-1] *= 2; \
-    } \
-    int local_zero_array[this->nprocs], zero_array[this->nprocs]; \
-    for (int i=0; i<this->nprocs; i++) \
-        local_zero_array[i] = 0; \
-    local_zero_array[this->myrank] = (this->subsizes[0] == 0) ? 1 : 0; \
-    MPI_Allreduce( \
-            local_zero_array, \
-            zero_array, \
-            this->nprocs, \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    int no_of_excluded_ranks = 0; \
-    for (int i = 0; i<this->nprocs; i++) \
-        no_of_excluded_ranks += zero_array[i]; \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "subsizes[0] = %d %d\n", \
-            this->subsizes[0], \
-            tsubsizes[0]); \
-    if (no_of_excluded_ranks == 0) \
-    { \
-        this->io_comm = this->comm; \
-        this->io_nprocs = this->nprocs; \
-        this->io_myrank = this->myrank; \
-    } \
-    else \
-    { \
-        int excluded_rank[no_of_excluded_ranks]; \
-        for (int i=0, j=0; i<this->nprocs; i++) \
-            if (zero_array[i]) \
-            { \
-                excluded_rank[j] = i; \
-                j++; \
-            } \
-        MPI_Group tgroup0, tgroup; \
-        MPI_Comm_group(this->comm, &tgroup0); \
-        MPI_Group_excl(tgroup0, no_of_excluded_ranks, excluded_rank, &tgroup); \
-        MPI_Comm_create(this->comm, tgroup, &this->io_comm); \
-        MPI_Group_free(&tgroup0); \
-        MPI_Group_free(&tgroup); \
-        if (this->subsizes[0] > 0) \
-        { \
-            MPI_Comm_rank(this->io_comm, &this->io_myrank); \
-            MPI_Comm_size(this->io_comm, &this->io_nprocs); \
-        } \
-        else \
-        { \
-            this->io_myrank = MPI_PROC_NULL; \
-            this->io_nprocs = -1; \
-        } \
-    } \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "inside field_descriptor constructor, about to call " \
-            "MPI_Type_create_subarray " \
-            "%d %d %d\n", \
-            this->sizes[0], \
-            this->subsizes[0], \
-            this->starts[0]); \
-    for (int i=0; i<this->ndims; i++) \
-    DEBUG_MSG_WAIT( \
-            this->comm, \
-            "tsizes " \
-            "%d %d %d\n", \
-            tsizes[i], \
-            tsubsizes[i], \
-            tstarts[i]); \
-    if (this->subsizes[0] > 0) \
-    { \
-        DEBUG_MSG("creating subarray\n"); \
-        MPI_Type_create_subarray( \
-                ndims, \
-                tsizes, \
-                tsubsizes, \
-                tstarts, \
-                MPI_ORDER_C, \
-                MPI_UNSIGNED_CHAR, \
-                &this->mpi_array_dtype); \
-        MPI_Type_commit(&this->mpi_array_dtype); \
-    } \
-    this->rank = new int[this->sizes[0]]; \
-    int *local_rank = new int[this->sizes[0]]; \
-    std::fill_n(local_rank, this->sizes[0], 0); \
-    for (int i = 0; i < this->sizes[0]; i++) \
-        if (i >= this->starts[0] && i < this->starts[0] + this->subsizes[0]) \
-            local_rank[i] = this->myrank; \
-    MPI_Allreduce( \
-            local_rank, \
-            this->rank, \
-            this->sizes[0], \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    delete[] local_rank; \
-    this->all_start0 = new int[this->nprocs]; \
-    int *local_start0 = new int[this->nprocs]; \
-    std::fill_n(local_start0, this->nprocs, 0); \
-    for (int i = 0; i < this->nprocs; i++) \
-        if (this->myrank == i) \
-            local_start0[i] = this->starts[0]; \
-    MPI_Allreduce( \
-            local_start0, \
-            this->all_start0, \
-            this->nprocs, \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    delete[] local_start0; \
-    this->all_size0  = new int[this->nprocs]; \
-    int *local_size0 = new int[this->nprocs]; \
-    std::fill_n(local_size0, this->nprocs, 0); \
-    for (int i = 0; i < this->nprocs; i++) \
-        if (this->myrank == i) \
-            local_size0[i] = this->subsizes[0]; \
-    MPI_Allreduce( \
-            local_size0, \
-            this->all_size0, \
-            this->nprocs, \
-            MPI_INT, \
-            MPI_SUM, \
-            this->comm); \
-    delete[] local_size0; \
-    DEBUG_MSG("exiting field_descriptor constructor\n"); \
-} \
- \
-template <> \
-int field_descriptor<R>::read( \
-        const char *fname, \
-        void *buffer) \
-{ \
-    DEBUG_MSG("entered field_descriptor::read\n"); \
-    char representation[] = "native"; \
-    if (this->subsizes[0] > 0) \
-    { \
-        MPI_Info info; \
-        MPI_Info_create(&info); \
-        MPI_File f; \
-        ptrdiff_t read_size = this->local_size*sizeof(R); \
-        DEBUG_MSG("read size is %ld\n", read_size); \
-        char ffname[200]; \
-        if (this->mpi_dtype == MPI_CNUM) \
-            read_size *= 2; \
-        DEBUG_MSG("read size is %ld\n", read_size); \
-        sprintf(ffname, "%s", fname); \
- \
-        MPI_File_open( \
-                this->io_comm, \
-                ffname, \
-                MPI_MODE_RDONLY, \
-                info, \
-                &f); \
-        DEBUG_MSG("opened file\n"); \
-        MPI_File_set_view( \
-                f, \
-                0, \
-                MPI_UNSIGNED_CHAR, \
-                this->mpi_array_dtype, \
-                representation, \
-                info); \
-        DEBUG_MSG("view is set\n"); \
-        MPI_File_read_all( \
-                f, \
-                buffer, \
-                read_size, \
-                MPI_UNSIGNED_CHAR, \
-                MPI_STATUS_IGNORE); \
-        DEBUG_MSG("info is read\n"); \
-        MPI_File_close(&f); \
-    } \
-    DEBUG_MSG("finished with field_descriptor::read\n"); \
-    return EXIT_SUCCESS; \
-} \
- \
-template <> \
-int field_descriptor<R>::write( \
-        const char *fname, \
-        void *buffer) \
-{ \
-    char representation[] = "native"; \
-    if (this->subsizes[0] > 0) \
-    { \
-        MPI_Info info; \
-        MPI_Info_create(&info); \
-        MPI_File f; \
-        ptrdiff_t read_size = this->local_size*sizeof(R); \
-        char ffname[200]; \
-        if (this->mpi_dtype == MPI_CNUM) \
-            read_size *= 2; \
-        sprintf(ffname, "%s", fname); \
- \
-        MPI_File_open( \
-                this->io_comm, \
-                ffname, \
-                MPI_MODE_CREATE | MPI_MODE_WRONLY, \
-                info, \
-                &f); \
-        MPI_File_set_view( \
-                f, \
-                0, \
-                MPI_UNSIGNED_CHAR, \
-                this->mpi_array_dtype, \
-                representation, \
-                info); \
-        MPI_File_write_all( \
-                f, \
-                buffer, \
-                read_size, \
-                MPI_UNSIGNED_CHAR, \
-                MPI_STATUS_IGNORE); \
-        MPI_File_close(&f); \
-    } \
- \
-    return EXIT_SUCCESS; \
-} \
- \
-template <> \
-int field_descriptor<R>::transpose( \
-        R *input, \
-        R *output) \
-{ \
-    /* IMPORTANT NOTE: \
-     for 3D transposition, the input data is messed up */ \
-    FFTW(plan) tplan; \
-    if (this->ndims == 3) \
-    { \
-        /* transpose the two local dimensions 1 and 2 */ \
-        R *atmp; \
-        atmp = FFTW(alloc_real)(this->slice_size); \
-        for (int k = 0; k < this->subsizes[0]; k++) \
-        { \
-            /* put transposed slice in atmp */ \
-            for (int j = 0; j < this->sizes[1]; j++) \
-                for (int i = 0; i < this->sizes[2]; i++) \
-                    atmp[i*this->sizes[1] + j] = \
-                        input[(k*this->sizes[1] + j)*this->sizes[2] + i]; \
-            /* copy back transposed slice */ \
-            std::copy( \
-                    atmp, \
-                    atmp + this->slice_size, \
-                    input + k*this->slice_size); \
-        } \
-        FFTW(free)(atmp); \
-    } \
-    tplan = FFTW(mpi_plan_transpose)( \
-            this->sizes[0], this->slice_size, \
-            input, output, \
-            this->comm, \
-            FFTW_ESTIMATE); \
-    FFTW(execute)(tplan); \
-    FFTW(destroy_plan)(tplan); \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-int field_descriptor<R>::transpose( \
-        FFTW(complex) *input, \
-        FFTW(complex) *output) \
-{ \
-    switch (this->ndims) \
-    { \
-        case 2: \
-            /* do a global transpose over the 2 dimensions */ \
-            if (output == NULL) \
-            { \
-                std::cerr << "bad arguments for transpose.\n" << std::endl; \
-                return EXIT_FAILURE; \
-            } \
-            FFTW(plan) tplan; \
-            tplan = FFTW(mpi_plan_many_transpose)( \
-                    this->sizes[0], this->sizes[1], 2, \
-                    FFTW_MPI_DEFAULT_BLOCK, \
-                    FFTW_MPI_DEFAULT_BLOCK, \
-                    (R*)input, (R*)output, \
-                    this->comm, \
-                    FFTW_ESTIMATE); \
-            FFTW(execute)(tplan); \
-            FFTW(destroy_plan)(tplan); \
-            break; \
-        case 3: \
-            /* transpose the two local dimensions 1 and 2 */ \
-            FFTW(complex) *atmp; \
-            atmp = FFTW(alloc_complex)(this->slice_size); \
-            for (int k = 0; k < this->subsizes[0]; k++) \
-            { \
-                /* put transposed slice in atmp */ \
-                for (int j = 0; j < this->sizes[1]; j++) \
-                    for (int i = 0; i < this->sizes[2]; i++) \
-                    { \
-                        atmp[i*this->sizes[1] + j][0] = \
-                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][0]; \
-                        atmp[i*this->sizes[1] + j][1] = \
-                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][1]; \
-                    } \
-                /* copy back transposed slice */ \
-                std::copy( \
-                        (R*)(atmp), \
-                        (R*)(atmp + this->slice_size), \
-                        (R*)(input + k*this->slice_size)); \
-            } \
-            FFTW(free)(atmp); \
-            break; \
-        default: \
-            return EXIT_FAILURE; \
-            break; \
-    } \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-int field_descriptor<R>::interleave( \
-        R *a, \
-        int dim) \
-{ \
-/* the following is copied from \
- * http://agentzlerich.blogspot.com/2010/01/using-fftw-for-in-place-matrix.html \
- * */ \
-    FFTW(iodim) howmany_dims[2]; \
-    howmany_dims[0].n  = dim; \
-    howmany_dims[0].is = this->local_size; \
-    howmany_dims[0].os = 1; \
-    howmany_dims[1].n  = this->local_size; \
-    howmany_dims[1].is = 1; \
-    howmany_dims[1].os = dim; \
-    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); \
- \
-    FFTW(plan) tmp = FFTW(plan_guru_r2r)( \
-            /*rank*/0, \
-            /*dims*/NULL, \
-            howmany_rank, \
-            howmany_dims, \
-            a, \
-            a, \
-            /*kind*/NULL, \
-            FFTW_ESTIMATE); \
-    FFTW(execute)(tmp); \
-    FFTW(destroy_plan)(tmp); \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-int field_descriptor<R>::interleave( \
-        FFTW(complex) *a, \
-        int dim) \
-{ \
-    FFTW(iodim) howmany_dims[2]; \
-    howmany_dims[0].n  = dim; \
-    howmany_dims[0].is = this->local_size; \
-    howmany_dims[0].os = 1; \
-    howmany_dims[1].n  = this->local_size; \
-    howmany_dims[1].is = 1; \
-    howmany_dims[1].os = dim; \
-    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]); \
- \
-    FFTW(plan) tmp = FFTW(plan_guru_dft)( \
-            /*rank*/0, \
-            /*dims*/NULL, \
-            howmany_rank, \
-            howmany_dims, \
-            a, \
-            a, \
-            +1, \
-            FFTW_ESTIMATE); \
-    FFTW(execute)(tmp); \
-    FFTW(destroy_plan)(tmp); \
-    return EXIT_SUCCESS; \
-} \
- \
-template<> \
-field_descriptor<R>* field_descriptor<R>::get_transpose() \
-{ \
-    int n[this->ndims]; \
-    for (int i=0; i<this->ndims; i++) \
-        n[i] = this->sizes[this->ndims - i - 1]; \
-    return new field_descriptor<R>(this->ndims, n, this->mpi_dtype, this->comm); \
-} \
 
-/*****************************************************************************/
+template <class rnumber>
+field_descriptor<rnumber>::field_descriptor(
+        int ndims,
+        int *n,
+        MPI_Datatype element_type,
+        MPI_Comm COMM_TO_USE)
+{
+    DEBUG_MSG("entered field_descriptor::field_descriptor\n");
+    this->comm = COMM_TO_USE;
+    MPI_Comm_rank(this->comm, &this->myrank);
+    MPI_Comm_size(this->comm, &this->nprocs);
+    this->ndims = ndims;
+    this->sizes    = new int[ndims];
+    this->subsizes = new int[ndims];
+    this->starts   = new int[ndims];
+    int tsizes   [ndims];
+    int tsubsizes[ndims];
+    int tstarts  [ndims];
+    ptrdiff_t *nfftw = new ptrdiff_t[ndims];
+    ptrdiff_t local_n0, local_0_start;
+    for (int i = 0; i < this->ndims; i++)
+        nfftw[i] = n[i];
+    this->local_size = fftw_mpi_local_size_many(
+                this->ndims,
+                nfftw,
+                1,
+                FFTW_MPI_DEFAULT_BLOCK,
+                this->comm,
+                &local_n0,
+                &local_0_start);
+    this->sizes[0] = n[0];
+    this->subsizes[0] = (int)local_n0;
+    this->starts[0] = (int)local_0_start;
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "first subsizes[0] = %d %d %d\n",
+                this->subsizes[0],
+            tsubsizes[0],
+            (int)local_n0);
+    tsizes[0] = n[0];
+    tsubsizes[0] = (int)local_n0;
+    tstarts[0] = (int)local_0_start;
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "second subsizes[0] = %d %d %d\n",
+                this->subsizes[0],
+            tsubsizes[0],
+            (int)local_n0);
+    this->mpi_dtype = element_type;
+    this->slice_size = 1;
+    this->full_size = this->sizes[0];
+    for (int i = 1; i < this->ndims; i++)
+    {
+        this->sizes[i] = n[i];
+        this->subsizes[i] = n[i];
+        this->starts[i] = 0;
+        this->slice_size *= this->subsizes[i];
+        this->full_size *= this->sizes[i];
+        tsizes[i] = this->sizes[i];
+        tsubsizes[i] = this->subsizes[i];
+        tstarts[i] = this->starts[i];
+    }
+    tsizes[ndims-1] *= sizeof(rnumber);
+    tsubsizes[ndims-1] *= sizeof(rnumber);
+    tstarts[ndims-1] *= sizeof(rnumber);
+    if (this->mpi_dtype == mpi_real_type<rnumber>::complex())
+    {
+        tsizes[ndims-1] *= 2;
+        tsubsizes[ndims-1] *= 2;
+        tstarts[ndims-1] *= 2;
+    }
+    int local_zero_array[this->nprocs], zero_array[this->nprocs];
+    for (int i=0; i<this->nprocs; i++)
+        local_zero_array[i] = 0;
+    local_zero_array[this->myrank] = (this->subsizes[0] == 0) ? 1 : 0;
+    MPI_Allreduce(
+                local_zero_array,
+                zero_array,
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    int no_of_excluded_ranks = 0;
+    for (int i = 0; i<this->nprocs; i++)
+        no_of_excluded_ranks += zero_array[i];
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "subsizes[0] = %d %d\n",
+                this->subsizes[0],
+            tsubsizes[0]);
+    if (no_of_excluded_ranks == 0)
+    {
+        this->io_comm = this->comm;
+        this->io_nprocs = this->nprocs;
+        this->io_myrank = this->myrank;
+    }
+    else
+    {
+        int excluded_rank[no_of_excluded_ranks];
+        for (int i=0, j=0; i<this->nprocs; i++)
+            if (zero_array[i])
+            {
+                excluded_rank[j] = i;
+                j++;
+            }
+        MPI_Group tgroup0, tgroup;
+        MPI_Comm_group(this->comm, &tgroup0);
+        MPI_Group_excl(tgroup0, no_of_excluded_ranks, excluded_rank, &tgroup);
+        MPI_Comm_create(this->comm, tgroup, &this->io_comm);
+        MPI_Group_free(&tgroup0);
+        MPI_Group_free(&tgroup);
+        if (this->subsizes[0] > 0)
+        {
+            MPI_Comm_rank(this->io_comm, &this->io_myrank);
+            MPI_Comm_size(this->io_comm, &this->io_nprocs);
+        }
+        else
+        {
+            this->io_myrank = MPI_PROC_NULL;
+            this->io_nprocs = -1;
+        }
+    }
+    DEBUG_MSG_WAIT(
+                this->comm,
+                "inside field_descriptor constructor, about to call "
+                "MPI_Type_create_subarray "
+                "%d %d %d\n",
+                this->sizes[0],
+            this->subsizes[0],
+            this->starts[0]);
+    for (int i=0; i<this->ndims; i++)
+        DEBUG_MSG_WAIT(
+                    this->comm,
+                    "tsizes "
+                    "%d %d %d\n",
+                    tsizes[i],
+                    tsubsizes[i],
+                    tstarts[i]);
+    if (this->subsizes[0] > 0)
+    {
+        DEBUG_MSG("creating subarray\n");
+        MPI_Type_create_subarray(
+                    ndims,
+                    tsizes,
+                    tsubsizes,
+                    tstarts,
+                    MPI_ORDER_C,
+                    MPI_UNSIGNED_CHAR,
+                    &this->mpi_array_dtype);
+        MPI_Type_commit(&this->mpi_array_dtype);
+    }
+    this->rank = new int[this->sizes[0]];
+    int *local_rank = new int[this->sizes[0]];
+    std::fill_n(local_rank, this->sizes[0], 0);
+    for (int i = 0; i < this->sizes[0]; i++)
+        if (i >= this->starts[0] && i < this->starts[0] + this->subsizes[0])
+            local_rank[i] = this->myrank;
+    MPI_Allreduce(
+                local_rank,
+                this->rank,
+                this->sizes[0],
+            MPI_INT,
+            MPI_SUM,
+            this->comm);
+    delete[] local_rank;
+    this->all_start0 = new int[this->nprocs];
+    int *local_start0 = new int[this->nprocs];
+    std::fill_n(local_start0, this->nprocs, 0);
+    for (int i = 0; i < this->nprocs; i++)
+        if (this->myrank == i)
+            local_start0[i] = this->starts[0];
+    MPI_Allreduce(
+                local_start0,
+                this->all_start0,
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    delete[] local_start0;
+    this->all_size0  = new int[this->nprocs];
+    int *local_size0 = new int[this->nprocs];
+    std::fill_n(local_size0, this->nprocs, 0);
+    for (int i = 0; i < this->nprocs; i++)
+        if (this->myrank == i)
+            local_size0[i] = this->subsizes[0];
+    MPI_Allreduce(
+                local_size0,
+                this->all_size0,
+                this->nprocs,
+                MPI_INT,
+                MPI_SUM,
+                this->comm);
+    delete[] local_size0;
+    DEBUG_MSG("exiting field_descriptor constructor\n");
+}
 
+template <class rnumber>
+int field_descriptor<rnumber>::read(
+        const char *fname,
+        void *buffer)
+{
+    DEBUG_MSG("entered field_descriptor::read\n");
+    char representation[] = "native";
+    if (this->subsizes[0] > 0)
+    {
+        MPI_Info info;
+        MPI_Info_create(&info);
+        MPI_File f;
+        ptrdiff_t read_size = this->local_size*sizeof(rnumber);
+        DEBUG_MSG("read size is %ld\n", read_size);
+        char ffname[200];
+        if (this->mpi_dtype == mpi_real_type<rnumber>::complex())
+            read_size *= 2;
+        DEBUG_MSG("read size is %ld\n", read_size);
+        sprintf(ffname, "%s", fname);
 
+        MPI_File_open(
+                    this->io_comm,
+                    ffname,
+                    MPI_MODE_RDONLY,
+                    info,
+                    &f);
+        DEBUG_MSG("opened file\n");
+        MPI_File_set_view(
+                    f,
+                    0,
+                    MPI_UNSIGNED_CHAR,
+                    this->mpi_array_dtype,
+                    representation,
+                    info);
+        DEBUG_MSG("view is set\n");
+        MPI_File_read_all(
+                    f,
+                    buffer,
+                    read_size,
+                    MPI_UNSIGNED_CHAR,
+                    MPI_STATUS_IGNORE);
+        DEBUG_MSG("info is read\n");
+        MPI_File_close(&f);
+    }
+    DEBUG_MSG("finished with field_descriptor::read\n");
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::write(
+        const char *fname,
+        void *buffer)
+{
+    char representation[] = "native";
+    if (this->subsizes[0] > 0)
+    {
+        MPI_Info info;
+        MPI_Info_create(&info);
+        MPI_File f;
+        ptrdiff_t read_size = this->local_size*sizeof(rnumber);
+        char ffname[200];
+        if (this->mpi_dtype == mpi_real_type<rnumber>::complex())
+            read_size *= 2;
+        sprintf(ffname, "%s", fname);
+
+        MPI_File_open(
+                    this->io_comm,
+                    ffname,
+                    MPI_MODE_CREATE | MPI_MODE_WRONLY,
+                    info,
+                    &f);
+        MPI_File_set_view(
+                    f,
+                    0,
+                    MPI_UNSIGNED_CHAR,
+                    this->mpi_array_dtype,
+                    representation,
+                    info);
+        MPI_File_write_all(
+                    f,
+                    buffer,
+                    read_size,
+                    MPI_UNSIGNED_CHAR,
+                    MPI_STATUS_IGNORE);
+        MPI_File_close(&f);
+    }
+
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::transpose(
+        rnumber *input,
+        rnumber *output)
+{
+    /* IMPORTANT NOTE:
+     for 3D transposition, the input data is messed up */
+    typename fftw_interface<rnumber>::plan tplan;
+    if (this->ndims == 3)
+    {
+        /* transpose the two local dimensions 1 and 2 */
+        rnumber *atmp;
+        atmp = fftw_interface<rnumber>::alloc_real(this->slice_size);
+        for (int k = 0; k < this->subsizes[0]; k++)
+        {
+            /* put transposed slice in atmp */
+            for (int j = 0; j < this->sizes[1]; j++)
+                for (int i = 0; i < this->sizes[2]; i++)
+                    atmp[i*this->sizes[1] + j] =
+                            input[(k*this->sizes[1] + j)*this->sizes[2] + i];
+            /* copy back transposed slice */
+            std::copy(
+                        atmp,
+                        atmp + this->slice_size,
+                        input + k*this->slice_size);
+        }
+        fftw_interface<rnumber>::free(atmp);
+    }
+    tplan = fftw_interface<rnumber>::mpi_plan_transpose(
+                this->sizes[0], this->slice_size,
+            input, output,
+            this->comm,
+            FFTW_ESTIMATE);
+    fftw_interface<rnumber>::execute(tplan);
+    fftw_interface<rnumber>::destroy_plan(tplan);
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::transpose(
+        typename fftw_interface<rnumber>::complex *input,
+        typename fftw_interface<rnumber>::complex *output)
+{
+    switch (this->ndims)
+    {
+    case 2:
+        /* do a global transpose over the 2 dimensions */
+        if (output == NULL)
+        {
+            std::cerr << "bad arguments for transpose.\n" << std::endl;
+            return EXIT_FAILURE;
+        }
+        typename fftw_interface<rnumber>::plan tplan;
+        tplan = fftw_interface<rnumber>::mpi_plan_many_transpose(
+                    this->sizes[0], this->sizes[1], 2,
+                FFTW_MPI_DEFAULT_BLOCK,
+                FFTW_MPI_DEFAULT_BLOCK,
+                (rnumber*)input, (rnumber*)output,
+                this->comm,
+                FFTW_ESTIMATE);
+        fftw_interface<rnumber>::execute(tplan);
+        fftw_interface<rnumber>::destroy_plan(tplan);
+        break;
+    case 3:
+        /* transpose the two local dimensions 1 and 2 */
+        typename fftw_interface<rnumber>::complex *atmp;
+        atmp = fftw_interface<rnumber>::alloc_complex(this->slice_size);
+        for (int k = 0; k < this->subsizes[0]; k++)
+        {
+            /* put transposed slice in atmp */
+            for (int j = 0; j < this->sizes[1]; j++)
+                for (int i = 0; i < this->sizes[2]; i++)
+                {
+                    atmp[i*this->sizes[1] + j][0] =
+                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][0];
+                    atmp[i*this->sizes[1] + j][1] =
+                            input[(k*this->sizes[1] + j)*this->sizes[2] + i][1];
+                }
+            /* copy back transposed slice */
+            std::copy(
+                        (rnumber*)(atmp),
+                        (rnumber*)(atmp + this->slice_size),
+                        (rnumber*)(input + k*this->slice_size));
+        }
+        fftw_interface<rnumber>::free(atmp);
+        break;
+    default:
+        return EXIT_FAILURE;
+        break;
+    }
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::interleave(
+        rnumber *a,
+        int dim)
+{
+    /* the following is copied from
+ * http://agentzlerich.blogspot.com/2010/01/using-fftw-for-in-place-matrix.html
+ * */
+    typename fftw_interface<rnumber>::iodim howmany_dims[2];
+    howmany_dims[0].n  = dim;
+    howmany_dims[0].is = this->local_size;
+    howmany_dims[0].os = 1;
+    howmany_dims[1].n  = this->local_size;
+    howmany_dims[1].is = 1;
+    howmany_dims[1].os = dim;
+    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]);
+
+    typename fftw_interface<rnumber>::plan tmp = fftw_interface<rnumber>::plan_guru_r2r(
+                /*rank*/0,
+                /*dims*/nullptr,
+                howmany_rank,
+                howmany_dims,
+                a,
+                a,
+                /*kind*/nullptr,
+                FFTW_ESTIMATE);
+    fftw_interface<rnumber>::execute(tmp);
+    fftw_interface<rnumber>::destroy_plan(tmp);
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+int field_descriptor<rnumber>::interleave(
+        typename fftw_interface<rnumber>::complex *a,
+        int dim)
+{
+    typename fftw_interface<rnumber>::iodim howmany_dims[2];
+    howmany_dims[0].n  = dim;
+    howmany_dims[0].is = this->local_size;
+    howmany_dims[0].os = 1;
+    howmany_dims[1].n  = this->local_size;
+    howmany_dims[1].is = 1;
+    howmany_dims[1].os = dim;
+    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]);
+
+    typename fftw_interface<rnumber>::plan tmp = fftw_interface<rnumber>::plan_guru_dft(
+                /*rank*/0,
+                /*dims*/nullptr,
+                howmany_rank,
+                howmany_dims,
+                a,
+                a,
+                +1,
+                FFTW_ESTIMATE);
+    fftw_interface<rnumber>::execute(tmp);
+    fftw_interface<rnumber>::destroy_plan(tmp);
+    return EXIT_SUCCESS;
+}
+
+template <class rnumber>
+field_descriptor<rnumber>* field_descriptor<rnumber>::get_transpose()
+{
+    int n[this->ndims];
+    for (int i=0; i<this->ndims; i++)
+        n[i] = this->sizes[this->ndims - i - 1];
+    return new field_descriptor<rnumber>(this->ndims, n, this->mpi_dtype, this->comm);
+}
 
 /*****************************************************************************/
-/* now actually use the macro defined above                                  */
-CLASS_IMPLEMENTATION(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-CLASS_IMPLEMENTATION(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
 /*****************************************************************************/
 
 
@@ -511,23 +496,23 @@ template <class rnumber>
 field_descriptor<rnumber>::~field_descriptor()
 {
     DEBUG_MSG_WAIT(
-            MPI_COMM_WORLD,
-            this->io_comm == MPI_COMM_NULL ? "null\n" : "not null\n");
+                MPI_COMM_WORLD,
+                this->io_comm == MPI_COMM_NULL ? "null\n" : "not null\n");
     DEBUG_MSG_WAIT(
-            MPI_COMM_WORLD,
-            "subsizes[0] = %d \n", this->subsizes[0]);
+                MPI_COMM_WORLD,
+                "subsizes[0] = %d \n", this->subsizes[0]);
     if (this->subsizes[0] > 0)
     {
         DEBUG_MSG_WAIT(
-                this->io_comm,
-                "deallocating mpi_array_dtype\n");
+                    this->io_comm,
+                    "deallocating mpi_array_dtype\n");
         MPI_Type_free(&this->mpi_array_dtype);
     }
     if (this->nprocs != this->io_nprocs && this->io_myrank != MPI_PROC_NULL)
     {
         DEBUG_MSG_WAIT(
-                this->io_comm,
-                "freeing io_comm\n");
+                    this->io_comm,
+                    "freeing io_comm\n");
         MPI_Comm_free(&this->io_comm);
     }
     delete[] this->sizes;
diff --git a/bfps/cpp/field_descriptor.hpp b/bfps/cpp/field_descriptor.hpp
index bfcf52ed415ddb90bd77a6c6793974aea6a94734..2fb491bca7c130704fc5de5d22c3393cb196eec7 100644
--- a/bfps/cpp/field_descriptor.hpp
+++ b/bfps/cpp/field_descriptor.hpp
@@ -26,6 +26,7 @@
 
 #include <mpi.h>
 #include <fftw3-mpi.h>
+#include "fftw_interface.hpp"
 
 #ifndef FIELD_DESCRIPTOR
 
@@ -85,14 +86,14 @@ class field_descriptor
                 rnumber *input,
                 rnumber *output);
         int transpose(
-                cnumber *input,
-                cnumber *output = NULL);
+                typename fftw_interface<rnumber>::complex *input,
+                typename fftw_interface<rnumber>::complex *output = NULL);
 
         int interleave(
                 rnumber *input,
                 int dim);
         int interleave(
-                cnumber *input,
+                typename fftw_interface<rnumber>::complex *input,
                 int dim);
 };
 
diff --git a/bfps/cpp/fluid_solver.cpp b/bfps/cpp/fluid_solver.cpp
index 61d83fe38ab632d39ee76ad2f131fb41dce80e7c..9456cc186caefcaba4fa10af1a027a565a16c2a9 100644
--- a/bfps/cpp/fluid_solver.cpp
+++ b/bfps/cpp/fluid_solver.cpp
@@ -48,911 +48,936 @@ void fluid_solver<rnumber>::impose_zero_modes()
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
-#define FLUID_SOLVER_DEFINITIONS(FFTW, R, MPI_RNUM, MPI_CNUM) \
- \
-template<> \
-fluid_solver<R>::fluid_solver( \
-        const char *NAME, \
-        int nx, \
-        int ny, \
-        int nz, \
-        double DKX, \
-        double DKY, \
-        double DKZ, \
-        int DEALIAS_TYPE, \
-        unsigned FFTW_PLAN_RIGOR) : fluid_solver_base<R>( \
-                NAME, \
-                nx , ny , nz, \
-                DKX, DKY, DKZ, \
-                DEALIAS_TYPE, \
-                FFTW_PLAN_RIGOR) \
-{ \
-    this->cvorticity = FFTW(alloc_complex)(this->cd->local_size);\
-    this->cvelocity  = FFTW(alloc_complex)(this->cd->local_size);\
-    this->rvorticity = FFTW(alloc_real)(this->cd->local_size*2);\
-    /*this->rvelocity  = (R*)(this->cvelocity);*/\
-    this->rvelocity  = FFTW(alloc_real)(this->cd->local_size*2);\
- \
-    this->ru = this->rvelocity;\
-    this->cu = this->cvelocity;\
- \
-    this->rv[0] = this->rvorticity;\
-    this->rv[3] = this->rvorticity;\
-    this->cv[0] = this->cvorticity;\
-    this->cv[3] = this->cvorticity;\
- \
-    this->cv[1] = FFTW(alloc_complex)(this->cd->local_size);\
-    this->cv[2] = this->cv[1];\
-    this->rv[1] = FFTW(alloc_real)(this->cd->local_size*2);\
-    this->rv[2] = this->rv[1];\
- \
-    this->c2r_vorticity = new FFTW(plan);\
-    this->r2c_vorticity = new FFTW(plan);\
-    this->c2r_velocity  = new FFTW(plan);\
-    this->r2c_velocity  = new FFTW(plan);\
- \
-    ptrdiff_t sizes[] = {nz, \
-                         ny, \
-                         nx};\
- \
-    *(FFTW(plan)*)this->c2r_vorticity = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cvorticity, this->rvorticity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->r2c_vorticity = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rvorticity, this->cvorticity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    *(FFTW(plan)*)this->c2r_velocity = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cvelocity, this->rvelocity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->r2c_velocity = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rvelocity, this->cvelocity, \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    this->uc2r = this->c2r_velocity;\
-    this->ur2c = this->r2c_velocity;\
-    this->vc2r[0] = this->c2r_vorticity;\
-    this->vr2c[0] = this->r2c_vorticity;\
- \
-    this->vc2r[1] = new FFTW(plan);\
-    this->vr2c[1] = new FFTW(plan);\
-    this->vc2r[2] = new FFTW(plan);\
-    this->vr2c[2] = new FFTW(plan);\
- \
-    *(FFTW(plan)*)(this->vc2r[1]) = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cv[1], this->rv[1], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->vc2r[2] = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->cv[2], this->rv[2], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
- \
-    *(FFTW(plan)*)this->vr2c[1] = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rv[1], this->cv[1], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    *(FFTW(plan)*)this->vr2c[2] = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
-            this->rv[2], this->cv[2], \
-            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT); \
- \
-    /* ``physical'' parameters etc, initialized here just in case */ \
- \
-    this->nu = 0.1; \
-    this->fmode = 1; \
-    this->famplitude = 1.0; \
-    this->fk0  = 0; \
-    this->fk1 = 3.0; \
-    /* initialization of fields must be done AFTER planning */ \
-    std::fill_n((R*)this->cvorticity, this->cd->local_size*2, 0.0); \
-    std::fill_n((R*)this->cvelocity, this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rvelocity, this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); \
-    std::fill_n((R*)this->cv[1], this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rv[1], this->cd->local_size*2, 0.0); \
-    std::fill_n(this->rv[2], this->cd->local_size*2, 0.0); \
-} \
- \
-template<> \
-fluid_solver<R>::~fluid_solver() \
-{ \
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_vorticity);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_vorticity);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_velocity );\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_velocity );\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[1]);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[1]);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[2]);\
-    FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[2]);\
- \
-    delete (FFTW(plan)*)this->c2r_vorticity;\
-    delete (FFTW(plan)*)this->r2c_vorticity;\
-    delete (FFTW(plan)*)this->c2r_velocity ;\
-    delete (FFTW(plan)*)this->r2c_velocity ;\
-    delete (FFTW(plan)*)this->vc2r[1];\
-    delete (FFTW(plan)*)this->vr2c[1];\
-    delete (FFTW(plan)*)this->vc2r[2];\
-    delete (FFTW(plan)*)this->vr2c[2];\
- \
-    FFTW(free)(this->cv[1]);\
-    FFTW(free)(this->rv[1]);\
-    FFTW(free)(this->cvorticity);\
-    FFTW(free)(this->rvorticity);\
-    FFTW(free)(this->cvelocity);\
-    FFTW(free)(this->rvelocity);\
-} \
- \
-template<> \
-void fluid_solver<R>::compute_vorticity() \
-{ \
-    ptrdiff_t tindex; \
-    CLOOP_K2( \
-            this, \
-            tindex = 3*cindex; \
-            if (k2 <= this->kM2) \
-            { \
-                this->cvorticity[tindex+0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); \
-                this->cvorticity[tindex+1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); \
-                this->cvorticity[tindex+2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); \
-                this->cvorticity[tindex+0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); \
-                this->cvorticity[tindex+1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); \
-                this->cvorticity[tindex+2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); \
-            } \
-            else \
-                std::fill_n((R*)(this->cvorticity+tindex), 6, 0.0); \
-            ); \
-    this->symmetrize(this->cvorticity, 3); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_velocity(FFTW(complex) *vorticity) \
-{ \
-    ptrdiff_t tindex; \
-    CLOOP_K2( \
-            this, \
-            tindex = 3*cindex; \
-            if (k2 <= this->kM2 && k2 > 0) \
-            { \
-                this->cu[tindex+0][0] = -(this->ky[yindex]*vorticity[tindex+2][1] - this->kz[zindex]*vorticity[tindex+1][1]) / k2; \
-                this->cu[tindex+1][0] = -(this->kz[zindex]*vorticity[tindex+0][1] - this->kx[xindex]*vorticity[tindex+2][1]) / k2; \
-                this->cu[tindex+2][0] = -(this->kx[xindex]*vorticity[tindex+1][1] - this->ky[yindex]*vorticity[tindex+0][1]) / k2; \
-                this->cu[tindex+0][1] =  (this->ky[yindex]*vorticity[tindex+2][0] - this->kz[zindex]*vorticity[tindex+1][0]) / k2; \
-                this->cu[tindex+1][1] =  (this->kz[zindex]*vorticity[tindex+0][0] - this->kx[xindex]*vorticity[tindex+2][0]) / k2; \
-                this->cu[tindex+2][1] =  (this->kx[xindex]*vorticity[tindex+1][0] - this->ky[yindex]*vorticity[tindex+0][0]) / k2; \
-            } \
-            else \
-                std::fill_n((R*)(this->cu+tindex), 6, 0.0); \
-            ); \
-    /*this->symmetrize(this->cu, 3);*/ \
-} \
- \
-template<> \
-void fluid_solver<R>::ift_velocity() \
-{ \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::ift_vorticity() \
-{ \
-    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0); \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_vorticity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::dft_velocity() \
-{ \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::dft_vorticity() \
-{ \
-    std::fill_n((R*)this->cvorticity, this->cd->local_size*2, 0.0); \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity )); \
-} \
- \
-template<> \
-void fluid_solver<R>::add_forcing(\
-        FFTW(complex) *acc_field, FFTW(complex) *vort_field, R factor) \
-{ \
-    if (strcmp(this->forcing_type, "none") == 0) \
-        return; \
-    if (strcmp(this->forcing_type, "Kolmogorov") == 0) \
-    { \
-        ptrdiff_t cindex; \
-        if (this->cd->myrank == this->cd->rank[this->fmode]) \
-        { \
-            cindex = ((this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; \
-            acc_field[cindex+2][0] -= this->famplitude*factor/2; \
-        } \
-        if (this->cd->myrank == this->cd->rank[this->cd->sizes[0] - this->fmode]) \
-        { \
-            cindex = ((this->cd->sizes[0] - this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3; \
-            acc_field[cindex+2][0] -= this->famplitude*factor/2; \
-        } \
-        return; \
-    } \
-    if (strcmp(this->forcing_type, "linear") == 0) \
-    { \
-        double knorm; \
-        CLOOP( \
-                this, \
-                knorm = sqrt(this->kx[xindex]*this->kx[xindex] + \
-                             this->ky[yindex]*this->ky[yindex] + \
-                             this->kz[zindex]*this->kz[zindex]); \
-                if ((this->fk0 <= knorm) && \
-                    (this->fk1 >= knorm)) \
-                    for (int c=0; c<3; c++) \
-                    for (int i=0; i<2; i++) \
-                        acc_field[cindex*3+c][i] += this->famplitude*vort_field[cindex*3+c][i]*factor; \
-             ); \
-        return; \
-    } \
-} \
- \
-template<> \
-void fluid_solver<R>::omega_nonlin( \
-        int src) \
-{ \
-    assert(src >= 0 && src < 3); \
-    this->compute_velocity(this->cv[src]); \
-    /* get fields from Fourier space to real space */ \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity ));  \
-    FFTW(execute)(*((FFTW(plan)*)this->vc2r[src]));      \
-    /* compute cross product $u \times \omega$, and normalize */ \
-    R tmp[3][2]; \
-    ptrdiff_t tindex; \
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                tmp[cc][0] = (this->ru[tindex+(cc+1)%3]*this->rv[src][tindex+(cc+2)%3] - \
-                              this->ru[tindex+(cc+2)%3]*this->rv[src][tindex+(cc+1)%3]); \
-            for (int cc=0; cc<3; cc++) \
-                this->ru[(3*rindex)+cc] = tmp[cc][0] / this->normalization_factor; \
-            ); \
-    /* go back to Fourier space */ \
-    this->clean_up_real_space(this->ru, 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \
-    this->dealias(this->cu, 3); \
-    /* $\imath k \times Fourier(u \times \omega)$ */ \
-    CLOOP( \
-            this, \
-            tindex = 3*cindex; \
-            { \
-                tmp[0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]); \
-                tmp[1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]); \
-                tmp[2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]); \
-                tmp[0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]); \
-                tmp[1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]); \
-                tmp[2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]); \
-            } \
-            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                this->cu[tindex+cc][i] = tmp[cc][i]; \
-            ); \
-    this->add_forcing(this->cu, this->cv[src], 1.0); \
-    this->force_divfree(this->cu); \
-} \
- \
-template<> \
-void fluid_solver<R>::step(double dt) \
-{ \
-    double factor0, factor1; \
-    std::fill_n((R*)this->cv[1], this->cd->local_size*2, 0.0); \
-    this->omega_nonlin(0); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                factor0 = exp(-this->nu * k2 * dt); \
-                for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                    this->cv[1][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i] + \
-                                                   dt*this->cu[3*cindex+cc][i])*factor0; \
-            } \
-            ); \
- \
-    this->omega_nonlin(1); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                factor0 = exp(-this->nu * k2 * dt/2); \
-                factor1 = exp( this->nu * k2 * dt/2); \
-                for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                    this->cv[2][3*cindex+cc][i] = (3*this->cv[0][3*cindex+cc][i]*factor0 + \
-                                                   (this->cv[1][3*cindex+cc][i] + \
-                                                    dt*this->cu[3*cindex+cc][i])*factor1)*0.25; \
-            } \
-            ); \
- \
-    this->omega_nonlin(2); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                factor0 = exp(-this->nu * k2 * dt * 0.5); \
-                for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++) \
-                    this->cv[3][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i]*factor0 + \
-                                                   2*(this->cv[2][3*cindex+cc][i] + \
-                                                      dt*this->cu[3*cindex+cc][i]))*factor0/3; \
-            } \
-            ); \
- \
-    this->force_divfree(this->cvorticity); \
-    this->symmetrize(this->cvorticity, 3); \
-    this->iteration++; \
-} \
- \
-template<> \
-int fluid_solver<R>::read(char field, char representation) \
-{ \
-    char fname[512]; \
-    int read_result; \
-    if (field == 'v') \
-    { \
-        if (representation == 'c') \
-        { \
-            this->fill_up_filename("cvorticity", fname); \
-            read_result = this->cd->read(fname, (void*)this->cvorticity); \
-            if (read_result != EXIT_SUCCESS) \
-                return read_result; \
-        } \
-        if (representation == 'r') \
-        { \
-            read_result = this->read_base("rvorticity", this->rvorticity); \
-            if (read_result != EXIT_SUCCESS) \
-                return read_result; \
-            else \
-                FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity )); \
-        } \
-        this->low_pass_Fourier(this->cvorticity, 3, this->kM); \
-        this->force_divfree(this->cvorticity); \
-        this->symmetrize(this->cvorticity, 3); \
-        return EXIT_SUCCESS; \
-    } \
-    if ((field == 'u') && (representation == 'c')) \
-    { \
-        read_result = this->read_base("cvelocity", this->cvelocity); \
-        this->low_pass_Fourier(this->cvelocity, 3, this->kM); \
-        this->force_divfree(this->cvorticity); \
-        this->symmetrize(this->cvorticity, 3); \
-        return read_result; \
-    } \
-    if ((field == 'u') && (representation == 'r')) \
-        return this->read_base("rvelocity", this->rvelocity); \
-    return EXIT_FAILURE; \
-} \
- \
-template<> \
-int fluid_solver<R>::write(char field, char representation) \
-{ \
-    char fname[512]; \
-    if ((field == 'v') && (representation == 'c')) \
-    { \
-        this->fill_up_filename("cvorticity", fname); \
-        return this->cd->write(fname, (void*)this->cvorticity); \
-    } \
-    if ((field == 'v') && (representation == 'r')) \
-    { \
-        FFTW(execute)(*((FFTW(plan)*)this->c2r_vorticity )); \
-        clip_zero_padding<R>(this->rd, this->rvorticity, 3); \
-        this->fill_up_filename("rvorticity", fname); \
-        return this->rd->write(fname, this->rvorticity); \
-    } \
-    this->compute_velocity(this->cvorticity); \
-    if ((field == 'u') && (representation == 'c')) \
-    { \
-        this->fill_up_filename("cvelocity", fname); \
-        return this->cd->write(fname, this->cvelocity); \
-    } \
-    if ((field == 'u') && (representation == 'r')) \
-    { \
-        this->ift_velocity(); \
-        clip_zero_padding<R>(this->rd, this->rvelocity, 3); \
-        this->fill_up_filename("rvelocity", fname); \
-        return this->rd->write(fname, this->rvelocity); \
-    } \
-    return EXIT_FAILURE; \
-} \
- \
-template<> \
-int fluid_solver<R>::write_rTrS2() \
-{ \
-    char fname[512]; \
-    this->fill_up_filename("rTrS2", fname); \
-    FFTW(complex) *ca; \
-    R *ra; \
-    ca = FFTW(alloc_complex)(this->cd->local_size*3); \
-    ra = (R*)(ca); \
-    this->compute_velocity(this->cvorticity); \
-    this->compute_vector_gradient(ca, this->cvelocity); \
-    for (int cc=0; cc<3; cc++) \
-    { \
-        std::copy( \
-                (R*)(ca + cc*this->cd->local_size), \
-                (R*)(ca + (cc+1)*this->cd->local_size), \
-                (R*)this->cv[1]); \
-        FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \
-        std::copy( \
-                this->rv[1], \
-                this->rv[1] + this->cd->local_size*2, \
-                ra + cc*this->cd->local_size*2); \
-    } \
-    /* velocity gradient is now stored, in real space, in ra */ \
-    R *dx_u, *dy_u, *dz_u; \
-    dx_u = ra; \
-    dy_u = ra + 2*this->cd->local_size; \
-    dz_u = ra + 4*this->cd->local_size; \
-    R *trS2 = FFTW(alloc_real)((this->cd->local_size/3)*2); \
-    double average_local = 0; \
-    RLOOP( \
-            this, \
-            R AxxAxx; \
-            R AyyAyy; \
-            R AzzAzz; \
-            R Sxy; \
-            R Syz; \
-            R Szx; \
-            ptrdiff_t tindex = 3*rindex; \
-            AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; \
-            AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; \
-            AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; \
-            Sxy = dx_u[tindex+1]+dy_u[tindex+0]; \
-            Syz = dy_u[tindex+2]+dz_u[tindex+1]; \
-            Szx = dz_u[tindex+0]+dx_u[tindex+2]; \
-            trS2[rindex] = (AxxAxx + AyyAyy + AzzAzz + \
-                            (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); \
-            average_local += trS2[rindex]; \
-            ); \
-    double average; \
-    MPI_Allreduce( \
-            &average_local, \
-            &average, \
-            1, \
-            MPI_DOUBLE, MPI_SUM, this->cd->comm); \
-    DEBUG_MSG("average TrS2 is %g\n", average); \
-    FFTW(free)(ca); \
-    /* output goes here */ \
-    int ntmp[3]; \
-    ntmp[0] = this->rd->sizes[0]; \
-    ntmp[1] = this->rd->sizes[1]; \
-    ntmp[2] = this->rd->sizes[2]; \
-    field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \
-    clip_zero_padding<R>(scalar_descriptor, trS2, 1); \
-    int return_value = scalar_descriptor->write(fname, trS2); \
-    delete scalar_descriptor; \
-    FFTW(free)(trS2); \
-    return return_value; \
-} \
- \
-template<> \
-int fluid_solver<R>::write_renstrophy() \
-{ \
-    char fname[512]; \
-    this->fill_up_filename("renstrophy", fname); \
-    R *enstrophy = FFTW(alloc_real)((this->cd->local_size/3)*2); \
-    this->ift_vorticity(); \
-    double average_local = 0; \
-    RLOOP( \
-            this, \
-            ptrdiff_t tindex = 3*rindex; \
-            enstrophy[rindex] = ( \
-                this->rvorticity[tindex+0]*this->rvorticity[tindex+0] + \
-                this->rvorticity[tindex+1]*this->rvorticity[tindex+1] + \
-                this->rvorticity[tindex+2]*this->rvorticity[tindex+2] \
-                )/2; \
-            average_local += enstrophy[rindex]; \
-            ); \
-    double average; \
-    MPI_Allreduce( \
-            &average_local, \
-            &average, \
-            1, \
-            MPI_DOUBLE, MPI_SUM, this->cd->comm); \
-    DEBUG_MSG("average enstrophy is %g\n", average); \
-    /* output goes here */ \
-    int ntmp[3]; \
-    ntmp[0] = this->rd->sizes[0]; \
-    ntmp[1] = this->rd->sizes[1]; \
-    ntmp[2] = this->rd->sizes[2]; \
-    field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \
-    clip_zero_padding<R>(scalar_descriptor, enstrophy, 1); \
-    int return_value = scalar_descriptor->write(fname, enstrophy); \
-    delete scalar_descriptor; \
-    FFTW(free)(enstrophy); \
-    return return_value; \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_pressure(FFTW(complex) *pressure) \
-{ \
-    /* assume velocity is already in real space representation */ \
-    ptrdiff_t tindex; \
-    \
-    /* diagonal terms 11 22 33 */\
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc]; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2 && k2 > 0) \
-            { \
-                tindex = 3*cindex; \
-                for (int i=0; i<2; i++) \
-                { \
-                    pressure[cindex][i] = -(this->kx[xindex]*this->kx[xindex]*this->cv[1][tindex+0][i] + \
-                                            this->ky[yindex]*this->ky[yindex]*this->cv[1][tindex+1][i] + \
-                                            this->kz[zindex]*this->kz[zindex]*this->cv[1][tindex+2][i]); \
-                } \
-            } \
-            else \
-                std::fill_n((R*)(pressure+cindex), 2, 0.0); \
-            ); \
-    /* off-diagonal terms 12 23 31 */\
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3]; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2 && k2 > 0) \
-            { \
-                tindex = 3*cindex; \
-                for (int i=0; i<2; i++) \
-                { \
-                    pressure[cindex][i] -= 2*(this->kx[xindex]*this->ky[yindex]*this->cv[1][tindex+0][i] + \
-                                              this->ky[yindex]*this->kz[zindex]*this->cv[1][tindex+1][i] + \
-                                              this->kz[zindex]*this->kx[xindex]*this->cv[1][tindex+2][i]); \
-                    pressure[cindex][i] /= this->normalization_factor*k2; \
-                } \
-            } \
-            ); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_gradient_statistics( \
-        FFTW(complex) *vec, \
-        double *gradu_moments, \
-        double *trS2QR_moments, \
-        ptrdiff_t *gradu_hist, \
-        ptrdiff_t *trS2QR_hist, \
-        ptrdiff_t *QR2D_hist, \
-        double trS2QR_max_estimates[], \
-        double gradu_max_estimates[], \
-        int nbins, \
-        int QR2D_nbins) \
-{ \
-    FFTW(complex) *ca; \
-    R *ra; \
-    ca = FFTW(alloc_complex)(this->cd->local_size*3); \
-    ra = (R*)(ca); \
-    this->compute_vector_gradient(ca, vec); \
-    for (int cc=0; cc<3; cc++) \
-    { \
-        std::copy( \
-                (R*)(ca + cc*this->cd->local_size), \
-                (R*)(ca + (cc+1)*this->cd->local_size), \
-                (R*)this->cv[1]); \
-        FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \
-        std::copy( \
-                this->rv[1], \
-                this->rv[1] + this->cd->local_size*2, \
-                ra + cc*this->cd->local_size*2); \
-    } \
-    /* velocity gradient is now stored, in real space, in ra */ \
-    std::fill_n(this->rv[1], 2*this->cd->local_size, 0.0); \
-    R *dx_u, *dy_u, *dz_u; \
-    dx_u = ra; \
-    dy_u = ra + 2*this->cd->local_size; \
-    dz_u = ra + 4*this->cd->local_size; \
-    double binsize[2]; \
-    double tmp_max_estimate[3]; \
-    tmp_max_estimate[0] = trS2QR_max_estimates[0]; \
-    tmp_max_estimate[1] = trS2QR_max_estimates[1]; \
-    tmp_max_estimate[2] = trS2QR_max_estimates[2]; \
-    binsize[0] = 2*tmp_max_estimate[2] / QR2D_nbins; \
-    binsize[1] = 2*tmp_max_estimate[1] / QR2D_nbins; \
-    ptrdiff_t *local_hist = new ptrdiff_t[QR2D_nbins*QR2D_nbins]; \
-    std::fill_n(local_hist, QR2D_nbins*QR2D_nbins, 0); \
-    RLOOP( \
-            this, \
-            R AxxAxx; \
-            R AyyAyy; \
-            R AzzAzz; \
-            R AxyAyx; \
-            R AyzAzy; \
-            R AzxAxz; \
-            R Sxy; \
-            R Syz; \
-            R Szx; \
-            ptrdiff_t tindex = 3*rindex; \
-            AxxAxx = dx_u[tindex+0]*dx_u[tindex+0]; \
-            AyyAyy = dy_u[tindex+1]*dy_u[tindex+1]; \
-            AzzAzz = dz_u[tindex+2]*dz_u[tindex+2]; \
-            AxyAyx = dx_u[tindex+1]*dy_u[tindex+0]; \
-            AyzAzy = dy_u[tindex+2]*dz_u[tindex+1]; \
-            AzxAxz = dz_u[tindex+0]*dx_u[tindex+2]; \
-            this->rv[1][tindex+1] = - (AxxAxx + AyyAyy + AzzAzz)/2 - AxyAyx - AyzAzy - AzxAxz; \
-            this->rv[1][tindex+2] = - (dx_u[tindex+0]*(AxxAxx/3 + AxyAyx + AzxAxz) + \
-                                       dy_u[tindex+1]*(AyyAyy/3 + AxyAyx + AyzAzy) + \
-                                       dz_u[tindex+2]*(AzzAzz/3 + AzxAxz + AyzAzy) + \
-                                       dx_u[tindex+1]*dy_u[tindex+2]*dz_u[tindex+0] + \
-                                       dx_u[tindex+2]*dy_u[tindex+0]*dz_u[tindex+1]); \
-            int bin0 = int(floor((this->rv[1][tindex+2] + tmp_max_estimate[2]) / binsize[0])); \
-            int bin1 = int(floor((this->rv[1][tindex+1] + tmp_max_estimate[1]) / binsize[1])); \
-            if ((bin0 >= 0 && bin0 < QR2D_nbins) && \
-                (bin1 >= 0 && bin1 < QR2D_nbins)) \
-                local_hist[bin1*QR2D_nbins + bin0]++; \
-            Sxy = dx_u[tindex+1]+dy_u[tindex+0]; \
-            Syz = dy_u[tindex+2]+dz_u[tindex+1]; \
-            Szx = dz_u[tindex+0]+dx_u[tindex+2]; \
-            this->rv[1][tindex] = (AxxAxx + AyyAyy + AzzAzz + \
-                                   (Sxy*Sxy + Syz*Syz + Szx*Szx)/2); \
-            ); \
-    MPI_Allreduce( \
-            local_hist, \
-            QR2D_hist, \
-            QR2D_nbins * QR2D_nbins, \
-            MPI_INT64_T, MPI_SUM, this->cd->comm); \
-    delete[] local_hist; \
-    this->compute_rspace_stats3( \
-            this->rv[1], \
-            trS2QR_moments, \
-            trS2QR_hist, \
-            tmp_max_estimate, \
-            nbins); \
-    double *tmp_moments = new double[10*3]; \
-    ptrdiff_t *tmp_hist = new ptrdiff_t[nbins*3]; \
-    for (int cc=0; cc<3; cc++) \
-    { \
-        tmp_max_estimate[0] = gradu_max_estimates[cc*3 + 0]; \
-        tmp_max_estimate[1] = gradu_max_estimates[cc*3 + 1]; \
-        tmp_max_estimate[2] = gradu_max_estimates[cc*3 + 2]; \
-        this->compute_rspace_stats3( \
-                dx_u + cc*2*this->cd->local_size, \
-                tmp_moments, \
-                tmp_hist, \
-                tmp_max_estimate, \
-                nbins); \
-        for (int n = 0; n < 10; n++) \
-        for (int i = 0; i < 3 ; i++) \
-        { \
-            gradu_moments[(n*3 + cc)*3 + i] = tmp_moments[n*3 + i]; \
-        } \
-        for (int n = 0; n < nbins; n++) \
-        for (int i = 0; i < 3; i++) \
-        { \
-            gradu_hist[(n*3 + cc)*3 + i] = tmp_hist[n*3 + i]; \
-        } \
-    } \
-    delete[] tmp_moments; \
-    delete[] tmp_hist; \
-    FFTW(free)(ca); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_Lagrangian_acceleration(R (*acceleration)[2]) \
-{ \
-    ptrdiff_t tindex; \
-    FFTW(complex) *pressure; \
-    pressure = FFTW(alloc_complex)(this->cd->local_size/3); \
-    this->compute_velocity(this->cvorticity); \
-    this->ift_velocity(); \
-    this->compute_pressure(pressure); \
-    this->compute_velocity(this->cvorticity); \
-    std::fill_n((R*)this->cv[1], 2*this->cd->local_size, 0.0); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                for (int cc=0; cc<3; cc++) \
-                    for (int i=0; i<2; i++) \
-                        this->cv[1][tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; \
-                if (strcmp(this->forcing_type, "linear") == 0) \
-                { \
-                    double knorm = sqrt(k2); \
-                    if ((this->fk0 <= knorm) && \
-                        (this->fk1 >= knorm)) \
-                        for (int c=0; c<3; c++) \
-                            for (int i=0; i<2; i++) \
-                                this->cv[1][tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; \
-                } \
-                this->cv[1][tindex+0][0] += this->kx[xindex]*pressure[cindex][1]; \
-                this->cv[1][tindex+1][0] += this->ky[yindex]*pressure[cindex][1]; \
-                this->cv[1][tindex+2][0] += this->kz[zindex]*pressure[cindex][1]; \
-                this->cv[1][tindex+0][1] -= this->kx[xindex]*pressure[cindex][0]; \
-                this->cv[1][tindex+1][1] -= this->ky[yindex]*pressure[cindex][0]; \
-                this->cv[1][tindex+2][1] -= this->kz[zindex]*pressure[cindex][0]; \
-            } \
-            ); \
-    std::copy( \
-            (R*)this->cv[1], \
-            (R*)(this->cv[1] + this->cd->local_size), \
-            (R*)acceleration); \
-    FFTW(free)(pressure); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_Eulerian_acceleration(FFTW(complex) *acceleration) \
-{ \
-    std::fill_n((R*)(acceleration), 2*this->cd->local_size, 0.0); \
-    ptrdiff_t tindex; \
-    this->compute_velocity(this->cvorticity); \
-    /* put in linear terms */ \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                for (int cc=0; cc<3; cc++) \
-                    for (int i=0; i<2; i++) \
-                        acceleration[tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i]; \
-                if (strcmp(this->forcing_type, "linear") == 0) \
-                { \
-                    double knorm = sqrt(k2); \
-                    if ((this->fk0 <= knorm) && \
-                        (this->fk1 >= knorm)) \
-                    { \
-                        for (int c=0; c<3; c++) \
-                            for (int i=0; i<2; i++) \
-                                acceleration[tindex+c][i] += this->famplitude*this->cu[tindex+c][i]; \
-                    } \
-                } \
-            } \
-            ); \
-    this->ift_velocity(); \
-    /* compute uu */ \
-    /* 11 22 33 */ \
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc] / this->normalization_factor; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                acceleration[tindex+0][0] += \
-                         this->kx[xindex]*this->cv[1][tindex+0][1]; \
-                acceleration[tindex+0][1] += \
-                        -this->kx[xindex]*this->cv[1][tindex+0][0]; \
-                acceleration[tindex+1][0] += \
-                         this->ky[yindex]*this->cv[1][tindex+1][1]; \
-                acceleration[tindex+1][1] += \
-                        -this->ky[yindex]*this->cv[1][tindex+1][0]; \
-                acceleration[tindex+2][0] += \
-                         this->kz[zindex]*this->cv[1][tindex+2][1]; \
-                acceleration[tindex+2][1] += \
-                        -this->kz[zindex]*this->cv[1][tindex+2][0]; \
-            } \
-            ); \
-    /* 12 23 31 */ \
-    RLOOP ( \
-            this, \
-            tindex = 3*rindex; \
-            for (int cc=0; cc<3; cc++) \
-                this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3] / this->normalization_factor; \
-            ); \
-    this->clean_up_real_space(this->rv[1], 3); \
-    FFTW(execute)(*((FFTW(plan)*)this->vr2c[1])); \
-    this->dealias(this->cv[1], 3); \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                acceleration[tindex+0][0] += \
-                        (this->ky[yindex]*this->cv[1][tindex+0][1] + \
-                         this->kz[zindex]*this->cv[1][tindex+2][1]); \
-                acceleration[tindex+0][1] += \
-                      - (this->ky[yindex]*this->cv[1][tindex+0][0] + \
-                         this->kz[zindex]*this->cv[1][tindex+2][0]); \
-                acceleration[tindex+1][0] += \
-                        (this->kz[zindex]*this->cv[1][tindex+1][1] + \
-                         this->kx[xindex]*this->cv[1][tindex+0][1]); \
-                acceleration[tindex+1][1] += \
-                      - (this->kz[zindex]*this->cv[1][tindex+1][0] + \
-                         this->kx[xindex]*this->cv[1][tindex+0][0]); \
-                acceleration[tindex+2][0] += \
-                        (this->kx[xindex]*this->cv[1][tindex+2][1] + \
-                         this->ky[yindex]*this->cv[1][tindex+1][1]); \
-                acceleration[tindex+2][1] += \
-                      - (this->kx[xindex]*this->cv[1][tindex+2][0] + \
-                         this->ky[yindex]*this->cv[1][tindex+1][0]); \
-            } \
-            ); \
-    if (this->cd->myrank == this->cd->rank[0]) \
-        std::fill_n((R*)(acceleration), 6, 0.0); \
-    this->force_divfree(acceleration); \
-} \
- \
-template<> \
-void fluid_solver<R>::compute_Lagrangian_acceleration(R *acceleration) \
-{ \
-    this->compute_Lagrangian_acceleration((FFTW(complex)*)acceleration); \
-    FFTW(execute)(*((FFTW(plan)*)this->vc2r[1])); \
-    std::copy( \
-            this->rv[1], \
-            this->rv[1] + 2*this->cd->local_size, \
-            acceleration); \
-} \
- \
-template<> \
-int fluid_solver<R>::write_rpressure() \
-{ \
-    char fname[512]; \
-    FFTW(complex) *pressure; \
-    pressure = FFTW(alloc_complex)(this->cd->local_size/3); \
-    this->compute_velocity(this->cvorticity); \
-    this->ift_velocity(); \
-    this->compute_pressure(pressure); \
-    this->fill_up_filename("rpressure", fname); \
-    R *rpressure = FFTW(alloc_real)((this->cd->local_size/3)*2); \
-    FFTW(plan) c2r; \
-    c2r = FFTW(mpi_plan_dft_c2r_3d)( \
-            this->rd->sizes[0], this->rd->sizes[1], this->rd->sizes[2], \
-            pressure, rpressure, this->cd->comm, \
-            this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN); \
-    FFTW(execute)(c2r); \
-    /* output goes here */ \
-    int ntmp[3]; \
-    ntmp[0] = this->rd->sizes[0]; \
-    ntmp[1] = this->rd->sizes[1]; \
-    ntmp[2] = this->rd->sizes[2]; \
-    field_descriptor<R> *scalar_descriptor = new field_descriptor<R>(3, ntmp, MPI_RNUM, this->cd->comm); \
-    clip_zero_padding<R>(scalar_descriptor, rpressure, 1); \
-    int return_value = scalar_descriptor->write(fname, rpressure); \
-    delete scalar_descriptor; \
-    FFTW(destroy_plan)(c2r); \
-    FFTW(free)(pressure); \
-    FFTW(free)(rpressure); \
-    return return_value; \
-} \
+template <class rnumber>
+fluid_solver<rnumber>::fluid_solver(
+        const char *NAME,
+        int nx,
+        int ny,
+        int nz,
+        double DKX,
+        double DKY,
+        double DKZ,
+        int DEALIAS_TYPE,
+        unsigned FFTW_PLAN_RIGOR) : fluid_solver_base<rnumber>(
+                                        NAME,
+                                        nx , ny , nz,
+                                        DKX, DKY, DKZ,
+                                        DEALIAS_TYPE,
+                                        FFTW_PLAN_RIGOR)
+{
+    this->cvorticity = fftw_interface<rnumber>::alloc_complex(this->cd->local_size);
+    this->cvelocity  = fftw_interface<rnumber>::alloc_complex(this->cd->local_size);
+    this->rvorticity = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2);
+    /*this->rvelocity  = (rnumber*)(this->cvelocity);*/
+    this->rvelocity  = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2);
 
-/*****************************************************************************/
+    this->ru = this->rvelocity;
+    this->cu = this->cvelocity;
 
+    this->rv[0] = this->rvorticity;
+    this->rv[3] = this->rvorticity;
+    this->cv[0] = this->cvorticity;
+    this->cv[3] = this->cvorticity;
 
+    this->cv[1] = fftw_interface<rnumber>::alloc_complex(this->cd->local_size);
+    this->cv[2] = this->cv[1];
+    this->rv[1] = fftw_interface<rnumber>::alloc_real(this->cd->local_size*2);
+    this->rv[2] = this->rv[1];
+
+    this->c2r_vorticity = new typename fftw_interface<rnumber>::plan;
+    this->r2c_vorticity = new typename fftw_interface<rnumber>::plan;
+    this->c2r_velocity  = new typename fftw_interface<rnumber>::plan;
+    this->r2c_velocity  = new typename fftw_interface<rnumber>::plan;
+
+    ptrdiff_t sizes[] = {nz,
+                         ny,
+                         nx};
+
+    *this->c2r_vorticity = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cvorticity, this->rvorticity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->r2c_vorticity = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rvorticity, this->cvorticity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    *this->c2r_velocity = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cvelocity, this->rvelocity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->r2c_velocity = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rvelocity, this->cvelocity,
+                MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    this->uc2r = this->c2r_velocity;
+    this->ur2c = this->r2c_velocity;
+    this->vc2r[0] = this->c2r_vorticity;
+    this->vr2c[0] = this->r2c_vorticity;
+
+    this->vc2r[1] = new typename fftw_interface<rnumber>::plan;
+    this->vr2c[1] = new typename fftw_interface<rnumber>::plan;
+    this->vc2r[2] = new typename fftw_interface<rnumber>::plan;
+    this->vr2c[2] = new typename fftw_interface<rnumber>::plan;
+
+    *(this->vc2r[1]) = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cv[1], this->rv[1],
+            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->vc2r[2] = fftw_interface<rnumber>::mpi_plan_many_dft_c2r(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->cv[2], this->rv[2],
+            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+
+    *this->vr2c[1] = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rv[1], this->cv[1],
+            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    *this->vr2c[2] = fftw_interface<rnumber>::mpi_plan_many_dft_r2c(
+                3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK,
+                this->rv[2], this->cv[2],
+            MPI_COMM_WORLD, this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_OUT);
+
+    /* ``physical'' parameters etc, initialized here just in case */
+
+    this->nu = 0.1;
+    this->fmode = 1;
+    this->famplitude = 1.0;
+    this->fk0  = 0;
+    this->fk1 = 3.0;
+    /* initialization of fields must be done AFTER planning */
+    std::fill_n((rnumber*)this->cvorticity, this->cd->local_size*2, 0.0);
+    std::fill_n((rnumber*)this->cvelocity, this->cd->local_size*2, 0.0);
+    std::fill_n(this->rvelocity, this->cd->local_size*2, 0.0);
+    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0);
+    std::fill_n((rnumber*)this->cv[1], this->cd->local_size*2, 0.0);
+    std::fill_n(this->rv[1], this->cd->local_size*2, 0.0);
+    std::fill_n(this->rv[2], this->cd->local_size*2, 0.0);
+}
+
+template <class rnumber>
+fluid_solver<rnumber>::~fluid_solver()
+{
+    fftw_interface<rnumber>::destroy_plan(*this->c2r_vorticity);
+    fftw_interface<rnumber>::destroy_plan(*this->r2c_vorticity);
+    fftw_interface<rnumber>::destroy_plan(*this->c2r_velocity );
+    fftw_interface<rnumber>::destroy_plan(*this->r2c_velocity );
+    fftw_interface<rnumber>::destroy_plan(*this->vc2r[1]);
+    fftw_interface<rnumber>::destroy_plan(*this->vr2c[1]);
+    fftw_interface<rnumber>::destroy_plan(*this->vc2r[2]);
+    fftw_interface<rnumber>::destroy_plan(*this->vr2c[2]);
+
+    delete this->c2r_vorticity;
+    delete this->r2c_vorticity;
+    delete this->c2r_velocity ;
+    delete this->r2c_velocity ;
+    delete this->vc2r[1];
+    delete this->vr2c[1];
+    delete this->vc2r[2];
+    delete this->vr2c[2];
+
+    fftw_interface<rnumber>::free(this->cv[1]);
+    fftw_interface<rnumber>::free(this->rv[1]);
+    fftw_interface<rnumber>::free(this->cvorticity);
+    fftw_interface<rnumber>::free(this->rvorticity);
+    fftw_interface<rnumber>::free(this->cvelocity);
+    fftw_interface<rnumber>::free(this->rvelocity);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_vorticity()
+{
+    ptrdiff_t tindex;
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        tindex = 3*cindex;
+        if (k2 <= this->kM2)
+        {
+            this->cvorticity[tindex+0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]);
+            this->cvorticity[tindex+1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]);
+            this->cvorticity[tindex+2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]);
+            this->cvorticity[tindex+0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]);
+            this->cvorticity[tindex+1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]);
+            this->cvorticity[tindex+2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]);
+        }
+        else
+            std::fill_n((rnumber*)(this->cvorticity+tindex), 6, 0.0);
+    }
+    );
+    this->symmetrize(this->cvorticity, 3);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_velocity(rnumber (*__restrict__ vorticity)[2])
+{
+    ptrdiff_t tindex;
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        tindex = 3*cindex;
+        if (k2 <= this->kM2 && k2 > 0)
+        {
+            this->cu[tindex+0][0] = -(this->ky[yindex]*vorticity[tindex+2][1] - this->kz[zindex]*vorticity[tindex+1][1]) / k2;
+            this->cu[tindex+1][0] = -(this->kz[zindex]*vorticity[tindex+0][1] - this->kx[xindex]*vorticity[tindex+2][1]) / k2;
+            this->cu[tindex+2][0] = -(this->kx[xindex]*vorticity[tindex+1][1] - this->ky[yindex]*vorticity[tindex+0][1]) / k2;
+            this->cu[tindex+0][1] =  (this->ky[yindex]*vorticity[tindex+2][0] - this->kz[zindex]*vorticity[tindex+1][0]) / k2;
+            this->cu[tindex+1][1] =  (this->kz[zindex]*vorticity[tindex+0][0] - this->kx[xindex]*vorticity[tindex+2][0]) / k2;
+            this->cu[tindex+2][1] =  (this->kx[xindex]*vorticity[tindex+1][0] - this->ky[yindex]*vorticity[tindex+0][0]) / k2;
+        }
+        else
+            std::fill_n((rnumber*)(this->cu+tindex), 6, 0.0);
+    }
+    );
+    /*this->symmetrize(this->cu, 3);*/
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::ift_velocity()
+{
+    fftw_interface<rnumber>::execute(*(this->c2r_velocity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::ift_vorticity()
+{
+    std::fill_n(this->rvorticity, this->cd->local_size*2, 0.0);
+    fftw_interface<rnumber>::execute(*(this->c2r_vorticity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::dft_velocity()
+{
+    fftw_interface<rnumber>::execute(*(this->r2c_velocity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::dft_vorticity()
+{
+    std::fill_n((rnumber*)this->cvorticity, this->cd->local_size*2, 0.0);
+    fftw_interface<rnumber>::execute(*(this->r2c_vorticity ));
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::add_forcing(
+        rnumber (*__restrict__ acc_field)[2], rnumber (*__restrict__ vort_field)[2], rnumber factor)
+{
+    if (strcmp(this->forcing_type, "none") == 0)
+        return;
+    if (strcmp(this->forcing_type, "Kolmogorov") == 0)
+    {
+        ptrdiff_t cindex;
+        if (this->cd->myrank == this->cd->rank[this->fmode])
+        {
+            cindex = ((this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3;
+            acc_field[cindex+2][0] -= this->famplitude*factor/2;
+        }
+        if (this->cd->myrank == this->cd->rank[this->cd->sizes[0] - this->fmode])
+        {
+            cindex = ((this->cd->sizes[0] - this->fmode - this->cd->starts[0]) * this->cd->sizes[1])*this->cd->sizes[2]*3;
+            acc_field[cindex+2][0] -= this->famplitude*factor/2;
+        }
+        return;
+    }
+    if (strcmp(this->forcing_type, "linear") == 0)
+    {
+        double knorm;
+        CLOOP(
+                    this,
+                    [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+            knorm = sqrt(this->kx[xindex]*this->kx[xindex] +
+                         this->ky[yindex]*this->ky[yindex] +
+                         this->kz[zindex]*this->kz[zindex]);
+            if ((this->fk0 <= knorm) &&
+                    (this->fk1 >= knorm))
+                for (int c=0; c<3; c++)
+                    for (int i=0; i<2; i++)
+                        acc_field[cindex*3+c][i] += this->famplitude*vort_field[cindex*3+c][i]*factor;
+        }
+        );
+        return;
+    }
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::omega_nonlin(
+        int src)
+{
+    assert(src >= 0 && src < 3);
+    this->compute_velocity(this->cv[src]);
+    /* get fields from Fourier space to real space */
+    fftw_interface<rnumber>::execute(*(this->c2r_velocity ));
+    fftw_interface<rnumber>::execute(*(this->vc2r[src]));
+    /* compute cross product $u \times \omega$, and normalize */
+    rnumber tmp[3][2];
+    ptrdiff_t tindex;
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            tmp[cc][0] = (this->ru[tindex+(cc+1)%3]*this->rv[src][tindex+(cc+2)%3] -
+                    this->ru[tindex+(cc+2)%3]*this->rv[src][tindex+(cc+1)%3]);
+        for (int cc=0; cc<3; cc++)
+            this->ru[(3*rindex)+cc] = tmp[cc][0] / this->normalization_factor;
+    }
+    );
+    /* go back to Fourier space */
+    this->clean_up_real_space(this->ru, 3);
+    fftw_interface<rnumber>::execute(*(this->r2c_velocity ));
+    this->dealias(this->cu, 3);
+    /* $\imath k \times Fourier(u \times \omega)$ */
+    CLOOP(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        tindex = 3*cindex;
+        {
+            tmp[0][0] = -(this->ky[yindex]*this->cu[tindex+2][1] - this->kz[zindex]*this->cu[tindex+1][1]);
+            tmp[1][0] = -(this->kz[zindex]*this->cu[tindex+0][1] - this->kx[xindex]*this->cu[tindex+2][1]);
+            tmp[2][0] = -(this->kx[xindex]*this->cu[tindex+1][1] - this->ky[yindex]*this->cu[tindex+0][1]);
+            tmp[0][1] =  (this->ky[yindex]*this->cu[tindex+2][0] - this->kz[zindex]*this->cu[tindex+1][0]);
+            tmp[1][1] =  (this->kz[zindex]*this->cu[tindex+0][0] - this->kx[xindex]*this->cu[tindex+2][0]);
+            tmp[2][1] =  (this->kx[xindex]*this->cu[tindex+1][0] - this->ky[yindex]*this->cu[tindex+0][0]);
+        }
+        for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+            this->cu[tindex+cc][i] = tmp[cc][i];
+    }
+    );
+    this->add_forcing(this->cu, this->cv[src], 1.0);
+    this->force_divfree(this->cu);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::step(double dt)
+{
+    double factor0, factor1;
+    std::fill_n((rnumber*)this->cv[1], this->cd->local_size*2, 0.0);
+    this->omega_nonlin(0);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            factor0 = exp(-this->nu * k2 * dt);
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->cv[1][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i] +
+                    dt*this->cu[3*cindex+cc][i])*factor0;
+        }
+    }
+    );
+
+    this->omega_nonlin(1);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            factor0 = exp(-this->nu * k2 * dt/2);
+            factor1 = exp( this->nu * k2 * dt/2);
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->cv[2][3*cindex+cc][i] = (3*this->cv[0][3*cindex+cc][i]*factor0 +
+                    (this->cv[1][3*cindex+cc][i] +
+                    dt*this->cu[3*cindex+cc][i])*factor1)*0.25;
+        }
+    }
+    );
+
+    this->omega_nonlin(2);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            factor0 = exp(-this->nu * k2 * dt * 0.5);
+            for (int cc=0; cc<3; cc++) for (int i=0; i<2; i++)
+                this->cv[3][3*cindex+cc][i] = (this->cv[0][3*cindex+cc][i]*factor0 +
+                    2*(this->cv[2][3*cindex+cc][i] +
+                    dt*this->cu[3*cindex+cc][i]))*factor0/3;
+        }
+    }
+    );
+
+    this->force_divfree(this->cvorticity);
+    this->symmetrize(this->cvorticity, 3);
+    this->iteration++;
+}
+
+template <class rnumber>
+int fluid_solver<rnumber>::read(char field, char representation)
+{
+    char fname[512];
+    int read_result;
+    if (field == 'v')
+    {
+        if (representation == 'c')
+        {
+            this->fill_up_filename("cvorticity", fname);
+            read_result = this->cd->read(fname, (void*)this->cvorticity);
+            if (read_result != EXIT_SUCCESS)
+                return read_result;
+        }
+        if (representation == 'r')
+        {
+            read_result = this->read_base("rvorticity", this->rvorticity);
+            if (read_result != EXIT_SUCCESS)
+                return read_result;
+            else
+                fftw_interface<rnumber>::execute(*(this->r2c_vorticity ));
+        }
+        this->low_pass_Fourier(this->cvorticity, 3, this->kM);
+        this->force_divfree(this->cvorticity);
+        this->symmetrize(this->cvorticity, 3);
+        return EXIT_SUCCESS;
+    }
+    if ((field == 'u') && (representation == 'c'))
+    {
+        read_result = this->read_base("cvelocity", this->cvelocity);
+        this->low_pass_Fourier(this->cvelocity, 3, this->kM);
+        this->force_divfree(this->cvorticity);
+        this->symmetrize(this->cvorticity, 3);
+        return read_result;
+    }
+    if ((field == 'u') && (representation == 'r'))
+        return this->read_base("rvelocity", this->rvelocity);
+    return EXIT_FAILURE;
+}
+
+template <class rnumber>
+int fluid_solver<rnumber>::write(char field, char representation)
+{
+    char fname[512];
+    if ((field == 'v') && (representation == 'c'))
+    {
+        this->fill_up_filename("cvorticity", fname);
+        return this->cd->write(fname, (void*)this->cvorticity);
+    }
+    if ((field == 'v') && (representation == 'r'))
+    {
+        fftw_interface<rnumber>::execute(*(this->c2r_vorticity ));
+        clip_zero_padding<rnumber>(this->rd, this->rvorticity, 3);
+        this->fill_up_filename("rvorticity", fname);
+        return this->rd->write(fname, this->rvorticity);
+    }
+    this->compute_velocity(this->cvorticity);
+    if ((field == 'u') && (representation == 'c'))
+    {
+        this->fill_up_filename("cvelocity", fname);
+        return this->cd->write(fname, this->cvelocity);
+    }
+    if ((field == 'u') && (representation == 'r'))
+    {
+        this->ift_velocity();
+        clip_zero_padding<rnumber>(this->rd, this->rvelocity, 3);
+        this->fill_up_filename("rvelocity", fname);
+        return this->rd->write(fname, this->rvelocity);
+    }
+    return EXIT_FAILURE;
+}
+
+template <class rnumber>
+int fluid_solver<rnumber>::write_rTrS2()
+{
+    char fname[512];
+    this->fill_up_filename("rTrS2", fname);
+    typename fftw_interface<rnumber>::complex *ca;
+    rnumber *ra;
+    ca = fftw_interface<rnumber>::alloc_complex(this->cd->local_size*3);
+    ra = (rnumber*)(ca);
+    this->compute_velocity(this->cvorticity);
+    this->compute_vector_gradient(ca, this->cvelocity);
+    for (int cc=0; cc<3; cc++)
+    {
+        std::copy(
+                    (rnumber*)(ca + cc*this->cd->local_size),
+                    (rnumber*)(ca + (cc+1)*this->cd->local_size),
+                    (rnumber*)this->cv[1]);
+        fftw_interface<rnumber>::execute(*(this->vc2r[1]));
+        std::copy(
+                    this->rv[1],
+                this->rv[1] + this->cd->local_size*2,
+                ra + cc*this->cd->local_size*2);
+    }
+    /* velocity gradient is now stored, in real space, in ra */
+    rnumber *dx_u, *dy_u, *dz_u;
+    dx_u = ra;
+    dy_u = ra + 2*this->cd->local_size;
+    dz_u = ra + 4*this->cd->local_size;
+    rnumber *trS2 = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2);
+    double average_local = 0;
+    RLOOP(
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        rnumber AxxAxx;
+        rnumber AyyAyy;
+        rnumber AzzAzz;
+        rnumber Sxy;
+        rnumber Syz;
+        rnumber Szx;
+        ptrdiff_t tindex = 3*rindex;
+        AxxAxx = dx_u[tindex+0]*dx_u[tindex+0];
+        AyyAyy = dy_u[tindex+1]*dy_u[tindex+1];
+        AzzAzz = dz_u[tindex+2]*dz_u[tindex+2];
+        Sxy = dx_u[tindex+1]+dy_u[tindex+0];
+        Syz = dy_u[tindex+2]+dz_u[tindex+1];
+        Szx = dz_u[tindex+0]+dx_u[tindex+2];
+        trS2[rindex] = (AxxAxx + AyyAyy + AzzAzz +
+                        (Sxy*Sxy + Syz*Syz + Szx*Szx)/2);
+        average_local += trS2[rindex];
+    }
+    );
+    double average;
+    MPI_Allreduce(
+                &average_local,
+                &average,
+                1,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
+    DEBUG_MSG("average TrS2 is %g\n", average);
+    fftw_interface<rnumber>::free(ca);
+    /* output goes here */
+    int ntmp[3];
+    ntmp[0] = this->rd->sizes[0];
+    ntmp[1] = this->rd->sizes[1];
+    ntmp[2] = this->rd->sizes[2];
+    field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm);
+    clip_zero_padding<rnumber>(scalar_descriptor, trS2, 1);
+    int return_value = scalar_descriptor->write(fname, trS2);
+    delete scalar_descriptor;
+    fftw_interface<rnumber>::free(trS2);
+    return return_value;
+}
+
+template <class rnumber>
+int fluid_solver<rnumber>::write_renstrophy()
+{
+    char fname[512];
+    this->fill_up_filename("renstrophy", fname);
+    rnumber *enstrophy = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2);
+    this->ift_vorticity();
+    double average_local = 0;
+    RLOOP(
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        ptrdiff_t tindex = 3*rindex;
+        enstrophy[rindex] = (
+                    this->rvorticity[tindex+0]*this->rvorticity[tindex+0] +
+                this->rvorticity[tindex+1]*this->rvorticity[tindex+1] +
+                this->rvorticity[tindex+2]*this->rvorticity[tindex+2]
+                )/2;
+        average_local += enstrophy[rindex];
+    }
+    );
+    double average;
+    MPI_Allreduce(
+                &average_local,
+                &average,
+                1,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
+    DEBUG_MSG("average enstrophy is %g\n", average);
+    /* output goes here */
+    int ntmp[3];
+    ntmp[0] = this->rd->sizes[0];
+    ntmp[1] = this->rd->sizes[1];
+    ntmp[2] = this->rd->sizes[2];
+    field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm);
+    clip_zero_padding<rnumber>(scalar_descriptor, enstrophy, 1);
+    int return_value = scalar_descriptor->write(fname, enstrophy);
+    delete scalar_descriptor;
+    fftw_interface<rnumber>::free(enstrophy);
+    return return_value;
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_pressure(rnumber (*__restrict__ pressure)[2])
+{
+    /* assume velocity is already in real space representation */
+    ptrdiff_t tindex;
+
+    /* diagonal terms 11 22 33 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc];
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2 && k2 > 0)
+        {
+            tindex = 3*cindex;
+            for (int i=0; i<2; i++)
+            {
+                pressure[cindex][i] = -(this->kx[xindex]*this->kx[xindex]*this->cv[1][tindex+0][i] +
+                        this->ky[yindex]*this->ky[yindex]*this->cv[1][tindex+1][i] +
+                        this->kz[zindex]*this->kz[zindex]*this->cv[1][tindex+2][i]);
+            }
+        }
+        else
+            std::fill_n((rnumber*)(pressure+cindex), 2, 0.0);
+    }
+    );
+    /* off-diagonal terms 12 23 31 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3];
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2 && k2 > 0)
+        {
+            tindex = 3*cindex;
+            for (int i=0; i<2; i++)
+            {
+                pressure[cindex][i] -= 2*(this->kx[xindex]*this->ky[yindex]*this->cv[1][tindex+0][i] +
+                        this->ky[yindex]*this->kz[zindex]*this->cv[1][tindex+1][i] +
+                        this->kz[zindex]*this->kx[xindex]*this->cv[1][tindex+2][i]);
+                pressure[cindex][i] /= this->normalization_factor*k2;
+            }
+        }
+    }
+    );
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_gradient_statistics(
+        rnumber (*__restrict__ vec)[2],
+double *gradu_moments,
+double *trS2QR_moments,
+ptrdiff_t *gradu_hist,
+ptrdiff_t *trS2QR_hist,
+ptrdiff_t *QR2D_hist,
+double trS2QR_max_estimates[],
+double gradu_max_estimates[],
+int nbins,
+int QR2D_nbins)
+{
+    typename fftw_interface<rnumber>::complex *ca;
+    rnumber *ra;
+    ca = fftw_interface<rnumber>::alloc_complex(this->cd->local_size*3);
+    ra = (rnumber*)(ca);
+    this->compute_vector_gradient(ca, vec);
+    for (int cc=0; cc<3; cc++)
+    {
+        std::copy(
+                    (rnumber*)(ca + cc*this->cd->local_size),
+                    (rnumber*)(ca + (cc+1)*this->cd->local_size),
+                    (rnumber*)this->cv[1]);
+        fftw_interface<rnumber>::execute(*(this->vc2r[1]));
+        std::copy(
+                    this->rv[1],
+                this->rv[1] + this->cd->local_size*2,
+                ra + cc*this->cd->local_size*2);
+    }
+    /* velocity gradient is now stored, in real space, in ra */
+    std::fill_n(this->rv[1], 2*this->cd->local_size, 0.0);
+    rnumber *dx_u, *dy_u, *dz_u;
+    dx_u = ra;
+    dy_u = ra + 2*this->cd->local_size;
+    dz_u = ra + 4*this->cd->local_size;
+    double binsize[2];
+    double tmp_max_estimate[3];
+    tmp_max_estimate[0] = trS2QR_max_estimates[0];
+    tmp_max_estimate[1] = trS2QR_max_estimates[1];
+    tmp_max_estimate[2] = trS2QR_max_estimates[2];
+    binsize[0] = 2*tmp_max_estimate[2] / QR2D_nbins;
+    binsize[1] = 2*tmp_max_estimate[1] / QR2D_nbins;
+    ptrdiff_t *local_hist = new ptrdiff_t[QR2D_nbins*QR2D_nbins];
+    std::fill_n(local_hist, QR2D_nbins*QR2D_nbins, 0);
+    RLOOP(
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        rnumber AxxAxx;
+        rnumber AyyAyy;
+        rnumber AzzAzz;
+        rnumber AxyAyx;
+        rnumber AyzAzy;
+        rnumber AzxAxz;
+        rnumber Sxy;
+        rnumber Syz;
+        rnumber Szx;
+        ptrdiff_t tindex = 3*rindex;
+        AxxAxx = dx_u[tindex+0]*dx_u[tindex+0];
+        AyyAyy = dy_u[tindex+1]*dy_u[tindex+1];
+        AzzAzz = dz_u[tindex+2]*dz_u[tindex+2];
+        AxyAyx = dx_u[tindex+1]*dy_u[tindex+0];
+        AyzAzy = dy_u[tindex+2]*dz_u[tindex+1];
+        AzxAxz = dz_u[tindex+0]*dx_u[tindex+2];
+        this->rv[1][tindex+1] = - (AxxAxx + AyyAyy + AzzAzz)/2 - AxyAyx - AyzAzy - AzxAxz;
+        this->rv[1][tindex+2] = - (dx_u[tindex+0]*(AxxAxx/3 + AxyAyx + AzxAxz) +
+                dy_u[tindex+1]*(AyyAyy/3 + AxyAyx + AyzAzy) +
+                dz_u[tindex+2]*(AzzAzz/3 + AzxAxz + AyzAzy) +
+                dx_u[tindex+1]*dy_u[tindex+2]*dz_u[tindex+0] +
+                dx_u[tindex+2]*dy_u[tindex+0]*dz_u[tindex+1]);
+        int bin0 = int(floor((this->rv[1][tindex+2] + tmp_max_estimate[2]) / binsize[0]));
+        int bin1 = int(floor((this->rv[1][tindex+1] + tmp_max_estimate[1]) / binsize[1]));
+        if ((bin0 >= 0 && bin0 < QR2D_nbins) &&
+                (bin1 >= 0 && bin1 < QR2D_nbins))
+            local_hist[bin1*QR2D_nbins + bin0]++;
+        Sxy = dx_u[tindex+1]+dy_u[tindex+0];
+        Syz = dy_u[tindex+2]+dz_u[tindex+1];
+        Szx = dz_u[tindex+0]+dx_u[tindex+2];
+        this->rv[1][tindex] = (AxxAxx + AyyAyy + AzzAzz +
+                               (Sxy*Sxy + Syz*Syz + Szx*Szx)/2);
+    }
+    );
+    MPI_Allreduce(
+                local_hist,
+                QR2D_hist,
+                QR2D_nbins * QR2D_nbins,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
+    delete[] local_hist;
+    this->compute_rspace_stats3(
+                this->rv[1],
+            trS2QR_moments,
+            trS2QR_hist,
+            tmp_max_estimate,
+            nbins);
+    double *tmp_moments = new double[10*3];
+    ptrdiff_t *tmp_hist = new ptrdiff_t[nbins*3];
+    for (int cc=0; cc<3; cc++)
+    {
+        tmp_max_estimate[0] = gradu_max_estimates[cc*3 + 0];
+        tmp_max_estimate[1] = gradu_max_estimates[cc*3 + 1];
+        tmp_max_estimate[2] = gradu_max_estimates[cc*3 + 2];
+        this->compute_rspace_stats3(
+                    dx_u + cc*2*this->cd->local_size,
+                    tmp_moments,
+                    tmp_hist,
+                    tmp_max_estimate,
+                    nbins);
+        for (int n = 0; n < 10; n++)
+            for (int i = 0; i < 3 ; i++)
+            {
+                gradu_moments[(n*3 + cc)*3 + i] = tmp_moments[n*3 + i];
+            }
+        for (int n = 0; n < nbins; n++)
+            for (int i = 0; i < 3; i++)
+            {
+                gradu_hist[(n*3 + cc)*3 + i] = tmp_hist[n*3 + i];
+            }
+    }
+    delete[] tmp_moments;
+    delete[] tmp_hist;
+    fftw_interface<rnumber>::free(ca);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_Lagrangian_acceleration(rnumber (*acceleration)[2])
+{
+    ptrdiff_t tindex;
+    typename fftw_interface<rnumber>::complex *pressure;
+    pressure = fftw_interface<rnumber>::alloc_complex(this->cd->local_size/3);
+    this->compute_velocity(this->cvorticity);
+    this->ift_velocity();
+    this->compute_pressure(pressure);
+    this->compute_velocity(this->cvorticity);
+    std::fill_n((rnumber*)this->cv[1], 2*this->cd->local_size, 0.0);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    this->cv[1][tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i];
+            if (strcmp(this->forcing_type, "linear") == 0)
+            {
+                double knorm = sqrt(k2);
+                if ((this->fk0 <= knorm) &&
+                        (this->fk1 >= knorm))
+                    for (int c=0; c<3; c++)
+                        for (int i=0; i<2; i++)
+                            this->cv[1][tindex+c][i] += this->famplitude*this->cu[tindex+c][i];
+            }
+            this->cv[1][tindex+0][0] += this->kx[xindex]*pressure[cindex][1];
+            this->cv[1][tindex+1][0] += this->ky[yindex]*pressure[cindex][1];
+            this->cv[1][tindex+2][0] += this->kz[zindex]*pressure[cindex][1];
+            this->cv[1][tindex+0][1] -= this->kx[xindex]*pressure[cindex][0];
+            this->cv[1][tindex+1][1] -= this->ky[yindex]*pressure[cindex][0];
+            this->cv[1][tindex+2][1] -= this->kz[zindex]*pressure[cindex][0];
+        }
+    }
+    );
+    std::copy(
+                (rnumber*)this->cv[1],
+            (rnumber*)(this->cv[1] + this->cd->local_size),
+            (rnumber*)acceleration);
+    fftw_interface<rnumber>::free(pressure);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_Eulerian_acceleration(rnumber (*__restrict__ acceleration)[2])
+{
+    std::fill_n((rnumber*)(acceleration), 2*this->cd->local_size, 0.0);
+    ptrdiff_t tindex;
+    this->compute_velocity(this->cvorticity);
+    /* put in linear terms */
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+                for (int i=0; i<2; i++)
+                    acceleration[tindex+cc][i] = - this->nu*k2*this->cu[tindex+cc][i];
+            if (strcmp(this->forcing_type, "linear") == 0)
+            {
+                double knorm = sqrt(k2);
+                if ((this->fk0 <= knorm) &&
+                        (this->fk1 >= knorm))
+                {
+                    for (int c=0; c<3; c++)
+                        for (int i=0; i<2; i++)
+                            acceleration[tindex+c][i] += this->famplitude*this->cu[tindex+c][i];
+                }
+            }
+        }
+    }
+    );
+    this->ift_velocity();
+    /* compute uu */
+    /* 11 22 33 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+cc] / this->normalization_factor;
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            tindex = 3*cindex;
+            acceleration[tindex+0][0] +=
+                    this->kx[xindex]*this->cv[1][tindex+0][1];
+            acceleration[tindex+0][1] +=
+                    -this->kx[xindex]*this->cv[1][tindex+0][0];
+            acceleration[tindex+1][0] +=
+                    this->ky[yindex]*this->cv[1][tindex+1][1];
+            acceleration[tindex+1][1] +=
+                    -this->ky[yindex]*this->cv[1][tindex+1][0];
+            acceleration[tindex+2][0] +=
+                    this->kz[zindex]*this->cv[1][tindex+2][1];
+            acceleration[tindex+2][1] +=
+                    -this->kz[zindex]*this->cv[1][tindex+2][0];
+        }
+    }
+    );
+    /* 12 23 31 */
+    RLOOP (
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
+        tindex = 3*rindex;
+        for (int cc=0; cc<3; cc++)
+            this->rv[1][tindex+cc] = this->ru[tindex+cc]*this->ru[tindex+(cc+1)%3] / this->normalization_factor;
+    }
+    );
+    this->clean_up_real_space(this->rv[1], 3);
+    fftw_interface<rnumber>::execute(*(this->vr2c[1]));
+    this->dealias(this->cv[1], 3);
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            tindex = 3*cindex;
+            acceleration[tindex+0][0] +=
+                    (this->ky[yindex]*this->cv[1][tindex+0][1] +
+                    this->kz[zindex]*this->cv[1][tindex+2][1]);
+            acceleration[tindex+0][1] +=
+                    - (this->ky[yindex]*this->cv[1][tindex+0][0] +
+                    this->kz[zindex]*this->cv[1][tindex+2][0]);
+            acceleration[tindex+1][0] +=
+                    (this->kz[zindex]*this->cv[1][tindex+1][1] +
+                    this->kx[xindex]*this->cv[1][tindex+0][1]);
+            acceleration[tindex+1][1] +=
+                    - (this->kz[zindex]*this->cv[1][tindex+1][0] +
+                    this->kx[xindex]*this->cv[1][tindex+0][0]);
+            acceleration[tindex+2][0] +=
+                    (this->kx[xindex]*this->cv[1][tindex+2][1] +
+                    this->ky[yindex]*this->cv[1][tindex+1][1]);
+            acceleration[tindex+2][1] +=
+                    - (this->kx[xindex]*this->cv[1][tindex+2][0] +
+                    this->ky[yindex]*this->cv[1][tindex+1][0]);
+        }
+    }
+    );
+    if (this->cd->myrank == this->cd->rank[0])
+        std::fill_n((rnumber*)(acceleration), 6, 0.0);
+    this->force_divfree(acceleration);
+}
+
+template <class rnumber>
+void fluid_solver<rnumber>::compute_Lagrangian_acceleration(rnumber *__restrict__ acceleration)
+{
+    this->compute_Lagrangian_acceleration((typename fftw_interface<rnumber>::complex*)acceleration);
+    fftw_interface<rnumber>::execute(*(this->vc2r[1]));
+    std::copy(
+                this->rv[1],
+            this->rv[1] + 2*this->cd->local_size,
+            acceleration);
+}
+
+template <class rnumber>
+int fluid_solver<rnumber>::write_rpressure()
+{
+    char fname[512];
+    typename fftw_interface<rnumber>::complex *pressure;
+    pressure = fftw_interface<rnumber>::alloc_complex(this->cd->local_size/3);
+    this->compute_velocity(this->cvorticity);
+    this->ift_velocity();
+    this->compute_pressure(pressure);
+    this->fill_up_filename("rpressure", fname);
+    rnumber *rpressure = fftw_interface<rnumber>::alloc_real((this->cd->local_size/3)*2);
+    typename fftw_interface<rnumber>::plan c2r;
+    c2r = fftw_interface<rnumber>::mpi_plan_dft_c2r_3d(
+                this->rd->sizes[0], this->rd->sizes[1], this->rd->sizes[2],
+            pressure, rpressure, this->cd->comm,
+            this->fftw_plan_rigor | FFTW_MPI_TRANSPOSED_IN);
+    fftw_interface<rnumber>::execute(c2r);
+    /* output goes here */
+    int ntmp[3];
+    ntmp[0] = this->rd->sizes[0];
+    ntmp[1] = this->rd->sizes[1];
+    ntmp[2] = this->rd->sizes[2];
+    field_descriptor<rnumber> *scalar_descriptor = new field_descriptor<rnumber>(3, ntmp, mpi_real_type<rnumber>::real(), this->cd->comm);
+    clip_zero_padding<rnumber>(scalar_descriptor, rpressure, 1);
+    int return_value = scalar_descriptor->write(fname, rpressure);
+    delete scalar_descriptor;
+    fftw_interface<rnumber>::destroy_plan(c2r);
+    fftw_interface<rnumber>::free(pressure);
+    fftw_interface<rnumber>::free(rpressure);
+    return return_value;
+}
 
 /*****************************************************************************/
-/* now actually use the macro defined above                                  */
-FLUID_SOLVER_DEFINITIONS(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-FLUID_SOLVER_DEFINITIONS(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
-/*****************************************************************************/
+
 
 
 
diff --git a/bfps/cpp/fluid_solver.hpp b/bfps/cpp/fluid_solver.hpp
index 2b6ec64de12cc133687074c83c71696ffc507509..4cc75cee4385353f64dc9bc9e7d34c6efba9ad48 100644
--- a/bfps/cpp/fluid_solver.hpp
+++ b/bfps/cpp/fluid_solver.hpp
@@ -55,12 +55,12 @@ class fluid_solver:public fluid_solver_base<rnumber>
         typename fluid_solver_base<rnumber>::cnumber *cu, *cv[4];
 
         /* plans */
-        void *c2r_vorticity;
-        void *r2c_vorticity;
-        void *c2r_velocity;
-        void *r2c_velocity;
-        void *uc2r, *ur2c;
-        void *vr2c[3], *vc2r[3];
+        typename fftw_interface<rnumber>::plan *c2r_vorticity;
+        typename fftw_interface<rnumber>::plan *r2c_vorticity;
+        typename fftw_interface<rnumber>::plan *c2r_velocity;
+        typename fftw_interface<rnumber>::plan *r2c_velocity;
+        typename fftw_interface<rnumber>::plan *uc2r, *ur2c;
+        typename fftw_interface<rnumber>::plan *vr2c[3], *vc2r[3];
 
         /* physical parameters */
         double nu;
diff --git a/bfps/cpp/fluid_solver_base.cpp b/bfps/cpp/fluid_solver_base.cpp
index 2f2aeee9a8ae699b7863c90dcffb550bc905390a..b7ba8680de20f7f1f4a48bdf561ffd35c33f9528 100644
--- a/bfps/cpp/fluid_solver_base.cpp
+++ b/bfps/cpp/fluid_solver_base.cpp
@@ -69,24 +69,26 @@ void fluid_solver_base<rnumber>::cospectrum(cnumber *a, cnumber *b, double *spec
     std::fill_n(cospec_local, this->nshells*9, 0);
     int tmp_int;
     CLOOP_K2_NXMODES(
-            this,
-            if (k2 <= this->kMspec2)
-            {
-                tmp_int = int(sqrt(k2)/this->dk)*9;
-                for (int i=0; i<3; i++)
-                    for (int j=0; j<3; j++)
-                    {
-                        cospec_local[tmp_int+i*3+j] += nxmodes * (
-                        (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
-                        (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
-                    }
-            }
-            );
+                this,
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2, int nxmodes){if (k2 <= this->kMspec2)
+        {
+            tmp_int = int(sqrt(k2)/this->dk)*9;
+            for (int i=0; i<3; i++)
+                for (int j=0; j<3; j++)
+                {
+                    cospec_local[tmp_int+i*3+j] += nxmodes * (
+                                (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
+                            (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
+                }
+        }}
+    );
     MPI_Allreduce(
-            (void*)cospec_local,
-            (void*)spec,
-            this->nshells*9,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                (void*)cospec_local,
+                (void*)spec,
+                this->nshells*9,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     fftw_free(cospec_local);
 }
 
@@ -98,25 +100,27 @@ void fluid_solver_base<rnumber>::cospectrum(cnumber *a, cnumber *b, double *spec
     double factor = 1;
     int tmp_int;
     CLOOP_K2_NXMODES(
-            this,
-            if (k2 <= this->kMspec2)
-            {
-                factor = nxmodes*pow(k2, k2exponent);
-                tmp_int = int(sqrt(k2)/this->dk)*9;
-                for (int i=0; i<3; i++)
-                    for (int j=0; j<3; j++)
-                    {
-                        cospec_local[tmp_int+i*3+j] += factor * (
-                        (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
-                        (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
-                    }
-            }
-            );
+                this,
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2, int nxmodes){if (k2 <= this->kMspec2)
+        {
+            factor = nxmodes*pow(k2, k2exponent);
+            tmp_int = int(sqrt(k2)/this->dk)*9;
+            for (int i=0; i<3; i++)
+                for (int j=0; j<3; j++)
+                {
+                    cospec_local[tmp_int+i*3+j] += factor * (
+                                (*(a + 3*cindex+i))[0] * (*(b + 3*cindex+j))[0] +
+                            (*(a + 3*cindex+i))[1] * (*(b + 3*cindex+j))[1]);
+                }
+        }}
+    );
     MPI_Allreduce(
-            (void*)cospec_local,
-            (void*)spec,
-            this->nshells*9,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                (void*)cospec_local,
+                (void*)spec,
+                this->nshells*9,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     //for (int n=0; n<this->nshells; n++)
     //{
     //    spec[n] *= 12.5663706144*pow(this->kshell[n], 2) / this->nshell[n];
@@ -175,7 +179,8 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
     std::fill_n(local_moments, nmoments*nvals, 0);
     if (nvals == 4) local_moments[3] = max_estimate[3];
     RLOOP(
-        this,
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
         std::fill_n(pow_tmp, nvals, 1.0);
         if (nvals == 4) val_tmp[3] = 0.0;
         for (int i=0; i<3; i++)
@@ -207,27 +212,28 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
         for (int n=1; n < nmoments-1; n++)
             for (int i=0; i<nvals; i++)
                 local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]);
-        );
+    }
+    );
     MPI_Allreduce(
-            (void*)local_moments,
-            (void*)moments,
-            nvals,
-            MPI_DOUBLE, MPI_MIN, this->cd->comm);
+                (void*)local_moments,
+                (void*)moments,
+                nvals,
+                MPI_DOUBLE, MPI_MIN, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + nvals),
-            (void*)(moments+nvals),
-            (nmoments-2)*nvals,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                (void*)(local_moments + nvals),
+                (void*)(moments+nvals),
+                (nmoments-2)*nvals,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + (nmoments-1)*nvals),
-            (void*)(moments+(nmoments-1)*nvals),
-            nvals,
-            MPI_DOUBLE, MPI_MAX, this->cd->comm);
+                (void*)(local_moments + (nmoments-1)*nvals),
+                (void*)(moments+(nmoments-1)*nvals),
+                nvals,
+                MPI_DOUBLE, MPI_MAX, this->cd->comm);
     MPI_Allreduce(
-            (void*)local_hist,
-            (void*)hist,
-            nbins*nvals,
-            MPI_INT64_T, MPI_SUM, this->cd->comm);
+                (void*)local_hist,
+                (void*)hist,
+                nbins*nvals,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
     for (int n=1; n < nmoments-1; n++)
         for (int i=0; i<nvals; i++)
             moments[n*nvals + i] /= this->normalization_factor;
@@ -290,7 +296,8 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
     std::fill_n(local_moments, 10*nvals, 0);
     if (nvals == 4) local_moments[3] = max_estimate[3];
     RLOOP(
-        this,
+                this,
+                [&](ptrdiff_t rindex, ptrdiff_t xindex, ptrdiff_t yindex, ptrdiff_t zindex){
         std::fill_n(pow_tmp, nvals, 1.0);
         if (nvals == 4) val_tmp[3] = 0.0;
         for (int i=0; i<3; i++)
@@ -322,27 +329,28 @@ void fluid_solver_base<rnumber>::compute_rspace_stats(
         for (int n=1; n<9; n++)
             for (int i=0; i<nvals; i++)
                 local_moments[n*nvals + i] += (pow_tmp[i] = val_tmp[i]*pow_tmp[i]);
-        );
+    }
+    );
     MPI_Allreduce(
-            (void*)local_moments,
-            (void*)moments,
-            nvals,
-            MPI_DOUBLE, MPI_MIN, this->cd->comm);
+                (void*)local_moments,
+                (void*)moments,
+                nvals,
+                MPI_DOUBLE, MPI_MIN, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + nvals),
-            (void*)(moments+nvals),
-            8*nvals,
-            MPI_DOUBLE, MPI_SUM, this->cd->comm);
+                (void*)(local_moments + nvals),
+                (void*)(moments+nvals),
+                8*nvals,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
     MPI_Allreduce(
-            (void*)(local_moments + 9*nvals),
-            (void*)(moments+9*nvals),
-            nvals,
-            MPI_DOUBLE, MPI_MAX, this->cd->comm);
+                (void*)(local_moments + 9*nvals),
+                (void*)(moments+9*nvals),
+                nvals,
+                MPI_DOUBLE, MPI_MAX, this->cd->comm);
     MPI_Allreduce(
-            (void*)local_hist,
-            (void*)hist,
-            nbins*nvals,
-            MPI_INT64_T, MPI_SUM, this->cd->comm);
+                (void*)local_hist,
+                (void*)hist,
+                nbins*nvals,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
     for (int n=1; n<9; n++)
         for (int i=0; i<nvals; i++)
             moments[n*nvals + i] /= this->normalization_factor;
@@ -371,362 +379,361 @@ void fluid_solver_base<rnumber>::write_spectrum(const char *fname, cnumber *a, c
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
-#define FLUID_SOLVER_BASE_DEFINITIONS(FFTW, R, MPI_RNUM, MPI_CNUM) \
- \
-template<> \
-fluid_solver_base<R>::fluid_solver_base( \
-        const char *NAME, \
-        int nx, \
-        int ny, \
-        int nz, \
-        double DKX, \
-        double DKY, \
-        double DKZ, \
-        int DEALIAS_TYPE, \
-        unsigned FFTW_PLAN_RIGOR) \
-{ \
-    strncpy(this->name, NAME, 256); \
-    this->name[255] = '\0'; \
-    this->iteration = 0; \
-    this->fftw_plan_rigor = FFTW_PLAN_RIGOR; \
- \
-    int ntmp[4]; \
-    ntmp[0] = nz; \
-    ntmp[1] = ny; \
-    ntmp[2] = nx; \
-    ntmp[3] = 3; \
-    this->rd = new field_descriptor<R>( \
-            4, ntmp, MPI_RNUM, MPI_COMM_WORLD);\
-    this->normalization_factor = (this->rd->full_size/3); \
-    ntmp[0] = ny; \
-    ntmp[1] = nz; \
-    ntmp[2] = nx/2 + 1; \
-    ntmp[3] = 3; \
-    this->cd = new field_descriptor<R>( \
-            4, ntmp, MPI_CNUM, this->rd->comm);\
- \
-    this->dkx = DKX; \
-    this->dky = DKY; \
-    this->dkz = DKZ; \
-    this->kx = new double[this->cd->sizes[2]]; \
-    this->ky = new double[this->cd->subsizes[0]]; \
-    this->kz = new double[this->cd->sizes[1]]; \
-    this->dealias_type = DEALIAS_TYPE; \
-    switch(this->dealias_type) \
-    { \
-        /* HL07 smooth filter */ \
-        case 1: \
-            this->kMx = this->dkx*(int(this->rd->sizes[2] / 2)-1); \
-            this->kMy = this->dky*(int(this->rd->sizes[1] / 2)-1); \
-            this->kMz = this->dkz*(int(this->rd->sizes[0] / 2)-1); \
-            break; \
-        default: \
-            this->kMx = this->dkx*(int(this->rd->sizes[2] / 3)-1); \
-            this->kMy = this->dky*(int(this->rd->sizes[1] / 3)-1); \
-            this->kMz = this->dkz*(int(this->rd->sizes[0] / 3)-1); \
-    } \
-    int i, ii; \
-    for (i = 0; i<this->cd->sizes[2]; i++) \
-        this->kx[i] = i*this->dkx; \
-    for (i = 0; i<this->cd->subsizes[0]; i++) \
-    { \
-        ii = i + this->cd->starts[0]; \
-        if (ii <= this->rd->sizes[1]/2) \
-            this->ky[i] = this->dky*ii; \
-        else \
-            this->ky[i] = this->dky*(ii - this->rd->sizes[1]); \
-    } \
-    for (i = 0; i<this->cd->sizes[1]; i++) \
-    { \
-        if (i <= this->rd->sizes[0]/2) \
-            this->kz[i] = this->dkz*i; \
-        else \
-            this->kz[i] = this->dkz*(i - this->rd->sizes[0]); \
-    } \
-    this->kM = this->kMx; \
-    if (this->kM < this->kMy) this->kM = this->kMy; \
-    if (this->kM < this->kMz) this->kM = this->kMz; \
-    this->kM2 = this->kM * this->kM; \
-    this->kMspec = this->kM; \
-    this->kMspec2 = this->kM2; \
-    this->dk = this->dkx; \
-    if (this->dk > this->dky) this->dk = this->dky; \
-    if (this->dk > this->dkz) this->dk = this->dkz; \
-    this->dk2 = this->dk*this->dk; \
-    DEBUG_MSG( \
-            "kM = %g, kM2 = %g, dk = %g, dk2 = %g\n", \
-            this->kM, this->kM2, this->dk, this->dk2); \
-    /* spectra stuff */ \
-    this->nshells = int(this->kMspec / this->dk) + 2; \
-    DEBUG_MSG( \
-            "kMspec = %g, kMspec2 = %g, nshells = %ld\n", \
-            this->kMspec, this->kMspec2, this->nshells); \
-    this->kshell = new double[this->nshells]; \
-    std::fill_n(this->kshell, this->nshells, 0.0); \
-    this->nshell = new int64_t[this->nshells]; \
-    std::fill_n(this->nshell, this->nshells, 0); \
-    double *kshell_local = new double[this->nshells]; \
-    std::fill_n(kshell_local, this->nshells, 0.0); \
-    int64_t *nshell_local = new int64_t[this->nshells]; \
-    std::fill_n(nshell_local, this->nshells, 0.0); \
-    double knorm; \
-    CLOOP_K2_NXMODES( \
-            this, \
-            if (k2 < this->kM2) \
-            { \
-                knorm = sqrt(k2); \
-                nshell_local[int(knorm/this->dk)] += nxmodes; \
-                kshell_local[int(knorm/this->dk)] += nxmodes*knorm; \
-            } \
-            this->Fourier_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.)); \
-            ); \
-    \
-    MPI_Allreduce( \
-            (void*)(nshell_local), \
-            (void*)(this->nshell), \
-            this->nshells, \
-            MPI_INT64_T, MPI_SUM, this->cd->comm); \
-    MPI_Allreduce( \
-            (void*)(kshell_local), \
-            (void*)(this->kshell), \
-            this->nshells, \
-            MPI_DOUBLE, MPI_SUM, this->cd->comm); \
-    for (unsigned int n=0; n<this->nshells; n++) \
-    { \
-        this->kshell[n] /= this->nshell[n]; \
-    } \
-    delete[] nshell_local; \
-    delete[] kshell_local; \
-} \
- \
-template<> \
-fluid_solver_base<R>::~fluid_solver_base() \
-{ \
-    delete[] this->kshell; \
-    delete[] this->nshell; \
- \
-    delete[] this->kx; \
-    delete[] this->ky; \
-    delete[] this->kz; \
- \
-    delete this->cd; \
-    delete this->rd; \
-} \
- \
-template<> \
-void fluid_solver_base<R>::low_pass_Fourier(FFTW(complex) *a, const int howmany, const double kmax) \
-{ \
-    const double km2 = kmax*kmax; \
-    const int howmany2 = 2*howmany; \
-    /*DEBUG_MSG("entered low_pass_Fourier, kmax=%lg km2=%lg howmany2=%d\n", kmax, km2, howmany2);*/ \
-    CLOOP_K2( \
-            this, \
-            /*DEBUG_MSG("kx=%lg ky=%lg kz=%lg k2=%lg\n", \
-                      this->kx[xindex], \
-                      this->ky[yindex], \
-                      this->kz[zindex], \
-                      k2);*/ \
-            if (k2 >= km2) \
-                std::fill_n((R*)(a + howmany*cindex), howmany2, 0.0); \
-            );\
-} \
- \
-template<> \
-void fluid_solver_base<R>::dealias(FFTW(complex) *a, const int howmany) \
-{ \
-    if (this->dealias_type == 0) \
-        { \
-            this->low_pass_Fourier(a, howmany, this->kM); \
-            return; \
-        } \
-    double tval; \
-    CLOOP_K2( \
-            this, \
-            tval = this->Fourier_filter[int(round(k2/this->dk2))]; \
-            for (int tcounter = 0; tcounter < howmany; tcounter++) \
-            for (int i=0; i<2; i++) \
-                a[howmany*cindex+tcounter][i] *= tval; \
-         ); \
-} \
- \
-template<> \
-void fluid_solver_base<R>::force_divfree(FFTW(complex) *a) \
-{ \
-    FFTW(complex) tval; \
-    CLOOP_K2( \
-            this, \
-            if (k2 > 0) \
-            { \
-                tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) + \
-                           this->ky[yindex]*((*(a + cindex*3+1))[0]) + \
-                           this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2; \
-                tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) + \
-                           this->ky[yindex]*((*(a + cindex*3+1))[1]) + \
-                           this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2; \
-                for (int imag_part=0; imag_part<2; imag_part++) \
-                { \
-                    a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex]; \
-                    a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex]; \
-                    a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex]; \
-                } \
-            } \
-            );\
-    if (this->cd->myrank == this->cd->rank[0]) \
-        std::fill_n((R*)(a), 6, 0.0); \
-} \
- \
-template<> \
-void fluid_solver_base<R>::compute_vector_gradient(FFTW(complex) *A, FFTW(complex) *cvec) \
-{ \
-    ptrdiff_t tindex; \
-    std::fill_n((R*)A, 3*2*this->cd->local_size, 0.0); \
-    FFTW(complex) *dx_u, *dy_u, *dz_u; \
-    dx_u = A; \
-    dy_u = A + this->cd->local_size; \
-    dz_u = A + 2*this->cd->local_size; \
-    CLOOP_K2( \
-            this, \
-            if (k2 <= this->kM2) \
-            { \
-                tindex = 3*cindex; \
-                for (int cc=0; cc<3; cc++) \
-                { \
-                    dx_u[tindex + cc][0] = -this->kx[xindex]*cvec[tindex+cc][1]; \
-                    dx_u[tindex + cc][1] =  this->kx[xindex]*cvec[tindex+cc][0]; \
-                    dy_u[tindex + cc][0] = -this->ky[yindex]*cvec[tindex+cc][1]; \
-                    dy_u[tindex + cc][1] =  this->ky[yindex]*cvec[tindex+cc][0]; \
-                    dz_u[tindex + cc][0] = -this->kz[zindex]*cvec[tindex+cc][1]; \
-                    dz_u[tindex + cc][1] =  this->kz[zindex]*cvec[tindex+cc][0]; \
-                } \
-            } \
-            ); \
-} \
- \
-template<> \
-void fluid_solver_base<R>::symmetrize(FFTW(complex) *data, const int howmany) \
-{ \
-    ptrdiff_t ii, cc; \
-    MPI_Status *mpistatus = new MPI_Status; \
-    if (this->cd->myrank == this->cd->rank[0]) \
-    { \
-        for (cc = 0; cc < howmany; cc++) \
-            data[cc][1] = 0.0; \
-        for (ii = 1; ii < this->cd->sizes[1]/2; ii++) \
-            for (cc = 0; cc < howmany; cc++) { \
-                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[0] = \
-                 (*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[0]; \
-                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[1] = \
-                -(*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[1]; \
-                } \
-    } \
-    FFTW(complex) *buffer; \
-    buffer = FFTW(alloc_complex)(howmany*this->cd->sizes[1]); \
-    ptrdiff_t yy; \
-    /*ptrdiff_t tindex;*/ \
-    int ranksrc, rankdst; \
-    for (yy = 1; yy < this->cd->sizes[0]/2; yy++) { \
-        ranksrc = this->cd->rank[yy]; \
-        rankdst = this->cd->rank[this->cd->sizes[0] - yy]; \
-        if (this->cd->myrank == ranksrc) \
-            for (ii = 0; ii < this->cd->sizes[1]; ii++) \
-                for (cc = 0; cc < howmany; cc++) \
-                    for (int imag_comp=0; imag_comp<2; imag_comp++) \
-                    (*(buffer + howmany*ii+cc))[imag_comp] = \
-                        (*(data + howmany*((yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[imag_comp]; \
-        if (ranksrc != rankdst) \
-        { \
-            if (this->cd->myrank == ranksrc) \
-                MPI_Send((void*)buffer, \
-                         howmany*this->cd->sizes[1], MPI_CNUM, rankdst, yy, \
-                         this->cd->comm); \
-            if (this->cd->myrank == rankdst) \
-                MPI_Recv((void*)buffer, \
-                         howmany*this->cd->sizes[1], MPI_CNUM, ranksrc, yy, \
-                         this->cd->comm, mpistatus); \
-        } \
-        if (this->cd->myrank == rankdst) \
-        { \
-            for (ii = 1; ii < this->cd->sizes[1]; ii++) \
-                for (cc = 0; cc < howmany; cc++) \
-                { \
-                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[0] = \
-                        (*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[0]; \
-                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[1] = \
-                       -(*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[1]; \
-                } \
-            for (cc = 0; cc < howmany; cc++) \
-            { \
-                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[0] =  (*(buffer + cc))[0]; \
-                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[1] = -(*(buffer + cc))[1]; \
-            } \
-        } \
-    } \
-    FFTW(free)(buffer); \
-    delete mpistatus; \
-    /* put asymmetric data to 0 */\
-    /*if (this->cd->myrank == this->cd->rank[this->cd->sizes[0]/2]) \
-    { \
-        tindex = howmany*(this->cd->sizes[0]/2 - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2]; \
-        for (ii = 0; ii < this->cd->sizes[1]; ii++) \
-        { \
-            std::fill_n((R*)(data + tindex), howmany*2*this->cd->sizes[2], 0.0); \
-            tindex += howmany*this->cd->sizes[2]; \
-        } \
-    } \
-    tindex = howmany*(); \
-    std::fill_n((R*)(data + tindex), howmany*2, 0.0);*/ \
-} \
- \
-template<> \
-int fluid_solver_base<R>::read_base(const char *fname, R *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->rd->read(full_name, (void*)data); \
-} \
- \
-template<> \
-int fluid_solver_base<R>::read_base(const char *fname, FFTW(complex) *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->cd->read(full_name, (void*)data); \
-} \
- \
-template<> \
-int fluid_solver_base<R>::write_base(const char *fname, R *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->rd->write(full_name, (void*)data); \
-} \
- \
-template<> \
-int fluid_solver_base<R>::write_base(const char *fname, FFTW(complex) *data) \
-{ \
-    char full_name[512]; \
-    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration); \
-    return this->cd->write(full_name, (void*)data); \
-} \
- \
-/* finally, force generation of code                                         */ \
-template class fluid_solver_base<R>; \
+template <class rnumber>
+fluid_solver_base<rnumber>::fluid_solver_base(
+        const char *NAME,
+        int nx,
+        int ny,
+        int nz,
+        double DKX,
+        double DKY,
+        double DKZ,
+        int DEALIAS_TYPE,
+        unsigned FFTW_PLAN_RIGOR)
+{
+    strncpy(this->name, NAME, 256);
+    this->name[255] = '\0';
+    this->iteration = 0;
+    this->fftw_plan_rigor = FFTW_PLAN_RIGOR;
 
-/*****************************************************************************/
+    int ntmp[4];
+    ntmp[0] = nz;
+    ntmp[1] = ny;
+    ntmp[2] = nx;
+    ntmp[3] = 3;
+    this->rd = new field_descriptor<rnumber>(
+                4, ntmp, mpi_real_type<rnumber>::real(), MPI_COMM_WORLD);
+    this->normalization_factor = (this->rd->full_size/3);
+    ntmp[0] = ny;
+    ntmp[1] = nz;
+    ntmp[2] = nx/2 + 1;
+    ntmp[3] = 3;
+    this->cd = new field_descriptor<rnumber>(
+                4, ntmp, mpi_real_type<rnumber>::complex(), this->rd->comm);
 
+    this->dkx = DKX;
+    this->dky = DKY;
+    this->dkz = DKZ;
+    this->kx = new double[this->cd->sizes[2]];
+    this->ky = new double[this->cd->subsizes[0]];
+    this->kz = new double[this->cd->sizes[1]];
+    this->dealias_type = DEALIAS_TYPE;
+    switch(this->dealias_type)
+    {
+    /* HL07 smooth filter */
+    case 1:
+        this->kMx = this->dkx*(int(this->rd->sizes[2] / 2)-1);
+        this->kMy = this->dky*(int(this->rd->sizes[1] / 2)-1);
+        this->kMz = this->dkz*(int(this->rd->sizes[0] / 2)-1);
+        break;
+    default:
+        this->kMx = this->dkx*(int(this->rd->sizes[2] / 3)-1);
+        this->kMy = this->dky*(int(this->rd->sizes[1] / 3)-1);
+        this->kMz = this->dkz*(int(this->rd->sizes[0] / 3)-1);
+    }
+    int i, ii;
+    for (i = 0; i<this->cd->sizes[2]; i++)
+        this->kx[i] = i*this->dkx;
+    for (i = 0; i<this->cd->subsizes[0]; i++)
+    {
+        ii = i + this->cd->starts[0];
+        if (ii <= this->rd->sizes[1]/2)
+            this->ky[i] = this->dky*ii;
+        else
+            this->ky[i] = this->dky*(ii - this->rd->sizes[1]);
+    }
+    for (i = 0; i<this->cd->sizes[1]; i++)
+    {
+        if (i <= this->rd->sizes[0]/2)
+            this->kz[i] = this->dkz*i;
+        else
+            this->kz[i] = this->dkz*(i - this->rd->sizes[0]);
+    }
+    this->kM = this->kMx;
+    if (this->kM < this->kMy) this->kM = this->kMy;
+    if (this->kM < this->kMz) this->kM = this->kMz;
+    this->kM2 = this->kM * this->kM;
+    this->kMspec = this->kM;
+    this->kMspec2 = this->kM2;
+    this->dk = this->dkx;
+    if (this->dk > this->dky) this->dk = this->dky;
+    if (this->dk > this->dkz) this->dk = this->dkz;
+    this->dk2 = this->dk*this->dk;
+    DEBUG_MSG(
+                "kM = %g, kM2 = %g, dk = %g, dk2 = %g\n",
+                this->kM, this->kM2, this->dk, this->dk2);
+    /* spectra stuff */
+    this->nshells = int(this->kMspec / this->dk) + 2;
+    DEBUG_MSG(
+                "kMspec = %g, kMspec2 = %g, nshells = %ld\n",
+                this->kMspec, this->kMspec2, this->nshells);
+    this->kshell = new double[this->nshells];
+    std::fill_n(this->kshell, this->nshells, 0.0);
+    this->nshell = new int64_t[this->nshells];
+    std::fill_n(this->nshell, this->nshells, 0);
+    double *kshell_local = new double[this->nshells];
+    std::fill_n(kshell_local, this->nshells, 0.0);
+    int64_t *nshell_local = new int64_t[this->nshells];
+    std::fill_n(nshell_local, this->nshells, 0.0);
+    double knorm;
+    CLOOP_K2_NXMODES(
+                this,
 
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2, int nxmodes){if (k2 < this->kM2)
+        {
+            knorm = sqrt(k2);
+            nshell_local[int(knorm/this->dk)] += nxmodes;
+            kshell_local[int(knorm/this->dk)] += nxmodes*knorm;
+        }
+        this->Fourier_filter[int(round(k2 / this->dk2))] = exp(-36.0 * pow(k2/this->kM2, 18.));}
+    );
+
+    MPI_Allreduce(
+                (void*)(nshell_local),
+                (void*)(this->nshell),
+                this->nshells,
+                MPI_INT64_T, MPI_SUM, this->cd->comm);
+    MPI_Allreduce(
+                (void*)(kshell_local),
+                (void*)(this->kshell),
+                this->nshells,
+                MPI_DOUBLE, MPI_SUM, this->cd->comm);
+    for (unsigned int n=0; n<this->nshells; n++)
+    {
+        this->kshell[n] /= this->nshell[n];
+    }
+    delete[] nshell_local;
+    delete[] kshell_local;
+}
+
+template <class rnumber>
+fluid_solver_base<rnumber>::~fluid_solver_base()
+{
+    delete[] this->kshell;
+    delete[] this->nshell;
+
+    delete[] this->kx;
+    delete[] this->ky;
+    delete[] this->kz;
+
+    delete this->cd;
+    delete this->rd;
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::low_pass_Fourier(cnumber *a, const int howmany, const double kmax)
+{
+    const double km2 = kmax*kmax;
+    const int howmany2 = 2*howmany;
+    /*DEBUG_MSG("entered low_pass_Fourier, kmax=%lg km2=%lg howmany2=%d\n", kmax, km2, howmany2);*/
+    CLOOP_K2(
+                this,
+                /*DEBUG_MSG("kx=%lg ky=%lg kz=%lg k2=%lg\n",
+                                  this->kx[xindex],
+                                  this->ky[yindex],
+                                  this->kz[zindex],
+                                  k2);*/
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){
+        if (k2 >= km2)
+            std::fill_n((rnumber*)(a + howmany*cindex), howmany2, 0.0);}
+    );
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::dealias(cnumber *a, const int howmany)
+{
+    if (this->dealias_type == 0)
+    {
+        this->low_pass_Fourier(a, howmany, this->kM);
+        return;
+    }
+    double tval;
+    CLOOP_K2(
+                this,
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){
+        tval = this->Fourier_filter[int(round(k2/this->dk2))];
+        for (int tcounter = 0; tcounter < howmany; tcounter++)
+            for (int i=0; i<2; i++)
+                a[howmany*cindex+tcounter][i] *= tval;
+    }
+    );
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::force_divfree(cnumber *a)
+{
+    cnumber tval;
+    CLOOP_K2(
+                this,
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){if (k2 > 0)
+        {
+            tval[0] = (this->kx[xindex]*((*(a + cindex*3  ))[0]) +
+                    this->ky[yindex]*((*(a + cindex*3+1))[0]) +
+                    this->kz[zindex]*((*(a + cindex*3+2))[0]) ) / k2;
+            tval[1] = (this->kx[xindex]*((*(a + cindex*3  ))[1]) +
+                    this->ky[yindex]*((*(a + cindex*3+1))[1]) +
+                    this->kz[zindex]*((*(a + cindex*3+2))[1]) ) / k2;
+            for (int imag_part=0; imag_part<2; imag_part++)
+            {
+                a[cindex*3  ][imag_part] -= tval[imag_part]*this->kx[xindex];
+                a[cindex*3+1][imag_part] -= tval[imag_part]*this->ky[yindex];
+                a[cindex*3+2][imag_part] -= tval[imag_part]*this->kz[zindex];
+            }
+        }}
+    );
+    if (this->cd->myrank == this->cd->rank[0])
+        std::fill_n((rnumber*)(a), 6, 0.0);
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::compute_vector_gradient(cnumber *A, cnumber *cvec)
+{
+    ptrdiff_t tindex;
+    std::fill_n((rnumber*)A, 3*2*this->cd->local_size, 0.0);
+    cnumber *dx_u, *dy_u, *dz_u;
+    dx_u = A;
+    dy_u = A + this->cd->local_size;
+    dz_u = A + 2*this->cd->local_size;
+    CLOOP_K2(
+                this,
+
+                [&](ptrdiff_t cindex, ptrdiff_t xindex, ptrdiff_t yindex,
+                ptrdiff_t zindex, double k2){
+        if (k2 <= this->kM2)
+        {
+            tindex = 3*cindex;
+            for (int cc=0; cc<3; cc++)
+            {
+                dx_u[tindex + cc][0] = -this->kx[xindex]*cvec[tindex+cc][1];
+                dx_u[tindex + cc][1] =  this->kx[xindex]*cvec[tindex+cc][0];
+                dy_u[tindex + cc][0] = -this->ky[yindex]*cvec[tindex+cc][1];
+                dy_u[tindex + cc][1] =  this->ky[yindex]*cvec[tindex+cc][0];
+                dz_u[tindex + cc][0] = -this->kz[zindex]*cvec[tindex+cc][1];
+                dz_u[tindex + cc][1] =  this->kz[zindex]*cvec[tindex+cc][0];
+            }
+        }}
+    );
+}
+
+template <class rnumber>
+void fluid_solver_base<rnumber>::symmetrize(cnumber *data, const int howmany)
+{
+    ptrdiff_t ii, cc;
+    MPI_Status *mpistatus = new MPI_Status;
+    if (this->cd->myrank == this->cd->rank[0])
+    {
+        for (cc = 0; cc < howmany; cc++)
+            data[cc][1] = 0.0;
+        for (ii = 1; ii < this->cd->sizes[1]/2; ii++)
+            for (cc = 0; cc < howmany; cc++) {
+                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[0] =
+                        (*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[0];
+                ( *(data + cc + howmany*(this->cd->sizes[1] - ii)*this->cd->sizes[2]))[1] =
+                        -(*(data + cc + howmany*(                     ii)*this->cd->sizes[2]))[1];
+            }
+    }
+    cnumber *buffer;
+    buffer = fftw_interface<rnumber>::alloc_complex(howmany*this->cd->sizes[1]);
+    ptrdiff_t yy;
+    /*ptrdiff_t tindex;*/
+    int ranksrc, rankdst;
+    for (yy = 1; yy < this->cd->sizes[0]/2; yy++) {
+        ranksrc = this->cd->rank[yy];
+        rankdst = this->cd->rank[this->cd->sizes[0] - yy];
+        if (this->cd->myrank == ranksrc)
+            for (ii = 0; ii < this->cd->sizes[1]; ii++)
+                for (cc = 0; cc < howmany; cc++)
+                    for (int imag_comp=0; imag_comp<2; imag_comp++)
+                        (*(buffer + howmany*ii+cc))[imag_comp] =
+                            (*(data + howmany*((yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[imag_comp];
+        if (ranksrc != rankdst)
+        {
+            if (this->cd->myrank == ranksrc)
+                MPI_Send((void*)buffer,
+                         howmany*this->cd->sizes[1], mpi_real_type<rnumber>::complex(), rankdst, yy,
+                        this->cd->comm);
+            if (this->cd->myrank == rankdst)
+                MPI_Recv((void*)buffer,
+                         howmany*this->cd->sizes[1], mpi_real_type<rnumber>::complex(), ranksrc, yy,
+                        this->cd->comm, mpistatus);
+        }
+        if (this->cd->myrank == rankdst)
+        {
+            for (ii = 1; ii < this->cd->sizes[1]; ii++)
+                for (cc = 0; cc < howmany; cc++)
+                {
+                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[0] =
+                            (*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[0];
+                    (*(data + howmany*((this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1] + ii)*this->cd->sizes[2] + cc))[1] =
+                            -(*(buffer + howmany*(this->cd->sizes[1]-ii)+cc))[1];
+                }
+            for (cc = 0; cc < howmany; cc++)
+            {
+                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[0] =  (*(buffer + cc))[0];
+                (*((data + cc + howmany*(this->cd->sizes[0] - yy - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2])))[1] = -(*(buffer + cc))[1];
+            }
+        }
+    }
+    fftw_interface<rnumber>::free(buffer);
+    delete mpistatus;
+    /* put asymmetric data to 0 */
+    /*if (this->cd->myrank == this->cd->rank[this->cd->sizes[0]/2])
+    {
+        tindex = howmany*(this->cd->sizes[0]/2 - this->cd->starts[0])*this->cd->sizes[1]*this->cd->sizes[2];
+        for (ii = 0; ii < this->cd->sizes[1]; ii++)
+        {
+            std::fill_n((rnumber*)(data + tindex), howmany*2*this->cd->sizes[2], 0.0);
+            tindex += howmany*this->cd->sizes[2];
+        }
+    }
+    tindex = howmany*();
+    std::fill_n((rnumber*)(data + tindex), howmany*2, 0.0);*/
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::read_base(const char *fname, rnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->rd->read(full_name, (void*)data);
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::read_base(const char *fname, cnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->cd->read(full_name, (void*)data);
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::write_base(const char *fname, rnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->rd->write(full_name, (void*)data);
+}
+
+template <class rnumber>
+int fluid_solver_base<rnumber>::write_base(const char *fname, cnumber *data)
+{
+    char full_name[512];
+    sprintf(full_name, "%s_%s_i%.5x", this->name, fname, this->iteration);
+    return this->cd->write(full_name, (void*)data);
+}
+
+/* finally, force generation of code                                         */
+template class fluid_solver_base<float>;
+template class fluid_solver_base<double>;
 
 /*****************************************************************************/
-/* now actually use the macro defined above                                  */
-FLUID_SOLVER_BASE_DEFINITIONS(
-        FFTW_MANGLE_FLOAT,
-        float,
-        MPI_FLOAT,
-        MPI_COMPLEX)
-FLUID_SOLVER_BASE_DEFINITIONS(
-        FFTW_MANGLE_DOUBLE,
-        double,
-        MPI_DOUBLE,
-        BFPS_MPICXX_DOUBLE_COMPLEX)
-/*****************************************************************************/
+
+
+
 
diff --git a/bfps/cpp/fluid_solver_base.hpp b/bfps/cpp/fluid_solver_base.hpp
index 62deb597b4a6a3f4fc87198099d15778e7a2a255..9989bc19f1d3e24f6141e2097ab71a98dcf9f198 100644
--- a/bfps/cpp/fluid_solver_base.hpp
+++ b/bfps/cpp/fluid_solver_base.hpp
@@ -135,97 +135,99 @@ class fluid_solver_base
 /* macros for loops                                                          */
 
 /* Fourier space loop */
-#define CLOOP(obj, expression) \
- \
-{ \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            expression; \
-            cindex++; \
-        } \
+template <class ObjectType, class FuncType>
+void CLOOP(ObjectType* obj, FuncType expression)
+{
+    ptrdiff_t cindex = 0;
+    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++)
+    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+    for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
+        {
+            expression(cindex, xindex, yindex, zindex);
+            cindex++;
+        }
 }
 
-#define CLOOP_NXMODES(obj, expression) \
- \
-{ \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    { \
-        int nxmodes = 1; \
-        ptrdiff_t xindex = 0; \
-        expression; \
-        cindex++; \
-        nxmodes = 2; \
-    for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            expression; \
-            cindex++; \
-        } \
-    } \
+template <class ObjectType, class FuncType>
+void CLOOP_NXMODES(ObjectType* obj, FuncType expression)
+{
+    ptrdiff_t cindex = 0;
+    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++)
+    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+    {
+        int nxmodes = 1;
+        ptrdiff_t xindex = 0;
+        expression;
+        cindex++;
+        nxmodes = 2;
+    for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+        {
+            expression();
+            cindex++;
+        }
+    }
 }
 
-#define CLOOP_K2(obj, expression) \
- \
-{ \
-    double k2; \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-                  obj->ky[yindex]*obj->ky[yindex] + \
-                  obj->kz[zindex]*obj->kz[zindex]); \
-            expression; \
-            cindex++; \
-        } \
+
+template <class ObjectType, class FuncType>
+void CLOOP_K2(ObjectType* obj, FuncType expression)
+{
+    double k2;
+    ptrdiff_t cindex = 0;
+    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++)
+    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+    for (ptrdiff_t xindex = 0; xindex < obj->cd->subsizes[2]; xindex++)
+        {
+            k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                  obj->ky[yindex]*obj->ky[yindex] +
+                  obj->kz[zindex]*obj->kz[zindex]);
+            expression(cindex, xindex, yindex, zindex, k2);
+            cindex++;
+        }
 }
 
-#define CLOOP_K2_NXMODES(obj, expression) \
- \
-{ \
-    double k2; \
-    ptrdiff_t cindex = 0; \
-    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++) \
-    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++) \
-    { \
-        int nxmodes = 1; \
-        ptrdiff_t xindex = 0; \
-        k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-              obj->ky[yindex]*obj->ky[yindex] + \
-              obj->kz[zindex]*obj->kz[zindex]); \
-        expression; \
-        cindex++; \
-        nxmodes = 2; \
-    for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++) \
-        { \
-            k2 = (obj->kx[xindex]*obj->kx[xindex] + \
-                  obj->ky[yindex]*obj->ky[yindex] + \
-                  obj->kz[zindex]*obj->kz[zindex]); \
-            expression; \
-            cindex++; \
-        } \
-    } \
+
+template <class ObjectType, class FuncType>
+void CLOOP_K2_NXMODES(ObjectType* obj, FuncType expression)
+{
+    double k2;
+    ptrdiff_t cindex = 0;
+    for (ptrdiff_t yindex = 0; yindex < obj->cd->subsizes[0]; yindex++)
+    for (ptrdiff_t zindex = 0; zindex < obj->cd->subsizes[1]; zindex++)
+    {
+        int nxmodes = 1;
+        ptrdiff_t xindex = 0;
+        k2 = (obj->kx[xindex]*obj->kx[xindex] +
+              obj->ky[yindex]*obj->ky[yindex] +
+              obj->kz[zindex]*obj->kz[zindex]);
+        expression(cindex, xindex, yindex, zindex, k2, nxmodes);
+        cindex++;
+        nxmodes = 2;
+    for (xindex = 1; xindex < obj->cd->subsizes[2]; xindex++)
+        {
+            k2 = (obj->kx[xindex]*obj->kx[xindex] +
+                  obj->ky[yindex]*obj->ky[yindex] +
+                  obj->kz[zindex]*obj->kz[zindex]);
+            expression(cindex, xindex, yindex, zindex, k2, nxmodes);
+            cindex++;
+        }
+    }
 }
 
-/* real space loop */
-#define RLOOP(obj, expression) \
- \
-{ \
-    for (int zindex = 0; zindex < obj->rd->subsizes[0]; zindex++) \
-    for (int yindex = 0; yindex < obj->rd->subsizes[1]; yindex++) \
-    { \
-        ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2); \
-    for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++) \
-        { \
-            expression; \
-            rindex++; \
-        } \
-    } \
+
+template <class ObjectType, class FuncType>
+void RLOOP(ObjectType* obj, FuncType expression)
+{
+    for (int zindex = 0; zindex < obj->rd->subsizes[0]; zindex++)
+    for (int yindex = 0; yindex < obj->rd->subsizes[1]; yindex++)
+    {
+        ptrdiff_t rindex = (zindex * obj->rd->subsizes[1] + yindex)*(obj->rd->subsizes[2]+2);
+    for (int xindex = 0; xindex < obj->rd->subsizes[2]; xindex++)
+        {
+            expression(rindex, xindex, yindex, zindex);
+            rindex++;
+        }
+    }
 }
 
 /*****************************************************************************/
diff --git a/setup.py b/setup.py
index 938936cba3330e6bc77d186fa6d15ab463d92364..cfd9fba198807f31df19d7b53ab6e0eada98b60c 100644
--- a/setup.py
+++ b/setup.py
@@ -111,7 +111,8 @@ src_file_list = ['field',
 
 header_list = (['cpp/base.hpp'] +
                ['cpp/' + fname + '.hpp'
-                for fname in src_file_list])
+                for fname in src_file_list] + 
+               ['cpp/fftw_interface.hpp'])
 
 with open('MANIFEST.in', 'w') as manifest_in_file:
     for fname in (['bfps/cpp/' + ff + '.cpp' for ff in src_file_list] +