diff --git a/makefile b/makefile
index b4b1c4c5a909ffa1734fd6427c922593ded70a58..67882e8094e02246fbd97d723b0bd7fcbee8d211 100644
--- a/makefile
+++ b/makefile
@@ -2,7 +2,9 @@ MPICXX  = mpicxx
 LINKER  = mpicxx
 DEFINES =
 CFLAGS  = -Wall \
-		  -O2
+		  -O2 \
+		  #-pg \
+		  #-finstrument-functions
 
 LIBS = -lfftw3_mpi \
 	   -lfftw3 \
diff --git a/src/RMHD_converter.cpp b/src/RMHD_converter.cpp
index 18ef57cd6eda397aec52d839d7d9e93a36c79f92..e4394241ecb5c633b1518eaaa21957fed0acca82 100644
--- a/src/RMHD_converter.cpp
+++ b/src/RMHD_converter.cpp
@@ -57,23 +57,18 @@ RMHD_converter::RMHD_converter(
     this->c3  = (fftwf_complex*)this->r3;
 
     // allocate plans
-    this->complex2real0 = fftwf_mpi_plan_dft_c2r_3d(
-            f3r->sizes[0], f3r->sizes[1], f3r->sizes[2],
+    ptrdiff_t blabla[] = {this->f3r->sizes[0],
+                          this->f3r->sizes[1],
+                          this->f3r->sizes[2]};
+    this->complex2real = fftwf_mpi_plan_many_dft_c2r(
+            3,
+            blabla,
+            2,
+            FFTW_MPI_DEFAULT_BLOCK,
+            FFTW_MPI_DEFAULT_BLOCK,
             this->c3, this->r3,
             MPI_COMM_WORLD,
             FFTW_ESTIMATE);
-    this->complex2real1 = fftwf_mpi_plan_dft_c2r_3d(
-            f3r->sizes[0], f3r->sizes[1], f3r->sizes[2],
-            this->c3 + this->f3c->local_size,
-            this->r3 + 2*this->f3c->local_size,
-            MPI_COMM_WORLD,
-            FFTW_PATIENT);
-
-    // various descriptions for the real data
-    n[0] = N0*2;
-    n[1] = N1;
-    n[2] = N2;
-    this->f4r = new field_descriptor(3, n, MPI_REAL4, MPI_COMM_WORLD);
 
     this->s = new Morton_shuffler(N0, N1, N2, 2, nfiles);
 }
@@ -85,7 +80,6 @@ RMHD_converter::~RMHD_converter()
     delete this->f2c;
     delete this->f3c;
     delete this->f3r;
-    delete this->f4r;
 
     delete this->s;
 
@@ -93,8 +87,7 @@ RMHD_converter::~RMHD_converter()
     fftwf_free(this->c12);
     fftwf_free(this->r3);
 
-    fftwf_destroy_plan(this->complex2real0);
-    fftwf_destroy_plan(this->complex2real1);
+    fftwf_destroy_plan(this->complex2real);
 }
 
 int RMHD_converter::convert(
@@ -109,7 +102,6 @@ int RMHD_converter::convert(
     fftwf_copy_complex_array(
             this->f2c, this->c12,
             this->f3c, this->c3);
-    fftwf_execute(this->complex2real0);
 
     //read second field
     this->f0c->read(ifile1, (void*)this->c0);
@@ -118,13 +110,12 @@ int RMHD_converter::convert(
     fftwf_copy_complex_array(
             this->f2c, this->c12,
             this->f3c, this->c3 + this->f3c->local_size);
-    fftwf_execute(this->complex2real1);
 
-    fftwf_clip_zero_padding(this->f4r, this->r3);
+    this->f3c->interleave(this->c3, 2);
 
-    // mix components
-    this->f3r->interleave(this->r3, 2);
+    fftwf_execute(this->complex2real);
 
+    fftwf_clip_zero_padding(this->f3r, this->r3, 2);
     this->s->shuffle(this->r3, ofile);
     return EXIT_SUCCESS;
 }
diff --git a/src/RMHD_converter.hpp b/src/RMHD_converter.hpp
index 6ce834c6f0f912ebfff24775d3ece70c84081129..c792cdf4ee279a6a570c8d58495a1583c8e16288 100644
--- a/src/RMHD_converter.hpp
+++ b/src/RMHD_converter.hpp
@@ -20,9 +20,6 @@ class RMHD_converter
         field_descriptor *f2c; // descriptor for 3D fully transposed input
         field_descriptor *f3c, *f3r; // descriptors for FFT
 
-        // descriptor for N0*2 x N1 x N2 real space array
-        field_descriptor *f4r;
-
         Morton_shuffler *s;
 
         fftwf_complex *c0 ; // array to store 2D input
@@ -30,8 +27,7 @@ class RMHD_converter
         fftwf_complex *c3 ; // array to store resized Fourier data
         float *r3         ; // array to store real space data
 
-        fftwf_plan complex2real0;
-        fftwf_plan complex2real1;
+        fftwf_plan complex2real;
 
         /* methods */
         RMHD_converter(
diff --git a/src/fftwf_tools.cpp b/src/fftwf_tools.cpp
index 1fb8436c775a1431dcb7be2a37be72bf7e3c35b5..3ea239bc1d7935f4662b5102ce7b16eb29204d4f 100644
--- a/src/fftwf_tools.cpp
+++ b/src/fftwf_tools.cpp
@@ -115,7 +115,8 @@ int fftwf_copy_complex_array(
 
 int fftwf_clip_zero_padding(
         field_descriptor *f,
-        float *a)
+        float *a,
+        int howmany)
 {
     if (f->ndims != 3)
         return EXIT_FAILURE;
@@ -123,9 +124,9 @@ int fftwf_clip_zero_padding(
     for (int i0 = 0; i0 < f->subsizes[0]; i0++)
         for (int i1 = 0; i1 < f->sizes[1]; i1++)
         {
-            std::copy(a, a + f->sizes[2], b);
-            a += f->sizes[2] + 2;
-            b += f->sizes[2];
+            std::copy(a, a + f->sizes[2]*howmany, b);
+            a += f->sizes[2]*howmany + 2*howmany;
+            b += f->sizes[2]*howmany;
         }
     return EXIT_SUCCESS;
 }
diff --git a/src/fftwf_tools.hpp b/src/fftwf_tools.hpp
index 17344e53dfb38b03a8b472b03702410e3eb29a31..f39ed5277d2bada764f6c8b89b17841b121dbf4d 100644
--- a/src/fftwf_tools.hpp
+++ b/src/fftwf_tools.hpp
@@ -20,7 +20,8 @@ int fftwf_copy_complex_array(
 
 int fftwf_clip_zero_padding(
         field_descriptor *f,
-        float *a);
+        float *a,
+        int howmany=1);
 
 /* function to get pair of descriptors for real and Fourier space
  * arrays used with fftw.
diff --git a/src/field_descriptor.cpp b/src/field_descriptor.cpp
index 2c37486bf7d52494dc32f1f4c65497461d73af5d..56dd4346d723dc172566207c2e4c3997722382ea 100644
--- a/src/field_descriptor.cpp
+++ b/src/field_descriptor.cpp
@@ -266,17 +266,29 @@ int field_descriptor::interleave(
 }
 
 int field_descriptor::interleave(
-        fftw_complex *input,
-        fftw_complex *output,
+        fftwf_complex *a,
         int dim)
 {
-    // TODO: implement inplace interleaver
-    for (int k = 0; k < this->local_size; k++)
-        for (int j = 0; j < dim; j++)
-        {
-            output[k*dim + j][0] = input[j*this->local_size + k][0];
-            output[k*dim + j][1] = input[j*this->local_size + k][1];
-        }
+    fftwf_iodim howmany_dims[2];
+    howmany_dims[0].n  = dim;
+    howmany_dims[0].is = this->local_size;
+    howmany_dims[0].os = 1;
+    howmany_dims[1].n  = this->local_size;
+    howmany_dims[1].is = 1;
+    howmany_dims[1].os = dim;
+    const int howmany_rank = sizeof(howmany_dims)/sizeof(howmany_dims[0]);
+
+    fftwf_plan tmp = fftwf_plan_guru_dft(
+            /*rank*/0,
+            /*dims*/NULL,
+            howmany_rank,
+            howmany_dims,
+            a,
+            a,
+            +1,
+            FFTW_ESTIMATE);
+    fftwf_execute(tmp);
+    fftwf_destroy_plan(tmp);
     return EXIT_SUCCESS;
 }
 
diff --git a/src/field_descriptor.hpp b/src/field_descriptor.hpp
index 5166d022949d2ac2e5fcb0836b5bbbb04485a295..c2fb4c5fec4336179cf13743db536319995d2414 100644
--- a/src/field_descriptor.hpp
+++ b/src/field_descriptor.hpp
@@ -61,8 +61,7 @@ class field_descriptor
                 float *input,
                 int dim);
         int interleave(
-                fftw_complex *input,
-                fftw_complex *output,
+                fftwf_complex *input,
                 int dim);
 };