diff --git a/src/fluid_solver.cpp b/src/fluid_solver.cpp
index d10e6960bb72b301ee9b71bc056b6d87c6894e1c..6b64d08b8cf2d2d2985c936bbb56786b6bf8e100 100644
--- a/src/fluid_solver.cpp
+++ b/src/fluid_solver.cpp
@@ -19,12 +19,35 @@
 ************************************************************************/
 
 
-
+#include <cassert>
 #include "fluid_solver.hpp"
 #include "fftw_tools.hpp"
 
 
 
+/*****************************************************************************/
+/* macros for loops                                                          */
+
+/* complex loop */
+#define CLOOP(expression) \
+ \
+{ \
+    int cindex; \
+    for (int yindex = 0; yindex < this->cd->subsizes[0]; yindex++) \
+    for (int zindex = 0; zindex < this->cd->subsizes[1]; zindex++) \
+    { \
+        cindex = (yindex * this->cd->sizes[1] + zindex)*this->cd->sizes[2]; \
+    for (int xindex = 0; xindex < this->cd->subsizes[2]; xindex++) \
+        { \
+            expression; \
+            cindex++; \
+        } \
+    } \
+}
+
+/*****************************************************************************/
+
+
 /*****************************************************************************/
 /* macro for specializations to numeric types compatible with FFTW           */
 
@@ -34,7 +57,10 @@ template<> \
 fluid_solver<R>::fluid_solver( \
         int nx, \
         int ny, \
-        int nz) \
+        int nz, \
+        double DKX, \
+        double DKY, \
+        double DKZ) \
 { \
     int ntmp[4]; \
     ntmp[0] = nz; \
@@ -43,6 +69,8 @@ fluid_solver<R>::fluid_solver( \
     ntmp[3] = 3; \
     this->rd = new field_descriptor<R>( \
             4, ntmp, MPI_RNUM, MPI_COMM_WORLD);\
+    ntmp[0] = ny; \
+    ntmp[1] = nz; \
     ntmp[2] = nx/2 + 1; \
     this->cd = new field_descriptor<R>( \
             4, ntmp, MPI_CNUM, MPI_COMM_WORLD);\
@@ -50,6 +78,19 @@ fluid_solver<R>::fluid_solver( \
     this->cvelocity  = FFTW(alloc_complex)(this->cd->local_size);\
     this->rvorticity = FFTW(alloc_real)(this->cd->local_size*2);\
     this->rvelocity  = FFTW(alloc_real)(this->cd->local_size*2);\
+ \
+    this->ru = this->rvelocity;\
+    this->cu = this->cvelocity;\
+ \
+    this->rv[0] = this->rvorticity;\
+    this->rv[3] = this->rvorticity;\
+    this->cv[0] = this->cvorticity;\
+    this->cv[3] = this->cvorticity;\
+ \
+    this->cv[1] = FFTW(alloc_complex)(this->cd->local_size);\
+    this->cv[2] = FFTW(alloc_complex)(this->cd->local_size);\
+    this->rv[1] = (R*)(this->cv);\
+    this->rv[2] = (R*)(this->cv);\
  \
     this->c2r_vorticity = new FFTW(plan);\
     this->r2c_vorticity = new FFTW(plan);\
@@ -61,51 +102,141 @@ fluid_solver<R>::fluid_solver( \
                          nx};\
  \
     *(FFTW(plan)*)this->c2r_vorticity = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, \
-            FFTW_MPI_DEFAULT_BLOCK, \
-            FFTW_MPI_DEFAULT_BLOCK, \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
             this->cvorticity, this->rvorticity, \
-            MPI_COMM_WORLD, \
-            FFTW_ESTIMATE); \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); \
  \
     *(FFTW(plan)*)this->r2c_vorticity = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, \
-            FFTW_MPI_DEFAULT_BLOCK, \
-            FFTW_MPI_DEFAULT_BLOCK, \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
             this->rvorticity, this->cvorticity, \
-            MPI_COMM_WORLD, \
-            FFTW_ESTIMATE); \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); \
  \
     *(FFTW(plan)*)this->c2r_velocity = FFTW(mpi_plan_many_dft_c2r)( \
-            3, sizes, 3, \
-            FFTW_MPI_DEFAULT_BLOCK, \
-            FFTW_MPI_DEFAULT_BLOCK, \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
             this->cvelocity, this->rvelocity, \
-            MPI_COMM_WORLD, \
-            FFTW_ESTIMATE); \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); \
  \
     *(FFTW(plan)*)this->r2c_velocity = FFTW(mpi_plan_many_dft_r2c)( \
-            3, sizes, 3, \
-            FFTW_MPI_DEFAULT_BLOCK, \
-            FFTW_MPI_DEFAULT_BLOCK, \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
             this->rvelocity, this->cvelocity, \
-            MPI_COMM_WORLD, \
-            FFTW_ESTIMATE); \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); \
+ \
+    this->uc2r = this->c2r_velocity;\
+    this->ur2c = this->r2c_velocity;\
+    this->vc2r[0] = this->c2r_vorticity;\
+    this->vr2c[0] = this->r2c_vorticity;\
+ \
+    this->vc2r[1] = new FFTW(plan);\
+    this->vr2c[1] = new FFTW(plan);\
+    this->vc2r[2] = new FFTW(plan);\
+    this->vr2c[2] = new FFTW(plan);\
+ \
+    *(FFTW(plan)*)this->vc2r[1] = FFTW(mpi_plan_many_dft_c2r)( \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
+            this->cv[1], this->rv[1], \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); \
+ \
+    *(FFTW(plan)*)this->vc2r[2] = FFTW(mpi_plan_many_dft_c2r)( \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
+            this->cv[2], this->rv[2], \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); \
+ \
+    *(FFTW(plan)*)this->vr2c[1] = FFTW(mpi_plan_many_dft_r2c)( \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
+            this->rv[1], this->cv[1], \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); \
+ \
+    *(FFTW(plan)*)this->vr2c[2] = FFTW(mpi_plan_many_dft_r2c)( \
+            3, sizes, 3, FFTW_MPI_DEFAULT_BLOCK, FFTW_MPI_DEFAULT_BLOCK, \
+            this->rv[2], this->cv[2], \
+            MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); \
+ \
+    /* wavenumbers etc */ \
+ \
+    this->dkx = DKX; \
+    this->dky = DKY; \
+    this->dkz = DKZ; \
+    this->kx = new double[this->cd->sizes[2]]; \
+    this->ky = new double[this->cd->subsizes[0]]; \
+    this->kz = new double[this->cd->sizes[1]]; \
+    this->knullx = new bool[this->cd->sizes[2]]; \
+    this->knully = new bool[this->cd->subsizes[0]]; \
+    this->knullz = new bool[this->cd->sizes[1]]; \
+    this->nonzerokx = int(this->rd->sizes[2] / 3); \
+    this->kMx = this->dkx*(this->nonzerokx-1); \
+    this->nonzeroky = int(this->rd->sizes[1] / 3); \
+    this->kMy = this->dky*(this->nonzeroky-1); \
+    this->nonzeroky = 2*this->nonzeroky - 1; \
+    this->nonzerokz = int(this->rd->sizes[0] / 3); \
+    this->kMz = this->dkz*(this->nonzerokz-1); \
+    this->nonzerokz = 2*this->nonzerokz - 1; \
+    int i, ii; \
+    for (i = 0; i<this->cd->sizes[2]; i++) \
+    { \
+        this->kx[i] = i*this->dkx; \
+        if (i < this->nonzerokx) \
+            this->knullx[i] = false; \
+        else \
+            this->knullx[i] = true; \
+    } \
+    for (i = 0; i<this->cd->subsizes[0]; i++) \
+    { \
+        int tval = (this->nonzeroky+1)/2; \
+        ii = i + this->cd->starts[0]; \
+        if (ii <= this->rd->sizes[1]/2) \
+            this->ky[i] = this->dky*ii; \
+        else \
+            this->ky[i] = this->dky*(ii - this->rd->sizes[1]); \
+        if (ii < tval || (this->rd->sizes[1] - ii) < tval) \
+            this->knully[i] = false; \
+        else \
+            this->knully[i] = true; \
+    } \
+    for (i = 0; i<this->cd->sizes[1]; i++) \
+    { \
+        int tval = (this->nonzerokz+1)/2; \
+        if (i <= this->rd->sizes[0]/2) \
+            this->kz[i] = this->dkz*i; \
+        else \
+            this->kz[i] = this->dkz*(i - this->rd->sizes[0]); \
+        if (i < tval || (this->rd->sizes[0] - i) < tval) \
+            this->knullz[i] = false; \
+        else \
+            this->knullz[i] = true; \
+    } \
 } \
  \
 template<> \
 fluid_solver<R>::~fluid_solver() \
 { \
+ \
+    delete this->kx;\
+    delete this->ky;\
+    delete this->kz;\
+    delete this->knullx;\
+    delete this->knully;\
+    delete this->knullz;\
+ \
     FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_vorticity);\
     FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_vorticity);\
     FFTW(destroy_plan)(*(FFTW(plan)*)this->c2r_velocity );\
     FFTW(destroy_plan)(*(FFTW(plan)*)this->r2c_velocity );\
+    FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[1]);\
+    FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[1]);\
+    FFTW(destroy_plan)(*(FFTW(plan)*)this->vc2r[2]);\
+    FFTW(destroy_plan)(*(FFTW(plan)*)this->vr2c[2]);\
  \
     delete (FFTW(plan)*)this->c2r_vorticity;\
     delete (FFTW(plan)*)this->r2c_vorticity;\
     delete (FFTW(plan)*)this->c2r_velocity ;\
     delete (FFTW(plan)*)this->r2c_velocity ;\
+    delete (FFTW(plan)*)this->vc2r[1];\
+    delete (FFTW(plan)*)this->vr2c[1];\
+    delete (FFTW(plan)*)this->vc2r[2];\
+    delete (FFTW(plan)*)this->vr2c[2];\
  \
+    FFTW(free)(this->cv[1]);\
+    FFTW(free)(this->cv[2]);\
     FFTW(free)(this->cvorticity);\
     FFTW(free)(this->rvorticity);\
     FFTW(free)(this->cvelocity);\
@@ -116,8 +247,37 @@ fluid_solver<R>::~fluid_solver() \
 } \
  \
 template<> \
-void fluid_solver<R>::step() \
-{}
+void fluid_solver<R>::omega_nonlin( \
+        int src) \
+{ \
+    assert(src >= 0 && src < 3); \
+    double k2; \
+    /* compute velocity field */ \
+    CLOOP( \
+            k2 = (this->kx[xindex]*this->kx[xindex] + \
+                  this->ky[yindex]*this->ky[yindex] + \
+                  this->kz[zindex]*this->kz[zindex]); \
+            this->cu[cindex*3 + 0][0] = (this->ky[yindex]*this->cv[src][cindex*3+2][1] - this->kz[zindex]*this->cv[src][cindex*3+1][1]) / k2; \
+            this->cu[cindex*3 + 1][0] = (this->kz[zindex]*this->cv[src][cindex*3+0][1] - this->kx[xindex]*this->cv[src][cindex*3+2][1]) / k2; \
+            this->cu[cindex*3 + 2][0] = (this->kx[xindex]*this->cv[src][cindex*3+1][1] - this->ky[yindex]*this->cv[src][cindex*3+0][1]) / k2; \
+            this->cu[cindex*3 + 0][1] = (this->ky[yindex]*this->cv[src][cindex*3+2][0] - this->kz[zindex]*this->cv[src][cindex*3+1][0]) / k2; \
+            this->cu[cindex*3 + 1][1] = (this->kz[zindex]*this->cv[src][cindex*3+0][0] - this->kx[xindex]*this->cv[src][cindex*3+2][0]) / k2; \
+            this->cu[cindex*3 + 2][1] = (this->kx[xindex]*this->cv[src][cindex*3+1][0] - this->ky[yindex]*this->cv[src][cindex*3+0][0]) / k2; \
+            ); \
+    /* $\imath k \times DFT(u \times \omega)$ */ \
+    /* get fields from Fourier space to real space */ \
+    FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity)); \
+    FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \
+    /* compute cross product into vr */ \
+    /* get cross product back into Fourier space */ \
+    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity)); \
+    /* dealiasing */ \
+} \
+ \
+template<> \
+void fluid_solver<R>::step(double dt) \
+{ \
+}
 /*****************************************************************************/