diff --git a/src/fluid_solver.cpp b/src/fluid_solver.cpp
index 6b64d08b8cf2d2d2985c936bbb56786b6bf8e100..9c55db402bb5834e42f2dcbf75b7f0273083429c 100644
--- a/src/fluid_solver.cpp
+++ b/src/fluid_solver.cpp
@@ -20,6 +20,7 @@
 
 
 #include <cassert>
+#include <cmath>
 #include "fluid_solver.hpp"
 #include "fftw_tools.hpp"
 
@@ -28,7 +29,7 @@
 /*****************************************************************************/
 /* macros for loops                                                          */
 
-/* complex loop */
+/* Fourier space loop */
 #define CLOOP(expression) \
  \
 { \
@@ -45,6 +46,23 @@
     } \
 }
 
+/* real space loop */
+#define RLOOP(expression) \
+ \
+{ \
+    int rindex; \
+    for (int zindex = 0; zindex < this->rd->subsizes[0]; zindex++) \
+    for (int yindex = 0; yindex < this->rd->subsizes[1]; yindex++) \
+    { \
+        rindex = (zindex * this->rd->sizes[1] + yindex)*this->rd->sizes[2]; \
+    for (int xindex = 0; xindex < this->rd->subsizes[2]; xindex++) \
+        { \
+            expression; \
+            rindex++; \
+        } \
+    } \
+}
+
 /*****************************************************************************/
 
 
@@ -151,8 +169,9 @@ fluid_solver<R>::fluid_solver( \
             this->rv[2], this->cv[2], \
             MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); \
  \
-    /* wavenumbers etc */ \
+    /* ``physical'' parameters etc */ \
  \
+    this->nu = 0.01; \
     this->dkx = DKX; \
     this->dky = DKY; \
     this->dkz = DKZ; \
@@ -257,26 +276,94 @@ void fluid_solver<R>::omega_nonlin( \
             k2 = (this->kx[xindex]*this->kx[xindex] + \
                   this->ky[yindex]*this->ky[yindex] + \
                   this->kz[zindex]*this->kz[zindex]); \
-            this->cu[cindex*3 + 0][0] = (this->ky[yindex]*this->cv[src][cindex*3+2][1] - this->kz[zindex]*this->cv[src][cindex*3+1][1]) / k2; \
-            this->cu[cindex*3 + 1][0] = (this->kz[zindex]*this->cv[src][cindex*3+0][1] - this->kx[xindex]*this->cv[src][cindex*3+2][1]) / k2; \
-            this->cu[cindex*3 + 2][0] = (this->kx[xindex]*this->cv[src][cindex*3+1][1] - this->ky[yindex]*this->cv[src][cindex*3+0][1]) / k2; \
-            this->cu[cindex*3 + 0][1] = (this->ky[yindex]*this->cv[src][cindex*3+2][0] - this->kz[zindex]*this->cv[src][cindex*3+1][0]) / k2; \
-            this->cu[cindex*3 + 1][1] = (this->kz[zindex]*this->cv[src][cindex*3+0][0] - this->kx[xindex]*this->cv[src][cindex*3+2][0]) / k2; \
-            this->cu[cindex*3 + 2][1] = (this->kx[xindex]*this->cv[src][cindex*3+1][0] - this->ky[yindex]*this->cv[src][cindex*3+0][0]) / k2; \
+            this->cu[cindex*3+0][0] = (this->ky[yindex]*this->cv[src][cindex*3+2][1] - this->kz[zindex]*this->cv[src][cindex*3+1][1]) / k2; \
+            this->cu[cindex*3+1][0] = (this->kz[zindex]*this->cv[src][cindex*3+0][1] - this->kx[xindex]*this->cv[src][cindex*3+2][1]) / k2; \
+            this->cu[cindex*3+2][0] = (this->kx[xindex]*this->cv[src][cindex*3+1][1] - this->ky[yindex]*this->cv[src][cindex*3+0][1]) / k2; \
+            this->cu[cindex*3+0][1] = (this->ky[yindex]*this->cv[src][cindex*3+2][0] - this->kz[zindex]*this->cv[src][cindex*3+1][0]) / k2; \
+            this->cu[cindex*3+1][1] = (this->kz[zindex]*this->cv[src][cindex*3+0][0] - this->kx[xindex]*this->cv[src][cindex*3+2][0]) / k2; \
+            this->cu[cindex*3+2][1] = (this->kx[xindex]*this->cv[src][cindex*3+1][0] - this->ky[yindex]*this->cv[src][cindex*3+0][0]) / k2; \
             ); \
-    /* $\imath k \times DFT(u \times \omega)$ */ \
     /* get fields from Fourier space to real space */ \
-    FFTW(execute)(*((FFTW(plan)*)this->r2c_vorticity)); \
+    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity )); \
+    FFTW(execute)(*((FFTW(plan)*)this->vc2r[src])); \
+    /* compute cross product $u \times \omega$, and normalize */ \
+    R tmpx0, tmpy0, tmpz0; \
+    RLOOP ( \
+             tmpx0 = (this->ru[rindex*3+1]*this->rv[src][rindex*3+2] - this->ru[rindex*3+2]*this->rv[src][rindex*3+1]); \
+             tmpy0 = (this->ru[rindex*3+2]*this->rv[src][rindex*3+0] - this->ru[rindex*3+0]*this->rv[src][rindex*3+2]); \
+             tmpz0 = (this->ru[rindex*3+0]*this->rv[src][rindex*3+1] - this->ru[rindex*3+1]*this->rv[src][rindex*3+0]); \
+             this->ru[rindex*3+0] = tmpx0 / (this->rd->full_size / 3); \
+             this->ru[rindex*3+1] = tmpy0 / (this->rd->full_size / 3); \
+             this->ru[rindex*3+2] = tmpz0 / (this->rd->full_size / 3); \
+            ); \
+    /* go back to Fourier space */ \
     FFTW(execute)(*((FFTW(plan)*)this->r2c_velocity )); \
-    /* compute cross product into vr */ \
-    /* get cross product back into Fourier space */ \
-    FFTW(execute)(*((FFTW(plan)*)this->c2r_velocity)); \
-    /* dealiasing */ \
+    /* $\imath k \times DFT(u \times \omega)$ */ \
+    R tmpx1, tmpy1, tmpz1; \
+    CLOOP( \
+            tmpx0 = (this->ky[yindex]*this->cu[cindex*3+2][1] - this->kz[zindex]*this->cu[cindex*3+1][1]); \
+            tmpy0 = (this->kz[zindex]*this->cu[cindex*3+0][1] - this->kx[xindex]*this->cu[cindex*3+2][1]); \
+            tmpz0 = (this->kx[xindex]*this->cu[cindex*3+1][1] - this->ky[yindex]*this->cu[cindex*3+0][1]); \
+            tmpx1 = (this->ky[yindex]*this->cu[cindex*3+2][0] - this->kz[zindex]*this->cu[cindex*3+1][0]); \
+            tmpy1 = (this->kz[zindex]*this->cu[cindex*3+0][0] - this->kx[xindex]*this->cu[cindex*3+2][0]); \
+            tmpz1 = (this->kx[xindex]*this->cu[cindex*3+1][0] - this->ky[yindex]*this->cu[cindex*3+0][0]); \
+            this->cu[cindex*3+0][0] = tmpx0;\
+            this->cu[cindex*3+1][0] = tmpy0;\
+            this->cu[cindex*3+2][0] = tmpz0;\
+            this->cu[cindex*3+0][1] = tmpx1;\
+            this->cu[cindex*3+1][1] = tmpy1;\
+            this->cu[cindex*3+2][1] = tmpz1;\
+            ); \
 } \
  \
 template<> \
 void fluid_solver<R>::step(double dt) \
 { \
+    double k2, factor0, factor1; \
+    this->omega_nonlin(0); \
+    CLOOP( \
+            k2 = (this->kx[xindex]*this->kx[xindex] + \
+                  this->ky[yindex]*this->ky[yindex] + \
+                  this->kz[zindex]*this->kz[zindex]); \
+            factor0 = exp(-this->nu * k2 * dt); \
+            this->cv[1][cindex*3+0][0] = (this->cv[0][cindex*3+0][0] + dt*this->cu[cindex*3+0][0])*factor0; \
+            this->cv[1][cindex*3+1][0] = (this->cv[0][cindex*3+1][0] + dt*this->cu[cindex*3+1][0])*factor0; \
+            this->cv[1][cindex*3+2][0] = (this->cv[0][cindex*3+2][0] + dt*this->cu[cindex*3+2][0])*factor0; \
+            this->cv[1][cindex*3+0][1] = (this->cv[0][cindex*3+0][1] + dt*this->cu[cindex*3+0][1])*factor0; \
+            this->cv[1][cindex*3+1][1] = (this->cv[0][cindex*3+1][1] + dt*this->cu[cindex*3+1][1])*factor0; \
+            this->cv[1][cindex*3+2][1] = (this->cv[0][cindex*3+2][1] + dt*this->cu[cindex*3+2][1])*factor0; \
+            ); \
+ \
+    this->omega_nonlin(1); \
+    CLOOP( \
+            k2 = (this->kx[xindex]*this->kx[xindex] + \
+                  this->ky[yindex]*this->ky[yindex] + \
+                  this->kz[zindex]*this->kz[zindex]); \
+            factor0 = exp(-this->nu * k2 * dt/2); \
+            factor1 = exp( this->nu * k2 * dt/2); \
+            this->cv[2][cindex*3+0][0] = (3*this->cv[0][cindex*3+0][0]*factor0 + (this->cv[1][cindex*3+0][0] + dt*this->cu[cindex*3+0][0])*factor1)*0.25; \
+            this->cv[2][cindex*3+1][0] = (3*this->cv[0][cindex*3+1][0]*factor0 + (this->cv[1][cindex*3+1][0] + dt*this->cu[cindex*3+1][0])*factor1)*0.25; \
+            this->cv[2][cindex*3+2][0] = (3*this->cv[0][cindex*3+2][0]*factor0 + (this->cv[1][cindex*3+2][0] + dt*this->cu[cindex*3+2][0])*factor1)*0.25; \
+            this->cv[2][cindex*3+0][1] = (3*this->cv[0][cindex*3+0][1]*factor0 + (this->cv[1][cindex*3+0][1] + dt*this->cu[cindex*3+0][1])*factor1)*0.25; \
+            this->cv[2][cindex*3+1][1] = (3*this->cv[0][cindex*3+1][1]*factor0 + (this->cv[1][cindex*3+1][1] + dt*this->cu[cindex*3+1][1])*factor1)*0.25; \
+            this->cv[2][cindex*3+2][1] = (3*this->cv[0][cindex*3+2][1]*factor0 + (this->cv[1][cindex*3+2][1] + dt*this->cu[cindex*3+2][1])*factor1)*0.25; \
+            ); \
+ \
+    this->omega_nonlin(2); \
+    CLOOP( \
+            k2 = (this->kx[xindex]*this->kx[xindex] + \
+                  this->ky[yindex]*this->ky[yindex] + \
+                  this->kz[zindex]*this->kz[zindex]); \
+            factor0 = exp(-this->nu * k2 * dt); \
+            factor1 = exp(-this->nu * k2 * dt/2); \
+            this->cv[3][cindex*3+0][0] = (this->cv[0][cindex*3+0][0]*factor0 + 2*(this->cv[1][cindex*3+0][0] + dt*this->cu[cindex*3+0][0])*factor1)/3; \
+            this->cv[3][cindex*3+1][0] = (this->cv[0][cindex*3+1][0]*factor0 + 2*(this->cv[1][cindex*3+1][0] + dt*this->cu[cindex*3+1][0])*factor1)/3; \
+            this->cv[3][cindex*3+2][0] = (this->cv[0][cindex*3+2][0]*factor0 + 2*(this->cv[1][cindex*3+2][0] + dt*this->cu[cindex*3+2][0])*factor1)/3; \
+            this->cv[3][cindex*3+0][1] = (this->cv[0][cindex*3+0][1]*factor0 + 2*(this->cv[1][cindex*3+0][1] + dt*this->cu[cindex*3+0][1])*factor1)/3; \
+            this->cv[3][cindex*3+1][1] = (this->cv[0][cindex*3+1][1]*factor0 + 2*(this->cv[1][cindex*3+1][1] + dt*this->cu[cindex*3+1][1])*factor1)/3; \
+            this->cv[3][cindex*3+2][1] = (this->cv[0][cindex*3+2][1]*factor0 + 2*(this->cv[1][cindex*3+2][1] + dt*this->cu[cindex*3+2][1])*factor1)/3; \
+            ); \
+ \
 }
 /*****************************************************************************/