diff --git a/bench.py b/bench.py
index 85a34348ee8bb77ea0483f5b3d198eb0a33a26d7..e403ee10e8ec9063ed590b938812ee3a5314819f 100644
--- a/bench.py
+++ b/bench.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pypocketfft
-import pyfftw
 from time import time
 import matplotlib.pyplot as plt
 import math
diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index f0a03e0d7e40a0827a0704f20ccb646f0281e627..8c59c48c0e192484f82f887233d6be47e9f01c1b 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -266,305 +266,78 @@ template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
 //
 // twiddle factor section
 //
-
-/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
-inline float sinpi0(float a)
-  {
-  // adapted from https://stackoverflow.com/questions/42792939/
-  float s = a * a;
-  float r =      -5.957031250000000000e-01f;
-  r = fma (r, s,  2.550399541854858398e+00f);
-  r = fma (r, s, -5.167724132537841797e+00f);
-  r = (a * s) * r;
-  return fma (a, 3.141592741012573242e+00f, r);
-  }
-
-/** Approximate sin(pi*a) within the range [-0.25, 0.25] */
-inline double sinpi0(double a)
-  {
-  // adapted from https://stackoverflow.com/questions/42792939/
-  double s = a * a;
-  double r =      4.6151442520157035e-4;
-  r = fma (r, s, -7.3700183130883555e-3);
-  r = fma (r, s,  8.2145868949323936e-2);
-  r = fma (r, s, -5.9926452893214921e-1);
-  r = fma (r, s,  2.5501640398732688e+0);
-  r = fma (r, s, -5.1677127800499516e+0);
-  s = s * a;
-  r = r * s;
-  return fma (a, 3.1415926535897931e+0, r);
-  }
-
-/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
-inline float cosm1pi0(float a)
-  {
-  // adapted from https://stackoverflow.com/questions/42792939/
-  float s = a * a;
-  float r =        2.313842773437500000e-01f;
-  r = fmaf (r, s, -1.335021972656250000e+00f);
-  r = fmaf (r, s,  4.058703899383544922e+00f);
-  r = fmaf (r, s, -4.934802055358886719e+00f);
-  return r*s;
-  }
-
-/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
-inline double cosm1pi0(double a)
-  {
-  // adapted from https://stackoverflow.com/questions/42792939/
-  double s = a * a;
-  double r =     -1.0369917389758117e-4;
-  r = fma (r, s,  1.9294935641298806e-3);
-  r = fma (r, s, -2.5806887942825395e-2);
-  r = fma (r, s,  2.3533063028328211e-1);
-  r = fma (r, s, -1.3352627688538006e+0);
-  r = fma (r, s,  4.0587121264167623e+0);
-  r = fma (r, s, -4.9348022005446790e+0);
-  return r*s;
-  }
-
-template <typename T> void sincosm1pi0(T a_, T *POCKETFFT_RESTRICT res)
-  {
-  if (sizeof(T)>sizeof(double)) // don't have the code for long double
-    {
-    constexpr T pi = T(3.141592653589793238462643383279502884197L);
-    auto s = sin(pi*a_);
-    res[1] = s;
-    res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
-    return;
-    }
-  res[0] = T(cosm1pi0(double(a_)));
-  res[1] = T(sinpi0(double(a_)));
-  }
-
-template <typename T> T sinpi(T a)
-  {
-  // reduce argument to primary approximation interval (-0.25, 0.25)
-  auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
-  auto i = (int64_t)r;
-  auto t = fma (T(-0.5), r, a);
-
-  switch (i%4)
-    {
-    case 0:
-      return sinpi0(t);
-    case 1: case -3:
-      return cosm1pi0(t) + T(1.);
-    case 2: case -2:
-      return T(0.) - sinpi0(t);
-    case 3: case -1:
-      return T(-1.) - cosm1pi0(t);
-    }
-  throw runtime_error("cannot happen");
-  }
-
-template <typename T> T cospi(T a)
-  {
-  // reduce argument to primary approximation interval (-0.25, 0.25)
-  auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
-  auto i = (int64_t)r;
-  auto t = fma (T(-0.5), r, a);
-
-  switch (i%4)
-    {
-    case 0:
-      return cosm1pi0(t) + T(1.);
-    case 1: case -3:
-      return T(0.) - sinpi0(t);
-    case 2: case -2:
-      return T(-1.) - cosm1pi0(t);
-    case 3: case -1:
-      return sinpi0(t);
-    }
-  throw runtime_error("cannot happen");
-  }
-
-inline long double cospi(long double a)
-  {
-  constexpr auto pi = 3.141592653589793238462643383279502884197L;
-  return sizeof(long double) > sizeof(double) ? cos(a * pi) :
-    static_cast<long double>(cospi(static_cast<double>(a)));
-  }
-
-inline long double sinpi(long double a)
-  {
-  constexpr auto pi = 3.141592653589793238462643383279502884197L;
-  return sizeof(long double) > sizeof(double) ? sin(a * pi) :
-    static_cast<long double>(sinpi(static_cast<double>(a)));
-  }
-
-template <typename T> void sincospi(T a, T *POCKETFFT_RESTRICT res)
-  {
-  // reduce argument to primary approximation interval (-0.25, 0.25)
-  auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding
-  auto i = (int64_t)r;
-  auto t = fma (T(-0.5), r, a);
-
-  auto c=cosm1pi0(t)+T(1.);
-  auto s=sinpi0(t);
-
-  // map results according to quadrant
-  if (i & 2)
-    {
-    s = T(0.)-s;
-    c = T(0.)-c;
-    }
-  if (i & 1)
-    {
-    swap(s, c);
-    c = T(0.)-c;
-    }
-  res[0]=c;
-  res[1]=s;
-  }
-
-inline void sincospi(long double a, long double *POCKETFFT_RESTRICT res)
-  {
-  if (sizeof(long double) > sizeof(double))
-    {
-    constexpr auto pi = 3.141592653589793238462643383279502884197L;
-    res[0] = cos(pi * a);
-    res[1] = sin(pi * a);
-    }
-  else
-    {
-    double sincos[2];
-    sincospi(static_cast<double>(a), sincos);
-    res[0] = static_cast<long double>(sincos[0]);
-    res[1] = static_cast<long double>(sincos[1]);
-    }
-  }
-
 template<typename T> class sincos_2pibyn
   {
   private:
     using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
-    arr<T> data;
-
-    POCKETFFT_NOINLINE void calc_first_octant(size_t den,
-      T * POCKETFFT_RESTRICT res)
-      {
-      size_t n = (den+4)>>3;
-      if (n==0) return;
-      res[0]=1.; res[1]=0.;
-      if (n==1) return;
-      size_t l1 = size_t(sqrt(n));
-      arr<Thigh> tmp(2*l1);
-      for (size_t i=1; i<l1; ++i)
-        {
-        sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[2*i]);
-        res[2*i  ] = T(tmp[2*i]+1);
-        res[2*i+1] = T(tmp[2*i+1]);
-        }
-      size_t start=l1;
-      while(start<n)
-        {
-        Thigh cs[2];
-        sincosm1pi0((Thigh(2*start))/Thigh(den),cs);
-        res[2*start] = T(cs[0]+1);
-        res[2*start+1] = T(cs[1]);
-        size_t end = l1;
-        if (start+end>n) end = n-start;
-        for (size_t i=1; i<end; ++i)
-          {
-          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
-          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
-          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
-          }
-        start += l1;
-        }
-      }
+    size_t N, mask, shift;
+    arr<cmplx<Thigh>> v1, v2;
 
-    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
+    static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
       {
-      T * POCKETFFT_RESTRICT p = res+n;
-      calc_first_octant(n<<1, p);
-      size_t ndone=(n+2)>>2;
-      size_t i=0, idx1=0, idx2=2*ndone-2;
-      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
+      x<<=3;
+      if (x<4*n) // first half
         {
-        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
-        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
+        if (x<2*n) // first quadrant
+          {
+          if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), sin(Thigh(x)*ang));
+          return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), cos(Thigh(2*n-x)*ang));
+          }
+        else // second quadrant
+          {
+          x-=2*n;
+          if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), cos(Thigh(x)*ang));
+          return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), sin(Thigh(2*n-x)*ang));
+          }
         }
-      if (i!=ndone)
-        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
-      }
-
-    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
-      {
-      int ndone=int(n+1)>>1;
-      T * p = res+n-1;
-      calc_first_octant(n<<2, p);
-      int i4=0, in=int(n), i=0;
-      for (; i4<=in-i4; ++i, i4+=4) // octant 0
-        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
-      for (; i4-in <= 0; ++i, i4+=4) // octant 1
-        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
-      for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
-        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
-      for (; i<ndone; ++i, i4+=4) // octant 3
-        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
-      }
-
-    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
-      {
-      constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
-      size_t quart = n>>2;
-      if ((n&7)==0)
-        res[quart] = res[quart+1] = hsqt2;
-      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
-        { res[j] = res[i+1]; res[j+1] = res[i]; }
-      }
-
-    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
-      {
-      size_t half = n>>1;
-      if ((n&3)==0)
-        for (size_t i=0; i<half; i+=2)
-          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
-      else
-        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
-          { res[j] = -res[i]; res[j+1] = res[i+1]; }
-      }
-
-    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
-      {
-      if ((n&1)==0)
-        for (size_t i=0; i<n; ++i)
-          res[i+n] = -res[i];
       else
-        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
-          { res[j] = res[i]; res[j+1] = -res[i+1]; }
-      }
-
-    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
-      {
-      if ((n&3)==0)
         {
-        calc_first_octant(n, res);
-        fill_first_quadrant(n, res);
-        fill_first_half(n, res);
-        }
-      else if ((n&1)==0)
-        {
-        calc_first_quadrant(n, res);
-        fill_first_half(n, res);
+        x=8*n-x;
+        if (x<2*n) // third quadrant
+          {
+          if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), -sin(Thigh(x)*ang));
+          return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), -cos(Thigh(2*n-x)*ang));
+          }
+        else // fourth quadrant
+          {
+          x-=2*n;
+          if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), -cos(Thigh(x)*ang));
+          return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), -sin(Thigh(2*n-x)*ang));
+          }
         }
-      else
-        calc_first_half(n, res);
       }
 
   public:
-    POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
-      : data(2*n)
-      {
-      sincos_2pibyn_half(n, data.data());
-      if (!half) fill_second_half(n, data.data());
+    POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
+      : N(n)
+      {
+      constexpr auto pi = 3.141592653589793238462643383279502884197L;
+      Thigh ang = Thigh(0.25L*pi/n);
+      size_t nval = (n+2)/2;
+      shift = 1;
+      while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
+      mask = (size_t(1)<<shift)-1;
+      v1.resize(mask+1);
+      v1[0].Set(Thigh(1), Thigh(0));
+      for (size_t i=1; i<v1.size(); ++i)
+        v1[i]=calc(i,n,ang);
+      v2.resize((nval+mask)/(mask+1));
+      v2[0].Set(Thigh(1), Thigh(0));
+      for (size_t i=1; i<v2.size(); ++i)
+        v2[i]=calc(i*(mask+1),n,ang);
+      }
+
+    cmplx<T> operator[](size_t idx) const
+      {
+      if (2*idx<=N)
+        {
+        auto x1=v1[idx&mask], x2=v2[idx>>shift];
+        return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
+        }
+      idx = N-idx;
+      auto x1=v1[idx&mask], x2=v2[idx>>shift];
+      return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
       }
-
-    T operator[](size_t idx) const { return data[idx]; }
-    const T *rdata() const { return data; }
-    const cmplx<T> *cdata() const
-      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
   };
 
 struct util // hack to avoid duplicate symbols
@@ -940,12 +713,10 @@ template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=2;
-
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+2*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -989,14 +760,13 @@ template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=3;
   constexpr T0 tw1r=-0.5,
                tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
 
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+3*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -1029,12 +799,10 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=4;
-
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+4*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -1105,7 +873,6 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=5;
   constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
                tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
                tw2r= T0(-0.8090169943749474241022934171828191L),
@@ -1113,8 +880,8 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
 
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+5*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -1177,7 +944,6 @@ template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=7;
   constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
                tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
                tw2r= T0(-0.2225209339563144042889025644967948L),
@@ -1187,8 +953,8 @@ template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
 
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+7*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -1245,12 +1011,10 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=8;
-
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+8*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -1360,7 +1124,6 @@ template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const cmplx<T0> * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=11;
   constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
                tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
                tw2r= T0(0.4154150130018864255292741492296232L),
@@ -1374,8 +1137,8 @@ template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
 
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+11*c)]; };
   auto WA = [wa, ido](size_t x, size_t i)
     { return wa[i-1+x*(ido-1)]; };
 
@@ -1618,8 +1381,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
 
     void comp_twiddle()
       {
-      sincos_2pibyn<T0> twid(length, false);
-      auto twiddle = twid.cdata();
+      sincos_2pibyn<T0> twiddle(length);
       size_t l1=1;
       size_t memofs=0;
       for (size_t k=0; k<fact.size(); ++k)
@@ -1682,13 +1444,11 @@ template<typename T> void radf2 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=2;
-
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
     { return cc[a+ido*(b+l1*c)]; };
-  auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T&
-    { return ch[a+ido*(b+cdim*c)]; };
+  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
+    { return ch[a+ido*(b+2*c)]; };
 
   for (size_t k=0; k<l1; k++)
     PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
@@ -1721,14 +1481,13 @@ template<typename T> void radf3(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=3;
   constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
 
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
     { return cc[a+ido*(b+l1*c)]; };
-  auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T&
-    { return ch[a+ido*(b+cdim*c)]; };
+  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
+    { return ch[a+ido*(b+3*c)]; };
 
   for (size_t k=0; k<l1; k++)
     {
@@ -1761,14 +1520,13 @@ template<typename T> void radf4(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=4;
   constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
 
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
     { return cc[a+ido*(b+l1*c)]; };
-  auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T&
-    { return ch[a+ido*(b+cdim*c)]; };
+  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
+    { return ch[a+ido*(b+4*c)]; };
 
   for (size_t k=0; k<l1; k++)
     {
@@ -1809,7 +1567,6 @@ template<typename T> void radf5(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=5;
   constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
                ti11= T0(0.9510565162951535721164393333793821L),
                tr12= T0(-0.8090169943749474241022934171828191L),
@@ -1818,8 +1575,8 @@ template<typename T> void radf5(size_t ido, size_t l1,
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
   auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
     { return cc[a+ido*(b+l1*c)]; };
-  auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T&
-    { return ch[a+ido*(b+cdim*c)]; };
+  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
+    { return ch[a+ido*(b+5*c)]; };
 
   for (size_t k=0; k<l1; k++)
     {
@@ -2008,11 +1765,9 @@ template<typename T> void radb2(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=2;
-
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+2*c)]; };
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
 
@@ -2040,12 +1795,11 @@ template<typename T> void radb3(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=3;
   constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
 
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+3*c)]; };
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
 
@@ -2081,12 +1835,11 @@ template<typename T> void radb4(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=4;
   constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
 
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+4*c)]; };
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
 
@@ -2134,15 +1887,14 @@ template<typename T> void radb5(size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
   const T0 * POCKETFFT_RESTRICT wa) const
   {
-  constexpr size_t cdim=5;
   constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
                ti11= T0(0.9510565162951535721164393333793821L),
                tr12= T0(-0.8090169943749474241022934171828191L),
                ti12= T0(0.5877852522924731291687059546390728L);
 
   auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
-  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
-    { return cc[a+ido*(b+cdim*c)]; };
+  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
+    { return cc[a+ido*(b+5*c)]; };
   auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
     { return ch[a+ido*(b+l1*c)]; };
 
@@ -2427,7 +2179,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
 
     void comp_twiddle()
       {
-      sincos_2pibyn<T0> twid(length, true);
+      sincos_2pibyn<T0> twid(length);
       size_t l1=1;
       T0 *ptr=mem.data();
       for (size_t k=0; k<fact.size(); ++k)
@@ -2439,8 +2191,8 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
           for (size_t j=1; j<ip; ++j)
             for (size_t i=1; i<=(ido-1)/2; ++i)
               {
-              fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
-              fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
+              fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
+              fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
               }
           }
         if (ip>5) // special factors required by *g functions
@@ -2450,10 +2202,10 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
           fact[k].tws[1] = 0.;
           for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
             {
-            fact[k].tws[i  ] = twid[i*(length/ip)];
-            fact[k].tws[i+1] = twid[i*(length/ip)+1];
-            fact[k].tws[ic]   = twid[i*(length/ip)];
-            fact[k].tws[ic+1] = -twid[i*(length/ip)+1];
+            fact[k].tws[i  ] = twid[i/2*(length/ip)].r;
+            fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
+            fact[k].tws[ic]   = twid[i/2*(length/ip)].r;
+            fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
             }
           }
         l1*=ip;
@@ -2498,8 +2250,14 @@ template<typename T0> class fftblue
       plan.exec (akf.data(),1.,true);
 
       /* do the convolution */
-      for (size_t m=0; m<n2; ++m)
+      akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
+      for (size_t m=1; m<(n2+1)/2; ++m)
+        {
         akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
+        akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
+        }
+      if ((n2&1)==0)
+        akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);
 
       /* inverse FFT */
       plan.exec (akf.data(),1.,false);
@@ -2511,12 +2269,11 @@ template<typename T0> class fftblue
 
   public:
     POCKETFFT_NOINLINE fftblue(size_t length)
-      : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2),
+      : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
         bk(mem.data()), bkf(mem.data()+n)
       {
       /* initialize b_k */
-      sincos_2pibyn<T0> tmp_(2*n, false);
-      auto tmp = tmp_.cdata();
+      sincos_2pibyn<T0> tmp(2*n);
       bk[0].Set(1, 0);
 
       size_t coeff=0;
@@ -2528,13 +2285,16 @@ template<typename T0> class fftblue
         }
 
       /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
+      arr<cmplx<T0>> tbkf(n2);
       T0 xn2 = T0(1)/T0(n2);
-      bkf[0] = bk[0]*xn2;
+      tbkf[0] = bk[0]*xn2;
       for (size_t m=1; m<n; ++m)
-        bkf[m] = bkf[n2-m] = bk[m]*xn2;
+        tbkf[m] = tbkf[n2-m] = bk[m]*xn2;
       for (size_t m=n;m<=(n2-n);++m)
-        bkf[m].Set(0.,0.);
-      plan.exec(bkf,1.,true);
+        tbkf[m].Set(0.,0.);
+      plan.exec(tbkf.data(),1.,true);
+      for (size_t i=0; i<n2/2+1; ++i)
+        bkf[i] = tbkf[i];
       }
 
     template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const
@@ -2712,9 +2472,9 @@ template<typename T0> class T_dcst23
     POCKETFFT_NOINLINE T_dcst23(size_t length)
       : fftplan(length), twiddle(length)
       {
-      const auto oo2n = T0(0.5)/T0(length);
+      sincos_2pibyn<T0> tw(4*length);
       for (size_t i=0; i<length; ++i)
-        twiddle[i] = cospi(oo2n*T0(i+1));
+        twiddle[i] = tw[i+1].r;
       }
 
     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
@@ -2787,20 +2547,17 @@ template<typename T0> class T_dcst4
         rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
         C2((N&1) ? 0 : N/2)
       {
-      const auto oon = -T0(1.)/T0(N);
       if ((N&1)==0)
+        {
+        sincos_2pibyn<T0> tw(16*N);
         for (size_t i=0; i<N/2; ++i)
-          {
-          T0 sincos[2];
-          sincospi(oon*(T0(i)+T0(0.125)), sincos);
-          C2[i].Set(sincos[0], sincos[1]);
-          }
+          C2[i] = conj(tw[8*i+1]);
+        }
       }
 
     template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
       bool /*ortho*/, int /*type*/, bool cosine) const
       {
-      constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
       size_t n2 = N/2;
       if (!cosine)
         for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
@@ -2827,7 +2584,11 @@ template<typename T0> class T_dcst4
         }
         rfft->exec(y.data(), fct, true);
         {
-        auto SGN = [sqrt2](size_t i) {return (i&2) ? -sqrt2 : sqrt2;};
+        auto SGN = [](size_t i)
+           {
+           constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
+           return (i&2) ? -sqrt2 : sqrt2;
+           };
         c[n2] = y[0]*SGN(n2+1);
         size_t i=0, i1=1, k=1;
         for (; k<n2; ++i, ++i1, k+=2)
diff --git a/stress.py b/stress.py
index b6c23ddeb62d7b9d41452287ed95afe99af179bd..e7526d5c614f538c33e0e5a7f4bca1a876be9349 100644
--- a/stress.py
+++ b/stress.py
@@ -2,8 +2,8 @@ import numpy as np
 import pypocketfft
 
 
-def _l2error(a, b):
-    return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))
+def _l2error(a, b, axes):
+    return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))/np.log2(np.max([2,np.prod(np.take(a.shape,axes))]))
 
 
 def fftn(a, axes=None, inorm=0, out=None, nthreads=1):
@@ -52,85 +52,89 @@ def test(err):
     a_32 = a.astype(np.complex64)
     b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
               nthreads=nthreads)
-    err = update_err(err, "cmax", _l2error(a, b))
+    err = update_err(err, "cmax", _l2error(a, b, axes))
     b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
               nthreads=nthreads)
-    err = update_err(err, "cmax", _l2error(a.real, b))
+    err = update_err(err, "cmax", _l2error(a.real, b, axes))
     b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
              nthreads=nthreads)
-    err = update_err(err, "cmax", _l2error(a.real, b))
-    b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
-               lastsize=lastsize, nthreads=nthreads)
-    err = update_err(err, "rmax", _l2error(a.real, b))
+    err = update_err(err, "cmax", _l2error(a.real, b, axes))
     b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads),
               axes=axes, inorm=2, nthreads=nthreads)
-    err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b))
+    err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes))
+    b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2,
+               lastsize=lastsize, nthreads=nthreads)
+    err = update_err(err, "rmax", _l2error(a.real, b, axes))
     b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads),
                axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads)
-    err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b))
+    err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes))
     b = pypocketfft.separable_hartley(
         pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads),
         axes=axes, inorm=2, nthreads=nthreads)
-    err = update_err(err, "hmax", _l2error(a.real, b))
+    err = update_err(err, "hmax", _l2error(a.real, b, axes))
     b = pypocketfft.genuine_hartley(
         pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads),
         axes=axes, inorm=2, nthreads=nthreads)
-    err = update_err(err, "hmax", _l2error(a.real, b))
+    err = update_err(err, "hmax", _l2error(a.real, b, axes))
     b = pypocketfft.separable_hartley(
             pypocketfft.separable_hartley(
                 a.real.astype(np.float32), axes=axes, nthreads=nthreads),
             axes=axes, inorm=2, nthreads=nthreads)
-    err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b))
+    err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
     b = pypocketfft.genuine_hartley(
             pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes,
                                         nthreads=nthreads),
             axes=axes, inorm=2, nthreads=nthreads)
-    err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b))
+    err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes))
     if all(a.shape[i] > 1 for i in axes):
         b = pypocketfft.dct(
             pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1),
             axes=axes, type=1, nthreads=nthreads, inorm=2)
-        err = update_err(err, "c1max", _l2error(a.real, b))
+        err = update_err(err, "c1max", _l2error(a.real, b, axes))
         b = pypocketfft.dct(
             pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1),
             axes=axes, type=1, nthreads=nthreads, inorm=2)
-        err = update_err(err, "c1maxf", _l2error(a_32.real, b))
+        err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes))
     b = pypocketfft.dct(
         pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2),
         axes=axes, type=3, nthreads=nthreads, inorm=2)
-    err = update_err(err, "c23max", _l2error(a.real, b))
+    err = update_err(err, "c23max", _l2error(a.real, b, axes))
     b = pypocketfft.dct(
         pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2),
         axes=axes, type=3, nthreads=nthreads, inorm=2)
-    err = update_err(err, "c23maxf", _l2error(a_32.real, b))
+    err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes))
     b = pypocketfft.dct(
         pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4),
         axes=axes, type=4, nthreads=nthreads, inorm=2)
-    err = update_err(err, "c4max", _l2error(a.real, b))
+    err = update_err(err, "c4max", _l2error(a.real, b, axes))
     b = pypocketfft.dct(
         pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4),
         axes=axes, type=4, nthreads=nthreads, inorm=2)
-    err = update_err(err, "c4maxf", _l2error(a_32.real, b))
+    err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes))
+    b = pypocketfft.dst(
+        pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=1),
+        axes=axes, type=1, nthreads=nthreads, inorm=2)
+    err = update_err(err, "s1max", _l2error(a.real, b, axes))
     b = pypocketfft.dst(
         pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1),
         axes=axes, type=1, nthreads=nthreads, inorm=2)
-    err = update_err(err, "s1maxf", _l2error(a_32.real, b))
+    err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes))
     b = pypocketfft.dst(
         pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=2),
         axes=axes, type=3, nthreads=nthreads, inorm=2)
-    err = update_err(err, "s23max", _l2error(a.real, b))
+    err = update_err(err, "s23max", _l2error(a.real, b, axes))
     b = pypocketfft.dst(
         pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=2),
         axes=axes, type=3, nthreads=nthreads, inorm=2)
-    err = update_err(err, "s23maxf", _l2error(a_32.real, b))
+    err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes))
     b = pypocketfft.dst(
         pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=4),
         axes=axes, type=4, nthreads=nthreads, inorm=2)
-    err = update_err(err, "s4max", _l2error(a.real, b))
+    err = update_err(err, "s4max", _l2error(a.real, b, axes))
     b = pypocketfft.dst(
         pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=4),
         axes=axes, type=4, nthreads=nthreads, inorm=2)
-    err = update_err(err, "s4maxf", _l2error(a_32.real, b))
+    err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes))
 
 
 err = dict()