diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 5acd74dc57b1e0d60fd23c4915a4a5bbf3fc102b..1a193939e1fa3cca93facdaeab21ef663853d939 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -438,105 +438,103 @@ template<typename T> class sincos_2pibyn
   {
   private:
     using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
-    arr<T> data;
+    arr<cmplx<T>> data;
 
     POCKETFFT_NOINLINE void calc_first_octant(size_t den,
-      T * POCKETFFT_RESTRICT res)
+      cmplx<T> * POCKETFFT_RESTRICT res)
       {
       size_t n = (den+4)>>3;
       if (n==0) return;
-      res[0]=1.; res[1]=0.;
+      res[0].Set(1.,0.);
       if (n==1) return;
       size_t l1 = size_t(sqrt(n));
-      arr<Thigh> tmp(2*l1);
+      arr<cmplx<Thigh>> tmp(l1);
       for (size_t i=1; i<l1; ++i)
         {
-        sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[2*i]);
-        res[2*i  ] = T(tmp[2*i]+1);
-        res[2*i+1] = T(tmp[2*i+1]);
+        sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[i].r);
+        res[i].Set(T(tmp[i].r)+1,T(tmp[i].i));
         }
       size_t start=l1;
       while(start<n)
         {
-        Thigh cs[2];
-        sincosm1pi0((Thigh(2*start))/Thigh(den),cs);
-        res[2*start] = T(cs[0]+1);
-        res[2*start+1] = T(cs[1]);
+        cmplx<Thigh> cs;
+        sincosm1pi0((Thigh(2*start))/Thigh(den),&cs.r);
+        res[start].Set(T(cs.r+1), T(cs.i));
         size_t end = l1;
         if (start+end>n) end = n-start;
         for (size_t i=1; i<end; ++i)
           {
-          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
-          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
-          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
+          cmplx<Thigh> csx=tmp[i];
+          res[start+i].Set(T(((cs.r*csx.r - cs.i*csx.i + cs.r) + csx.r) + 1),
+                           T((cs.r*csx.i + cs.i*csx.r) + cs.i + csx.i));
           }
         start += l1;
         }
       }
 
-    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
+    void calc_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
       {
-      T * POCKETFFT_RESTRICT p = res+n;
+      cmplx<T> * POCKETFFT_RESTRICT p = res+n/2; // n is always even here
       calc_first_octant(n<<1, p);
       size_t ndone=(n+2)>>2;
-      size_t i=0, idx1=0, idx2=2*ndone-2;
-      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
+      size_t i=0, idx1=0, idx2=ndone-1;
+      for (; i+1<ndone; i+=2, ++idx1, --idx2)
         {
-        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
-        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
+        res[idx1] = p[i];
+        res[idx2].Set(p[i+1].i, p[i+1].r);
         }
       if (i!=ndone)
-        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
+        res[idx1] = p[i];
       }
 
-    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
+    void calc_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
       {
       int ndone=int(n+1)>>1;
-      T * p = res+n-1;
+      cmplx<T> * p = res+(n-1)/2; // n is always odd here
       calc_first_octant(n<<2, p);
       int i4=0, in=int(n), i=0;
       for (; i4<=in-i4; ++i, i4+=4) // octant 0
-        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
+        res[i] = p[i4];
       for (; i4-in <= 0; ++i, i4+=4) // octant 1
-        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
+        { auto xm = in-i4; res[i].Set(p[xm].i, p[xm].r); }
       for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
-        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
+        { auto xm = i4-in; res[i].Set(-p[xm].i, p[xm].r); }
       for (; i<ndone; ++i, i4+=4) // octant 3
-        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
+        { auto xm = 2*in-i4; res[i].Set(-p[xm].r, p[xm].i); }
       }
 
-    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
+    void fill_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
       {
       constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
       size_t quart = n>>2;
       if ((n&7)==0)
-        res[quart] = res[quart+1] = hsqt2;
-      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
-        { res[j] = res[i+1]; res[j+1] = res[i]; }
+        res[quart/2].Set(hsqt2, hsqt2);
+      for (size_t i=1, j=quart-1; 2*i<quart; ++i, --j)
+        { res[j].Set(res[i].i, res[i].r); }
       }
 
-    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
+    POCKETFFT_NOINLINE void fill_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
       {
       size_t half = n>>1;
       if ((n&3)==0)
-        for (size_t i=0; i<half; i+=2)
-          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
+        for (size_t i=0; i<half/2; ++i)
+          res[i+half/2].Set(-res[i].i, res[i].r);
       else
-        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
-          { res[j] = -res[i]; res[j+1] = res[i+1]; }
+        for (size_t i=1, j=half-1; 2*i<half; ++i, --j)
+          res[j].Set(-res[i].r, res[i].i);
       }
 
-    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
+    void fill_second_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
       {
       if ((n&1)==0)
-        for (size_t i=0; i<n; ++i)
-          res[i+n] = -res[i];
+        for (size_t i=0; i<n/2; ++i)
+          res[i+n/2].Set(-res[i].r, -res[i].i);
       else
-        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
-          { res[j] = res[i]; res[j+1] = -res[i+1]; }
+        for (size_t i=1, j=n-1; 2*i<n; ++i, --j)
+          res[j].Set(res[i].r, -res[i].i);
       }
 
-    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
+    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res)
       {
       if ((n&3)==0)
         {
@@ -555,16 +553,13 @@ template<typename T> class sincos_2pibyn
 
   public:
     POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
-      : data(2*n)
+      : data(n)
       {
       sincos_2pibyn_half(n, data.data());
       if (!half) fill_second_half(n, data.data());
       }
 
-    T operator[](size_t idx) const { return data[idx]; }
-    const T *rdata() const { return data; }
-    const cmplx<T> *cdata() const
-      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
+    cmplx<T> operator[](size_t idx) const { return data[idx]; }
   };
 
 struct util // hack to avoid duplicate symbols
@@ -1618,8 +1613,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
 
     void comp_twiddle()
       {
-      sincos_2pibyn<T0> twid(length, false);
-      auto twiddle = twid.cdata();
+      sincos_2pibyn<T0> twiddle(length, false);
       size_t l1=1;
       size_t memofs=0;
       for (size_t k=0; k<fact.size(); ++k)
@@ -2439,8 +2433,8 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
           for (size_t j=1; j<ip; ++j)
             for (size_t i=1; i<=(ido-1)/2; ++i)
               {
-              fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
-              fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
+              fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
+              fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
               }
           }
         if (ip>5) // special factors required by *g functions
@@ -2450,10 +2444,10 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
           fact[k].tws[1] = 0.;
           for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
             {
-            fact[k].tws[i  ] = twid[i*(length/ip)];
-            fact[k].tws[i+1] = twid[i*(length/ip)+1];
-            fact[k].tws[ic]   = twid[i*(length/ip)];
-            fact[k].tws[ic+1] = -twid[i*(length/ip)+1];
+            fact[k].tws[i  ] = twid[i/2*(length/ip)].r;
+            fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
+            fact[k].tws[ic]   = twid[i/2*(length/ip)].r;
+            fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
             }
           }
         l1*=ip;
@@ -2521,8 +2515,7 @@ template<typename T0> class fftblue
         bk(mem.data()), bkf(mem.data()+n)
       {
       /* initialize b_k */
-      sincos_2pibyn<T0> tmp_(2*n, false);
-      auto tmp = tmp_.cdata();
+      sincos_2pibyn<T0> tmp(2*n, false);
       bk[0].Set(1, 0);
 
       size_t coeff=0;