diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 403d8c971422686e0e4b8932522000a7ed529f5b..480087e97e77ea65b993dc434982076c6e7aa45e 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -235,14 +235,19 @@ template<typename T> struct cmplx {
                : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
     }
 };
+template<typename T> inline void PM(T &a, T &b, T c, T d)
+  { a=c+d; b=c-d; }
 template<typename T> inline void PMINPLACE(T &a, T &b)
   { T t = a; a+=b; b=t-b; }
 template<typename T> inline void MPINPLACE(T &a, T &b)
   { T t = a; a-=b; b=t+b; }
-template<typename T> void PMC(T &a, T &b, const T &c, const T &d)
-  { a = c+d; b = c-d; }
 template<typename T> cmplx<T> conj(const cmplx<T> &a)
   { return {a.r, -a.i}; }
+template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
+  {
+  res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
+            : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
+  }
 
 template<typename T> void ROT90(cmplx<T> &a)
   { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
@@ -615,30 +620,27 @@ template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
       for (size_t i=1; i<ido; ++i)
         {
         CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
-        CH(i,k,1) = (CC(i,0,k)-CC(i,1,k)).template special_mul<fwd>(WA(0,i));
+        special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
         }
       }
   }
 
 #define POCKETFFT_PREP3(idx) \
         T t0 = CC(idx,0,k), t1, t2; \
-        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
+        PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
         CH(idx,k,0)=t0+t1;
 #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
         { \
-        T ca,cb; \
-        ca=t0+t1*twr; \
-        cb=t2*twi; ROT90(cb); \
-        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
+        T ca=t0+t1*twr; \
+        T cb{-t2.i*twi, t2.r*twi}; \
+        PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
         }
 #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
         { \
-        T ca,cb,da,db; \
-        ca=t0+t1*twr; \
-        cb=t2*twi; ROT90(cb); \
-        PMC(da,db,ca,cb); \
-        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
-        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
+        T ca=t0+t1*twr; \
+        T cb{-t2.i*twi, t2.r*twi}; \
+        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
+        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
         }
 template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
@@ -697,43 +699,42 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
     for (size_t k=0; k<l1; ++k)
       {
       T t1, t2, t3, t4;
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
+      PM(t2,t1,CC(0,0,k),CC(0,2,k));
+      PM(t3,t4,CC(0,1,k),CC(0,3,k));
       ROTX90<fwd>(t4);
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
-      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
+      PM(CH(0,k,0),CH(0,k,2),t2,t3);
+      PM(CH(0,k,1),CH(0,k,3),t1,t4);
       }
   else
     for (size_t k=0; k<l1; ++k)
       {
       {
       T t1, t2, t3, t4;
-      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
-      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
+      PM(t2,t1,CC(0,0,k),CC(0,2,k));
+      PM(t3,t4,CC(0,1,k),CC(0,3,k));
       ROTX90<fwd>(t4);
-      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
-      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
+      PM(CH(0,k,0),CH(0,k,2),t2,t3);
+      PM(CH(0,k,1),CH(0,k,3),t1,t4);
       }
       for (size_t i=1; i<ido; ++i)
         {
-        T c2, c3, c4, t1, t2, t3, t4;
+        T t1, t2, t3, t4;
         T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
-        PMC(t2,t1,cc0,cc2);
-        PMC(t3,t4,cc1,cc3);
+        PM(t2,t1,cc0,cc2);
+        PM(t3,t4,cc1,cc3);
         ROTX90<fwd>(t4);
-        PMC(CH(i,k,0),c3,t2,t3);
-        PMC(c2,c4,t1,t4);
-        CH(i,k,1) = c2.template special_mul<fwd>(WA(0,i));
-        CH(i,k,2) = c3.template special_mul<fwd>(WA(1,i));
-        CH(i,k,3) = c4.template special_mul<fwd>(WA(2,i));
+        CH(i,k,0) = t2+t3;
+        special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
+        special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
+        special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
         }
       }
   }
 
 #define POCKETFFT_PREP5(idx) \
         T t0 = CC(idx,0,k), t1, t2, t3, t4; \
-        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
-        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
+        PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
+        PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
         CH(idx,k,0).r=t0.r+t1.r+t2.r; \
         CH(idx,k,0).i=t0.i+t1.i+t2.i;
 
@@ -744,7 +745,7 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
         cb.i=twai*t4.r twbi*t3.r; \
         cb.r=-(twai*t4.i twbi*t3.i); \
-        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \
+        PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
         }
 
 #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
@@ -754,9 +755,8 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
         ca.i=t0.i+twar*t1.i+twbr*t2.i; \
         cb.i=twai*t4.r twbi*t3.r; \
         cb.r=-(twai*t4.i twbi*t3.i); \
-        PMC(da,db,ca,cb); \
-        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
-        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
+        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
+        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
         }
 template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
   const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
@@ -805,9 +805,9 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
 
 #define POCKETFFT_PREP7(idx) \
         T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
-        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
-        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
-        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
+        PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
+        PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
+        PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
 
@@ -818,7 +818,7 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
         ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
         cb.i=y1*t7.r y2*t6.r y3*t5.r; \
         cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
-        PMC(out1,out2,ca,cb); \
+        PM(out1,out2,ca,cb); \
         }
 #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
         POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
@@ -826,8 +826,8 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
         { \
         T da,db; \
         POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
-        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
-        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
+        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
+        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
         }
 
 template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
@@ -915,8 +915,8 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
     for (size_t k=0; k<l1; ++k)
       {
       T a0, a1, a2, a3, a4, a5, a6, a7;
-      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
-      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
+      PM(a1,a5,CC(0,1,k),CC(0,5,k));
+      PM(a3,a7,CC(0,3,k),CC(0,7,k));
       PMINPLACE(a1,a3);
       ROTX90<fwd>(a3);
 
@@ -925,21 +925,21 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       ROTX45<fwd>(a5);
       ROTX135<fwd>(a7);
 
-      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
-      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
-      PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
-      PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
+      PM(a0,a4,CC(0,0,k),CC(0,4,k));
+      PM(a2,a6,CC(0,2,k),CC(0,6,k));
+      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
+      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
       ROTX90<fwd>(a6);
-      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
-      PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
+      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
+      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
       }
   else
     for (size_t k=0; k<l1; ++k)
       {
       {
       T a0, a1, a2, a3, a4, a5, a6, a7;
-      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
-      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
+      PM(a1,a5,CC(0,1,k),CC(0,5,k));
+      PM(a3,a7,CC(0,3,k),CC(0,7,k));
       PMINPLACE(a1,a3);
       ROTX90<fwd>(a3);
 
@@ -948,38 +948,38 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
       ROTX45<fwd>(a5);
       ROTX135<fwd>(a7);
 
-      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
-      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
-      PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
-      PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
+      PM(a0,a4,CC(0,0,k),CC(0,4,k));
+      PM(a2,a6,CC(0,2,k),CC(0,6,k));
+      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
+      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
       ROTX90<fwd>(a6);
-      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
-      PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
+      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
+      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
       }
       for (size_t i=1; i<ido; ++i)
         {
         T a0, a1, a2, a3, a4, a5, a6, a7;
-        PMC(a1,a5,CC(i,1,k),CC(i,5,k));
-        PMC(a3,a7,CC(i,3,k),CC(i,7,k));
+        PM(a1,a5,CC(i,1,k),CC(i,5,k));
+        PM(a3,a7,CC(i,3,k),CC(i,7,k));
         ROTX90<fwd>(a7);
         PMINPLACE(a1,a3);
         ROTX90<fwd>(a3);
         PMINPLACE(a5,a7);
         ROTX45<fwd>(a5);
         ROTX135<fwd>(a7);
-        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
-        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
+        PM(a0,a4,CC(i,0,k),CC(i,4,k));
+        PM(a2,a6,CC(i,2,k),CC(i,6,k));
         PMINPLACE(a0,a2);
         CH(i,k,0) = a0+a1;
-        CH(i,k,4) = (a0-a1).template special_mul<fwd>(WA(3,i));
-        CH(i,k,2) = (a2+a3).template special_mul<fwd>(WA(1,i));
-        CH(i,k,6) = (a2-a3).template special_mul<fwd>(WA(5,i));
+        special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
+        special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
+        special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
         ROTX90<fwd>(a6);
         PMINPLACE(a4,a6);
-        CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i));
-        CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i));
-        CH(i,k,3) = (a6+a7).template special_mul<fwd>(WA(2,i));
-        CH(i,k,7) = (a6-a7).template special_mul<fwd>(WA(6,i));
+        special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
+        special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
+        special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
+        special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
         }
       }
    }
@@ -987,11 +987,11 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
 
 #define POCKETFFT_PREP11(idx) \
         T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
-        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
-        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
-        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
-        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
-        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
+        PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
+        PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
+        PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
+        PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
+        PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
         CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
         CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
 
@@ -1001,7 +1001,7 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
           cb; \
         cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
         cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
-        PMC(out1,out2,ca,cb); \
+        PM(out1,out2,ca,cb); \
         }
 #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
         POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
@@ -1009,8 +1009,8 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
         { \
         T da,db; \
         POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
-        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
-        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
+        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
+        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
         }
 
 template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
@@ -1105,7 +1105,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
   for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
     for (size_t k=0; k<l1; ++k)
       for (size_t i=0; i<ido; ++i)
-        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
+        PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
   for (size_t k=0; k<l1; ++k)
     for (size_t i=0; i<ido; ++i)
       {
@@ -1161,7 +1161,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
       for (size_t ik=0; ik<idl1; ++ik)
         {
         T t1=CX2(ik,j), t2=CX2(ik,jc);
-        PMC(CX2(ik,j),CX2(ik,jc),t1,t2);
+        PM(CX2(ik,j),CX2(ik,jc),t1,t2);
         }
   else
     {
@@ -1169,15 +1169,15 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
       for (size_t k=0; k<l1; ++k)
         {
         T t1=CX(0,k,j), t2=CX(0,k,jc);
-        PMC(CX(0,k,j),CX(0,k,jc),t1,t2);
+        PM(CX(0,k,j),CX(0,k,jc),t1,t2);
         for (size_t i=1; i<ido; ++i)
           {
           T x1, x2;
-          PMC(x1,x2,CX(i,k,j),CX(i,k,jc));
+          PM(x1,x2,CX(i,k,j),CX(i,k,jc));
           size_t idij=(j-1)*(ido-1)+i-1;
-          CX(i,k,j) = x1.template special_mul<fwd>(wa[idij]);
+          special_mul<fwd>(x1,wa[idij],CX(i,k,j));
           idij=(jc-1)*(ido-1)+i-1;
-          CX(i,k,jc) = x2.template special_mul<fwd>(wa[idij]);
+          special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
           }
         }
     }
@@ -1333,9 +1333,6 @@ template<typename T0> class rfftp
     void add_factor(size_t factor)
       { fact.push_back({factor, nullptr, nullptr}); }
 
-template<typename T> inline void PM(T &a, T &b, T c, T d)
-  { a=c+d; b=c-d; }
-
 /* (a+ib) = conj(c+id) * (e+if) */
 template<typename T1, typename T2, typename T3> inline void MULPM
   (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f)
@@ -1557,15 +1554,13 @@ template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
         for (size_t i=1; i<=ido-2; i+=2)                      // 112
           {
           T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
-                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
+            t3=C1(i,k,jc), t4=C1(i+1,k,jc);
           T x1=wa[idij]*t1 + wa[idij+1]*t2,
-                 x2=wa[idij]*t2 - wa[idij+1]*t1,
-                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
-                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
-          C1(i  ,k,j ) = x1+x3;
-          C1(i  ,k,jc) = x2-x4;
-          C1(i+1,k,j ) = x2+x4;
-          C1(i+1,k,jc) = x3-x1;
+            x2=wa[idij]*t2 - wa[idij+1]*t1,
+            x3=wa[idij2]*t3 + wa[idij2+1]*t4,
+            x4=wa[idij2]*t4 - wa[idij2+1]*t3;
+          PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
+          PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
           idij+=2;
           idij2+=2;
           }
@@ -1575,11 +1570,7 @@ template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
 
   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
     for (size_t k=0; k<l1; ++k)                              // 122
-      {
-      T t1=C1(0,k,j), t2=C1(0,k,jc);
-      C1(0,k,j ) = t1+t2;
-      C1(0,k,jc) = t2-t1;
-      }
+      MPINPLACE(C1(0,k,jc), C1(0,k,j));
 
 //everything in C
 //memset(ch,0,ip*l1*ido*sizeof(double));
@@ -1961,10 +1952,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
       CH2(ik,0) += CH2(ik,j);
   for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
     for (size_t k=0; k<l1; ++k)
-      {
-      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
-      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
-      }
+      PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
 
   if (ido==1) return;
 
@@ -2171,7 +2159,7 @@ template<typename T0> class fftblue
 
       /* initialize a_k and FFT it */
       for (size_t m=0; m<n; ++m)
-        akf[m] = c[m].template special_mul<fwd>(bk[m]);
+        special_mul<fwd>(c[m],bk[m],akf[m]);
       auto zero = akf[0]*T0(0);
       for (size_t m=n; m<n2; ++m)
         akf[m]=zero;
@@ -2366,7 +2354,7 @@ template<typename T0> class T_dct1
       for (size_t i=1; i<n; ++i)
         c[i] = tmp[2*i-1];
       if (ortho)
-        { c[0]/=sqrt2; c[n-1]/=sqrt2; }
+        { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
       }
 
     size_t length() const { return fftplan.length()/2+1; }
@@ -2439,7 +2427,7 @@ template<typename T0> class T_dcst23
         if (!cosine)
           for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
             swap(c[k], c[kc]);
-        if (ortho) c[0]/=sqrt2;
+        if (ortho) c[0]*=sqrt2*T0(0.5);
         }
       else
         {