diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 403d8c971422686e0e4b8932522000a7ed529f5b..480087e97e77ea65b993dc434982076c6e7aa45e 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -235,14 +235,19 @@ template<typename T> struct cmplx { : Tres(r*other.r-i*other.i, r*other.i+i*other.r); } }; +template<typename T> inline void PM(T &a, T &b, T c, T d) + { a=c+d; b=c-d; } template<typename T> inline void PMINPLACE(T &a, T &b) { T t = a; a+=b; b=t-b; } template<typename T> inline void MPINPLACE(T &a, T &b) { T t = a; a-=b; b=t+b; } -template<typename T> void PMC(T &a, T &b, const T &c, const T &d) - { a = c+d; b = c-d; } template<typename T> cmplx<T> conj(const cmplx<T> &a) { return {a.r, -a.i}; } +template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res) + { + res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) + : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); + } template<typename T> void ROT90(cmplx<T> &a) { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } @@ -615,30 +620,27 @@ template<bool fwd, typename T> void pass2 (size_t ido, size_t l1, for (size_t i=1; i<ido; ++i) { CH(i,k,0) = CC(i,0,k)+CC(i,1,k); - CH(i,k,1) = (CC(i,0,k)-CC(i,1,k)).template special_mul<fwd>(WA(0,i)); + special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1)); } } } #define POCKETFFT_PREP3(idx) \ T t0 = CC(idx,0,k), t1, t2; \ - PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ + PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ { \ - T ca,cb; \ - ca=t0+t1*twr; \ - cb=t2*twi; ROT90(cb); \ - PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ + T ca=t0+t1*twr; \ + T cb{-t2.i*twi, t2.r*twi}; \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ { \ - T ca,cb,da,db; \ - ca=t0+t1*twr; \ - cb=t2*twi; ROT90(cb); \ - PMC(da,db,ca,cb); \ - CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \ + T ca=t0+t1*twr; \ + T cb{-t2.i*twi, t2.r*twi}; \ + special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } template<bool fwd, typename T> void pass3 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, @@ -697,43 +699,42 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1, for (size_t k=0; k<l1; ++k) { T t1, t2, t3, t4; - PMC(t2,t1,CC(0,0,k),CC(0,2,k)); - PMC(t3,t4,CC(0,1,k),CC(0,3,k)); + PM(t2,t1,CC(0,0,k),CC(0,2,k)); + PM(t3,t4,CC(0,1,k),CC(0,3,k)); ROTX90<fwd>(t4); - PMC(CH(0,k,0),CH(0,k,2),t2,t3); - PMC(CH(0,k,1),CH(0,k,3),t1,t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); } else for (size_t k=0; k<l1; ++k) { { T t1, t2, t3, t4; - PMC(t2,t1,CC(0,0,k),CC(0,2,k)); - PMC(t3,t4,CC(0,1,k),CC(0,3,k)); + PM(t2,t1,CC(0,0,k),CC(0,2,k)); + PM(t3,t4,CC(0,1,k),CC(0,3,k)); ROTX90<fwd>(t4); - PMC(CH(0,k,0),CH(0,k,2),t2,t3); - PMC(CH(0,k,1),CH(0,k,3),t1,t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); } for (size_t i=1; i<ido; ++i) { - T c2, c3, c4, t1, t2, t3, t4; + T t1, t2, t3, t4; T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k); - PMC(t2,t1,cc0,cc2); - PMC(t3,t4,cc1,cc3); + PM(t2,t1,cc0,cc2); + PM(t3,t4,cc1,cc3); ROTX90<fwd>(t4); - PMC(CH(i,k,0),c3,t2,t3); - PMC(c2,c4,t1,t4); - CH(i,k,1) = c2.template special_mul<fwd>(WA(0,i)); - CH(i,k,2) = c3.template special_mul<fwd>(WA(1,i)); - CH(i,k,3) = c4.template special_mul<fwd>(WA(2,i)); + CH(i,k,0) = t2+t3; + special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1)); + special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2)); + special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3)); } } } #define POCKETFFT_PREP5(idx) \ T t0 = CC(idx,0,k), t1, t2, t3, t4; \ - PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ - PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ + PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ + PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; @@ -744,7 +745,7 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1, ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ - PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ @@ -754,9 +755,8 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1, ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ - PMC(da,db,ca,cb); \ - CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \ + special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, @@ -805,9 +805,9 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, #define POCKETFFT_PREP7(idx) \ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ - PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ - PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ - PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ + PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ + PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ + PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; @@ -818,7 +818,7 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ cb.i=y1*t7.r y2*t6.r y3*t5.r; \ cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ - PMC(out1,out2,ca,cb); \ + PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) @@ -826,8 +826,8 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, { \ T da,db; \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ - CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \ + special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ } template<bool fwd, typename T> void pass7(size_t ido, size_t l1, @@ -915,8 +915,8 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, for (size_t k=0; k<l1; ++k) { T a0, a1, a2, a3, a4, a5, a6, a7; - PMC(a1,a5,CC(0,1,k),CC(0,5,k)); - PMC(a3,a7,CC(0,3,k),CC(0,7,k)); + PM(a1,a5,CC(0,1,k),CC(0,5,k)); + PM(a3,a7,CC(0,3,k),CC(0,7,k)); PMINPLACE(a1,a3); ROTX90<fwd>(a3); @@ -925,21 +925,21 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, ROTX45<fwd>(a5); ROTX135<fwd>(a7); - PMC(a0,a4,CC(0,0,k),CC(0,4,k)); - PMC(a2,a6,CC(0,2,k),CC(0,6,k)); - PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1); - PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3); + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); ROTX90<fwd>(a6); - PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5); - PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); } else for (size_t k=0; k<l1; ++k) { { T a0, a1, a2, a3, a4, a5, a6, a7; - PMC(a1,a5,CC(0,1,k),CC(0,5,k)); - PMC(a3,a7,CC(0,3,k),CC(0,7,k)); + PM(a1,a5,CC(0,1,k),CC(0,5,k)); + PM(a3,a7,CC(0,3,k),CC(0,7,k)); PMINPLACE(a1,a3); ROTX90<fwd>(a3); @@ -948,38 +948,38 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, ROTX45<fwd>(a5); ROTX135<fwd>(a7); - PMC(a0,a4,CC(0,0,k),CC(0,4,k)); - PMC(a2,a6,CC(0,2,k),CC(0,6,k)); - PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1); - PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3); + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); ROTX90<fwd>(a6); - PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5); - PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); } for (size_t i=1; i<ido; ++i) { T a0, a1, a2, a3, a4, a5, a6, a7; - PMC(a1,a5,CC(i,1,k),CC(i,5,k)); - PMC(a3,a7,CC(i,3,k),CC(i,7,k)); + PM(a1,a5,CC(i,1,k),CC(i,5,k)); + PM(a3,a7,CC(i,3,k),CC(i,7,k)); ROTX90<fwd>(a7); PMINPLACE(a1,a3); ROTX90<fwd>(a3); PMINPLACE(a5,a7); ROTX45<fwd>(a5); ROTX135<fwd>(a7); - PMC(a0,a4,CC(i,0,k),CC(i,4,k)); - PMC(a2,a6,CC(i,2,k),CC(i,6,k)); + PM(a0,a4,CC(i,0,k),CC(i,4,k)); + PM(a2,a6,CC(i,2,k),CC(i,6,k)); PMINPLACE(a0,a2); CH(i,k,0) = a0+a1; - CH(i,k,4) = (a0-a1).template special_mul<fwd>(WA(3,i)); - CH(i,k,2) = (a2+a3).template special_mul<fwd>(WA(1,i)); - CH(i,k,6) = (a2-a3).template special_mul<fwd>(WA(5,i)); + special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4)); + special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2)); + special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6)); ROTX90<fwd>(a6); PMINPLACE(a4,a6); - CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i)); - CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i)); - CH(i,k,3) = (a6+a7).template special_mul<fwd>(WA(2,i)); - CH(i,k,7) = (a6-a7).template special_mul<fwd>(WA(6,i)); + special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1)); + special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5)); + special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3)); + special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7)); } } } @@ -987,11 +987,11 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, #define POCKETFFT_PREP11(idx) \ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ - PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ - PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ - PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ - PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ - PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ + PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ + PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ + PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ + PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ + PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; @@ -1001,7 +1001,7 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, cb; \ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ - PMC(out1,out2,ca,cb); \ + PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) @@ -1009,8 +1009,8 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, { \ T da,db; \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ - CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \ + special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \ } template<bool fwd, typename T> void pass11 (size_t ido, size_t l1, @@ -1105,7 +1105,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip, for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) for (size_t k=0; k<l1; ++k) for (size_t i=0; i<ido; ++i) - PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k)); + PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k)); for (size_t k=0; k<l1; ++k) for (size_t i=0; i<ido; ++i) { @@ -1161,7 +1161,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip, for (size_t ik=0; ik<idl1; ++ik) { T t1=CX2(ik,j), t2=CX2(ik,jc); - PMC(CX2(ik,j),CX2(ik,jc),t1,t2); + PM(CX2(ik,j),CX2(ik,jc),t1,t2); } else { @@ -1169,15 +1169,15 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip, for (size_t k=0; k<l1; ++k) { T t1=CX(0,k,j), t2=CX(0,k,jc); - PMC(CX(0,k,j),CX(0,k,jc),t1,t2); + PM(CX(0,k,j),CX(0,k,jc),t1,t2); for (size_t i=1; i<ido; ++i) { T x1, x2; - PMC(x1,x2,CX(i,k,j),CX(i,k,jc)); + PM(x1,x2,CX(i,k,j),CX(i,k,jc)); size_t idij=(j-1)*(ido-1)+i-1; - CX(i,k,j) = x1.template special_mul<fwd>(wa[idij]); + special_mul<fwd>(x1,wa[idij],CX(i,k,j)); idij=(jc-1)*(ido-1)+i-1; - CX(i,k,jc) = x2.template special_mul<fwd>(wa[idij]); + special_mul<fwd>(x2,wa[idij],CX(i,k,jc)); } } } @@ -1333,9 +1333,6 @@ template<typename T0> class rfftp void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } -template<typename T> inline void PM(T &a, T &b, T c, T d) - { a=c+d; b=c-d; } - /* (a+ib) = conj(c+id) * (e+if) */ template<typename T1, typename T2, typename T3> inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) @@ -1557,15 +1554,13 @@ template<typename T> void radfg(size_t ido, size_t ip, size_t l1, for (size_t i=1; i<=ido-2; i+=2) // 112 { T t1=C1(i,k,j ), t2=C1(i+1,k,j ), - t3=C1(i,k,jc), t4=C1(i+1,k,jc); + t3=C1(i,k,jc), t4=C1(i+1,k,jc); T x1=wa[idij]*t1 + wa[idij+1]*t2, - x2=wa[idij]*t2 - wa[idij+1]*t1, - x3=wa[idij2]*t3 + wa[idij2+1]*t4, - x4=wa[idij2]*t4 - wa[idij2+1]*t3; - C1(i ,k,j ) = x1+x3; - C1(i ,k,jc) = x2-x4; - C1(i+1,k,j ) = x2+x4; - C1(i+1,k,jc) = x3-x1; + x2=wa[idij]*t2 - wa[idij+1]*t1, + x3=wa[idij2]*t3 + wa[idij2+1]*t4, + x4=wa[idij2]*t4 - wa[idij2+1]*t3; + PM(C1(i,k,j),C1(i+1,k,jc),x3,x1); + PM(C1(i+1,k,j),C1(i,k,jc),x2,x4); idij+=2; idij2+=2; } @@ -1575,11 +1570,7 @@ template<typename T> void radfg(size_t ido, size_t ip, size_t l1, for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123 for (size_t k=0; k<l1; ++k) // 122 - { - T t1=C1(0,k,j), t2=C1(0,k,jc); - C1(0,k,j ) = t1+t2; - C1(0,k,jc) = t2-t1; - } + MPINPLACE(C1(0,k,jc), C1(0,k,j)); //everything in C //memset(ch,0,ip*l1*ido*sizeof(double)); @@ -1961,10 +1952,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, CH2(ik,0) += CH2(ik,j); for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124 for (size_t k=0; k<l1; ++k) - { - CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc); - CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc); - } + PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc)); if (ido==1) return; @@ -2171,7 +2159,7 @@ template<typename T0> class fftblue /* initialize a_k and FFT it */ for (size_t m=0; m<n; ++m) - akf[m] = c[m].template special_mul<fwd>(bk[m]); + special_mul<fwd>(c[m],bk[m],akf[m]); auto zero = akf[0]*T0(0); for (size_t m=n; m<n2; ++m) akf[m]=zero; @@ -2366,7 +2354,7 @@ template<typename T0> class T_dct1 for (size_t i=1; i<n; ++i) c[i] = tmp[2*i-1]; if (ortho) - { c[0]/=sqrt2; c[n-1]/=sqrt2; } + { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); } } size_t length() const { return fftplan.length()/2+1; } @@ -2439,7 +2427,7 @@ template<typename T0> class T_dcst23 if (!cosine) for (size_t k=0, kc=N-1; k<kc; ++k, --kc) swap(c[k], c[kc]); - if (ortho) c[0]/=sqrt2; + if (ortho) c[0]*=sqrt2*T0(0.5); } else {