From fb991029fd64149ee5af64e659a042764a40b6eb Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 2 Aug 2019 16:19:05 +0200 Subject: [PATCH 1/9] code cosmetics --- pocketfft_hdronly.h | 200 +++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 106 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 403d8c9..480087e 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -235,14 +235,19 @@ template struct cmplx { : Tres(r*other.r-i*other.i, r*other.i+i*other.r); } }; +template inline void PM(T &a, T &b, T c, T d) + { a=c+d; b=c-d; } template inline void PMINPLACE(T &a, T &b) { T t = a; a+=b; b=t-b; } template inline void MPINPLACE(T &a, T &b) { T t = a; a-=b; b=t+b; } -template void PMC(T &a, T &b, const T &c, const T &d) - { a = c+d; b = c-d; } template cmplx conj(const cmplx &a) { return {a.r, -a.i}; } +template void special_mul (const cmplx &v1, const cmplx &v2, cmplx &res) + { + res = fwd ? cmplx(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) + : cmplx(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); + } template void ROT90(cmplx &a) { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } @@ -615,30 +620,27 @@ template void pass2 (size_t ido, size_t l1, for (size_t i=1; i(WA(0,i)); + special_mul(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1)); } } } #define POCKETFFT_PREP3(idx) \ T t0 = CC(idx,0,k), t1, t2; \ - PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ + PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ { \ - T ca,cb; \ - ca=t0+t1*twr; \ - cb=t2*twi; ROT90(cb); \ - PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ + T ca=t0+t1*twr; \ + T cb{-t2.i*twi, t2.r*twi}; \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ { \ - T ca,cb,da,db; \ - ca=t0+t1*twr; \ - cb=t2*twi; ROT90(cb); \ - PMC(da,db,ca,cb); \ - CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ + T ca=t0+t1*twr; \ + T cb{-t2.i*twi, t2.r*twi}; \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } template void pass3 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, @@ -697,43 +699,42 @@ template void pass4 (size_t ido, size_t l1, for (size_t k=0; k(t4); - PMC(CH(0,k,0),CH(0,k,2),t2,t3); - PMC(CH(0,k,1),CH(0,k,3),t1,t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); } else for (size_t k=0; k(t4); - PMC(CH(0,k,0),CH(0,k,2),t2,t3); - PMC(CH(0,k,1),CH(0,k,3),t1,t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); } for (size_t i=1; i(t4); - PMC(CH(i,k,0),c3,t2,t3); - PMC(c2,c4,t1,t4); - CH(i,k,1) = c2.template special_mul(WA(0,i)); - CH(i,k,2) = c3.template special_mul(WA(1,i)); - CH(i,k,3) = c4.template special_mul(WA(2,i)); + CH(i,k,0) = t2+t3; + special_mul(t1+t4,WA(0,i),CH(i,k,1)); + special_mul(t2-t3,WA(1,i),CH(i,k,2)); + special_mul(t1-t4,WA(2,i),CH(i,k,3)); } } } #define POCKETFFT_PREP5(idx) \ T t0 = CC(idx,0,k), t1, t2, t3, t4; \ - PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ - PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ + PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ + PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; @@ -744,7 +745,7 @@ template void pass4 (size_t ido, size_t l1, ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ - PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ @@ -754,9 +755,8 @@ template void pass4 (size_t ido, size_t l1, ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ - PMC(da,db,ca,cb); \ - CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } template void pass5 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, @@ -805,9 +805,9 @@ template void pass5 (size_t ido, size_t l1, #define POCKETFFT_PREP7(idx) \ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ - PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ - PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ - PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ + PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ + PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ + PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; @@ -818,7 +818,7 @@ template void pass5 (size_t ido, size_t l1, ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ cb.i=y1*t7.r y2*t6.r y3*t5.r; \ cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ - PMC(out1,out2,ca,cb); \ + PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) @@ -826,8 +826,8 @@ template void pass5 (size_t ido, size_t l1, { \ T da,db; \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ - CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ } template void pass7(size_t ido, size_t l1, @@ -915,8 +915,8 @@ template void pass8 (size_t ido, size_t l1, for (size_t k=0; k(a3); @@ -925,21 +925,21 @@ template void pass8 (size_t ido, size_t l1, ROTX45(a5); ROTX135(a7); - PMC(a0,a4,CC(0,0,k),CC(0,4,k)); - PMC(a2,a6,CC(0,2,k),CC(0,6,k)); - PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1); - PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3); + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); ROTX90(a6); - PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5); - PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); } else for (size_t k=0; k(a3); @@ -948,38 +948,38 @@ template void pass8 (size_t ido, size_t l1, ROTX45(a5); ROTX135(a7); - PMC(a0,a4,CC(0,0,k),CC(0,4,k)); - PMC(a2,a6,CC(0,2,k),CC(0,6,k)); - PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1); - PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3); + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); ROTX90(a6); - PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5); - PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); } for (size_t i=1; i(a7); PMINPLACE(a1,a3); ROTX90(a3); PMINPLACE(a5,a7); ROTX45(a5); ROTX135(a7); - PMC(a0,a4,CC(i,0,k),CC(i,4,k)); - PMC(a2,a6,CC(i,2,k),CC(i,6,k)); + PM(a0,a4,CC(i,0,k),CC(i,4,k)); + PM(a2,a6,CC(i,2,k),CC(i,6,k)); PMINPLACE(a0,a2); CH(i,k,0) = a0+a1; - CH(i,k,4) = (a0-a1).template special_mul(WA(3,i)); - CH(i,k,2) = (a2+a3).template special_mul(WA(1,i)); - CH(i,k,6) = (a2-a3).template special_mul(WA(5,i)); + special_mul(a0-a1,WA(3,i),CH(i,k,4)); + special_mul(a2+a3,WA(1,i),CH(i,k,2)); + special_mul(a2-a3,WA(5,i),CH(i,k,6)); ROTX90(a6); PMINPLACE(a4,a6); - CH(i,k,1) = (a4+a5).template special_mul(WA(0,i)); - CH(i,k,5) = (a4-a5).template special_mul(WA(4,i)); - CH(i,k,3) = (a6+a7).template special_mul(WA(2,i)); - CH(i,k,7) = (a6-a7).template special_mul(WA(6,i)); + special_mul(a4+a5,WA(0,i),CH(i,k,1)); + special_mul(a4-a5,WA(4,i),CH(i,k,5)); + special_mul(a6+a7,WA(2,i),CH(i,k,3)); + special_mul(a6-a7,WA(6,i),CH(i,k,7)); } } } @@ -987,11 +987,11 @@ template void pass8 (size_t ido, size_t l1, #define POCKETFFT_PREP11(idx) \ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ - PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ - PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ - PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ - PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ - PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ + PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ + PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ + PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ + PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ + PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; @@ -1001,7 +1001,7 @@ template void pass8 (size_t ido, size_t l1, cb; \ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ - PMC(out1,out2,ca,cb); \ + PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) @@ -1009,8 +1009,8 @@ template void pass8 (size_t ido, size_t l1, { \ T da,db; \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ - CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ - CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ } template void pass11 (size_t ido, size_t l1, @@ -1105,7 +1105,7 @@ template void passg (size_t ido, size_t ip, for (size_t j=1, jc=ip-1; j void passg (size_t ido, size_t ip, for (size_t ik=0; ik void passg (size_t ido, size_t ip, for (size_t k=0; k(wa[idij]); + special_mul(x1,wa[idij],CX(i,k,j)); idij=(jc-1)*(ido-1)+i-1; - CX(i,k,jc) = x2.template special_mul(wa[idij]); + special_mul(x2,wa[idij],CX(i,k,jc)); } } } @@ -1333,9 +1333,6 @@ template class rfftp void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } -template inline void PM(T &a, T &b, T c, T d) - { a=c+d; b=c-d; } - /* (a+ib) = conj(c+id) * (e+if) */ template inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) @@ -1557,15 +1554,13 @@ template void radfg(size_t ido, size_t ip, size_t l1, for (size_t i=1; i<=ido-2; i+=2) // 112 { T t1=C1(i,k,j ), t2=C1(i+1,k,j ), - t3=C1(i,k,jc), t4=C1(i+1,k,jc); + t3=C1(i,k,jc), t4=C1(i+1,k,jc); T x1=wa[idij]*t1 + wa[idij+1]*t2, - x2=wa[idij]*t2 - wa[idij+1]*t1, - x3=wa[idij2]*t3 + wa[idij2+1]*t4, - x4=wa[idij2]*t4 - wa[idij2+1]*t3; - C1(i ,k,j ) = x1+x3; - C1(i ,k,jc) = x2-x4; - C1(i+1,k,j ) = x2+x4; - C1(i+1,k,jc) = x3-x1; + x2=wa[idij]*t2 - wa[idij+1]*t1, + x3=wa[idij2]*t3 + wa[idij2+1]*t4, + x4=wa[idij2]*t4 - wa[idij2+1]*t3; + PM(C1(i,k,j),C1(i+1,k,jc),x3,x1); + PM(C1(i+1,k,j),C1(i,k,jc),x2,x4); idij+=2; idij2+=2; } @@ -1575,11 +1570,7 @@ template void radfg(size_t ido, size_t ip, size_t l1, for (size_t j=1, jc=ip-1; j void radbg(size_t ido, size_t ip, size_t l1, CH2(ik,0) += CH2(ik,j); for (size_t j=1, jc=ip-1; j class fftblue /* initialize a_k and FFT it */ for (size_t m=0; m(bk[m]); + special_mul(c[m],bk[m],akf[m]); auto zero = akf[0]*T0(0); for (size_t m=n; m class T_dct1 for (size_t i=1; i class T_dcst23 if (!cosine) for (size_t k=0, kc=N-1; k Date: Fri, 2 Aug 2019 18:04:01 +0200 Subject: [PATCH 2/9] make code a bit more compact --- pocketfft_hdronly.h | 253 ++++++++++++++++++++------------------------ 1 file changed, 113 insertions(+), 140 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 480087e..e6bfd73 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -595,7 +595,7 @@ template class cfftp template void pass2 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=2; @@ -644,7 +644,7 @@ template void pass2 (size_t ido, size_t l1, } template void pass3 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=3; constexpr T0 tw1r=-0.5, @@ -684,7 +684,7 @@ template void pass3 (size_t ido, size_t l1, template void pass4 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=4; @@ -760,7 +760,7 @@ template void pass4 (size_t ido, size_t l1, } template void pass5 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=5; constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), @@ -832,7 +832,7 @@ template void pass5 (size_t ido, size_t l1, template void pass7(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=7; constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), @@ -881,7 +881,7 @@ template void pass7(size_t ido, size_t l1, #undef POCKETFFT_PARTSTEP7a #undef POCKETFFT_PREP7 -template void ROTX45(T &a) +template void ROTX45(T &a) const { constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); if (fwd) @@ -889,7 +889,7 @@ template void ROTX45(T &a) else { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } } -template void ROTX135(T &a) +template void ROTX135(T &a) const { constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); if (fwd) @@ -900,7 +900,7 @@ template void ROTX135(T &a) template void pass8 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=8; @@ -1015,7 +1015,7 @@ template void pass8 (size_t ido, size_t l1, template void pass11 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const cmplx * POCKETFFT_RESTRICT wa) + const cmplx * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=11; constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), @@ -1077,7 +1077,7 @@ template void pass11 (size_t ido, size_t l1, template void passg (size_t ido, size_t ip, size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa, - const cmplx * POCKETFFT_RESTRICT csarr) + const cmplx * POCKETFFT_RESTRICT csarr) const { const size_t cdim=ip; size_t ipph = (ip+1)/2; @@ -1183,7 +1183,7 @@ template void passg (size_t ido, size_t ip, } } -template void pass_all(T c[], T0 fct) +template void pass_all(T c[], T0 fct) const { if (length==1) { c[0]*=fct; return; } size_t l1=1; @@ -1232,11 +1232,8 @@ template void pass_all(T c[], T0 fct) } public: - template void forward(T c[], T0 fct) - { pass_all(c, fct); } - - template void backward(T c[], T0 fct) - { pass_all(c, fct); } + template void exec(T c[], T0 fct, bool fwd) const + { fwd ? pass_all(c, fct) : pass_all(c, fct); } private: POCKETFFT_NOINLINE void factorize() @@ -1335,12 +1332,12 @@ template class rfftp /* (a+ib) = conj(c+id) * (e+if) */ template inline void MULPM - (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) + (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const { a=c*e+d*f; b=c*f-d*e; } template void radf2 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=2; @@ -1379,7 +1376,7 @@ template void radf2 (size_t ido, size_t l1, template void radf3(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); @@ -1419,7 +1416,7 @@ template void radf3(size_t ido, size_t l1, template void radf4(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=4; constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); @@ -1467,7 +1464,7 @@ template void radf4(size_t ido, size_t l1, template void radf5(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), @@ -1524,7 +1521,7 @@ template void radf5(size_t ido, size_t l1, template void radfg(size_t ido, size_t ip, size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) + const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const { const size_t cdim=ip; size_t ipph=(ip+1)/2; @@ -1666,7 +1663,7 @@ template void radfg(size_t ido, size_t ip, size_t l1, template void radb2(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=2; @@ -1698,7 +1695,7 @@ template void radb2(size_t ido, size_t l1, template void radb3(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); @@ -1739,7 +1736,7 @@ template void radb3(size_t ido, size_t l1, template void radb4(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=4; constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); @@ -1792,7 +1789,7 @@ template void radb4(size_t ido, size_t l1, template void radb5(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa) + const T0 * POCKETFFT_RESTRICT wa) const { constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), @@ -1852,7 +1849,7 @@ template void radb5(size_t ido, size_t l1, template void radbg(size_t ido, size_t ip, size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, - const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) + const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const { const size_t cdim=ip; size_t ipph=(ip+1)/ 2; @@ -1985,7 +1982,7 @@ template void radbg(size_t ido, size_t ip, size_t l1, } } - template void copy_and_norm(T *c, T *p1, size_t n, T0 fct) + template void copy_and_norm(T *c, T *p1, size_t n, T0 fct) const { if (p1!=c) { @@ -2002,60 +1999,51 @@ template void radbg(size_t ido, size_t ip, size_t l1, } public: - template void forward(T c[], T0 fct) + template void exec(T c[], T0 fct, bool r2hc) const { if (length==1) { c[0]*=fct; return; } - size_t n=length; - size_t l1=n, nf=fact.size(); + size_t n=length, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); - for(size_t k1=0; k1 void backward(T c[], T0 fct) - { - if (length==1) { c[0]*=fct; return; } - size_t n=length; - size_t l1=1, nf=fact.size(); - arr ch(n); - T *p1=c, *p2=ch.data(); + if (r2hc) + for(size_t k1=0, l1=n; k1 class fftblue arr> mem; cmplx *bk, *bkf; - template void fft(cmplx c[], T0 fct) + template void fft(cmplx c[], T0 fct) const { arr> akf(n2); @@ -2164,14 +2152,14 @@ template class fftblue for (size_t m=n; m(bkf[m]); /* inverse FFT */ - plan.backward (akf.data(),1.); + plan.exec (akf.data(),1.,false); /* multiply by b_k */ for (size_t m=0; m class fftblue bkf[m] = bkf[n2-m] = bk[m]*xn2; for (size_t m=n;m<=(n2-n);++m) bkf[m].Set(0.,0.); - plan.forward(bkf,1.); + plan.exec(bkf,1.,true); } - template void backward(cmplx c[], T0 fct) - { fft(c,fct); } + template void exec(cmplx c[], T0 fct, bool fwd) const + { fwd ? fft(c,fct) : fft(c,fct); } - template void forward(cmplx c[], T0 fct) - { fft(c,fct); } - - template void backward_r(T c[], T0 fct) - { - arr> tmp(n); - tmp[0].Set(c[0],c[0]*0); - memcpy (reinterpret_cast(tmp.data()+1), - reinterpret_cast(c+1), (n-1)*sizeof(T)); - if ((n&1)==0) tmp[n/2].i=T0(0)*c[0]; - for (size_t m=1; 2*m(tmp.data(),fct); - for (size_t m=0; m void forward_r(T c[], T0 fct) + template void exec_r(T c[], T0 fct, bool fwd) { arr> tmp(n); - auto zero = T0(0)*c[0]; - for (size_t m=0; m(tmp.data(),fct); - c[0] = tmp[0].r; - memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); + if (fwd) + { + auto zero = T0(0)*c[0]; + for (size_t m=0; m(tmp.data(),fct); + c[0] = tmp[0].r; + memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); + } + else + { + tmp[0].Set(c[0],c[0]*0); + memcpy (reinterpret_cast(tmp.data()+1), + reinterpret_cast(c+1), (n-1)*sizeof(T)); + if ((n&1)==0) tmp[n/2].i=T0(0)*c[0]; + for (size_t m=1; 2*m(tmp.data(),fct); + for (size_t m=0; m class pocketfft_c packplan=unique_ptr>(new cfftp(length)); } - template POCKETFFT_NOINLINE void backward(cmplx c[], T0 fct) const - { packplan ? packplan->backward(c,fct) : blueplan->backward(c,fct); } - - template POCKETFFT_NOINLINE void forward(cmplx c[], T0 fct) const - { packplan ? packplan->forward(c,fct) : blueplan->forward(c,fct); } + template POCKETFFT_NOINLINE void exec(cmplx c[], T0 fct, bool fwd) const + { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); } size_t length() const { return len; } }; @@ -2309,17 +2292,8 @@ template class pocketfft_r packplan=unique_ptr>(new rfftp(length)); } - template POCKETFFT_NOINLINE void backward(T c[], T0 fct) const - { - packplan ? packplan->backward(c,fct) - : blueplan->backward_r(c,fct); - } - - template POCKETFFT_NOINLINE void forward(T c[], T0 fct) const - { - packplan ? packplan->forward(c,fct) - : blueplan->forward_r(c,fct); - } + template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const + { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); } size_t length() const { return len; } }; @@ -2349,7 +2323,7 @@ template class T_dct1 tmp[0] = c[0]; for (size_t i=1; i class T_dst1 tmp[0] = tmp[n+1] = c[0]*0; for (size_t i=0; i class T_dcst23 if ((N&1)==0) c[N-1]*=2; for (size_t k=1; k class T_dcst23 } if ((N&1)==0) c[NS2] *= 2*twiddle[NS2-1]; - fftplan.forward(c, fct); + fftplan.exec(c, fct, true); for (size_t k=1; k class T_dcst4 for (; iforward(y.data(), fct); + rfft->exec(y.data(), fct, true); { size_t i=0; for (; i+i+1 class T_dcst4 y[i].Set(c[2*i],c[N-1-2*i]); y[i] *= C2[i]; } - fft->forward(y.data(), fct); + fft->exec(y.data(), fct, true); for(size_t i=0, ic=n2-1; i POCKETFFT_NOINLINE void general_c( tdatav[i].r[j] = tin[it.iofs(j,i)].r; tdatav[i].i[j] = tin[it.iofs(j,i)].i; } - forward ? plan->forward (tdatav, fct) : plan->backward(tdatav, fct); + plan->exec (tdatav, fct, forward); for (size_t i=0; i POCKETFFT_NOINLINE void general_c( if (buf != &tin[it.iofs(0)]) for (size_t i=0; iforward (buf, fct) : plan->backward(buf, fct); + plan->exec (buf, fct, forward); if (buf != &out[it.oofs(0)]) for (size_t i=0; i POCKETFFT_NOINLINE void general_hartley( for (size_t i=0; iforward(tdatav, fct); + plan->exec(tdatav, fct, true); for (size_t j=0; j POCKETFFT_NOINLINE void general_hartley( auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdata, fct); + plan->exec(tdata, fct, true); // Hartley order out[it.oofs(0)] = tdata[0]; size_t i=1, i1=1, i2=len-1; @@ -3072,7 +3046,7 @@ template POCKETFFT_NOINLINE void general_r2c( for (size_t i=0; iforward(tdatav, fct); + plan->exec(tdatav, fct, true); for (size_t j=0; j POCKETFFT_NOINLINE void general_r2c( auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdata, fct); + plan->exec(tdata, fct, true); out[it.oofs(0)].Set(tdata[0]); size_t i=1, ii=1; if (forward) @@ -3145,7 +3119,7 @@ template POCKETFFT_NOINLINE void general_c2r( for (size_t j=0; jbackward(tdatav, fct); + plan->exec(tdatav, fct, false); for (size_t i=0; i POCKETFFT_NOINLINE void general_c2r( if (ibackward(tdata, fct); + plan->exec(tdata, fct, false); for (size_t i=0; i POCKETFFT_NOINLINE void general_r( if ((!r2c) && forward) for (size_t i=2; iforward (tdatav, fct) - : plan->backward(tdatav, fct); + plan->exec (tdatav, fct, forward); if (r2c && (!forward)) for (size_t i=2; i POCKETFFT_NOINLINE void general_r( if ((!r2c) && forward) for (size_t i=2; iforward(buf, fct) : plan->backward(buf, fct); + plan->exec(buf, fct, forward); if (r2c && (!forward)) for (size_t i=2; i Date: Sat, 3 Aug 2019 17:17:44 +0100 Subject: [PATCH 3/9] Use alias templates for vtype --- pocketfft_hdronly.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index e6bfd73..b1d5a4d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2812,6 +2812,8 @@ template<> struct VTYPE { using type = long double __attribute__ ((vector_size (VLEN::val*sizeof(long double)))); }; + +template using vtype_t = typename VTYPE::type; #endif template arr alloc_tmp(const shape_t &shape, @@ -2866,9 +2868,8 @@ template POCKETFFT_NOINLINE void general_c( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); + auto tdatav = reinterpret_cast> *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_hartley( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_dcst( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_r2c( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_c2r( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t j=0; j POCKETFFT_NOINLINE void general_r( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i Date: Sat, 3 Aug 2019 17:57:40 +0100 Subject: [PATCH 4/9] Isolate copying into its own function --- pocketfft_hdronly.h | 178 +++++++++++++++++++++++++------------------- 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index b1d5a4d..305fa4d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2844,6 +2844,57 @@ template arr alloc_tmp(const shape_t &shape, #define POCKETFFT_NTHREADS #endif +template void copy_input(const multi_iter &it, + const cndarr> &src, cmplx> *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i void copy_input(const multi_iter &it, + const cndarr &src, vtype_t *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i void copy_input(const multi_iter &it, + const cndarr &src, T *POCKETFFT_RESTRICT dst) + { + if (dst == &src[it.iofs(0)]) return; // in-place + for (size_t i=0; i void copy_output(const multi_iter &it, + const cmplx> *POCKETFFT_RESTRICT src, ndarr> &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + if (src == &dst[it.oofs(0)]) return; // in-place + for (size_t i=0; i POCKETFFT_NOINLINE void general_c( const cndarr> &in, ndarr> &out, const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS) @@ -2870,16 +2921,9 @@ template POCKETFFT_NOINLINE void general_c( { it.advance(vlen); auto tdatav = reinterpret_cast> *>(storage.data()); - for (size_t i=0; iexec (tdatav, fct, forward); - for (size_t i=0; i0) @@ -2888,19 +2932,46 @@ template POCKETFFT_NOINLINE void general_c( auto buf = it.stride_out() == sizeof(cmplx) ? &out[it.oofs(0)] : reinterpret_cast *>(storage.data()); - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; iexec (buf, fct, forward); - if (buf != &out[it.oofs(0)]) - for (size_t i=0; i void copy_hartley(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t j=0; j void copy_hartley(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + dst[it.oofs(0)] = src[0]; + size_t i=1, i1=1, i2=it.length_out()-1; + for (i=1; i POCKETFFT_NOINLINE void general_hartley( const cndarr &in, ndarr &out, const shape_t &axes, T fct, size_t POCKETFFT_NTHREADS) @@ -2927,41 +2998,18 @@ template POCKETFFT_NOINLINE void general_hartley( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec(tdatav, fct, true); - for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdata, fct, true); - // Hartley order - out[it.oofs(0)] = tdata[0]; - size_t i=1, i1=1, i2=len-1; - for (i=1; i POCKETFFT_NOINLINE void general_dcst( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec(tdatav, fct, ortho, type, cosine); - for (size_t i=0; i0) @@ -3009,13 +3053,9 @@ template POCKETFFT_NOINLINE void general_dcst( auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; iexec(buf, fct, ortho, type, cosine); - if (buf != &out[it.oofs(0)]) - for (size_t i=0; i POCKETFFT_NOINLINE void general_r2c( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec(tdatav, fct, true); for (size_t j=0; j POCKETFFT_NOINLINE void general_r2c( { it.advance(1); auto tdata = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdata, fct, true); out[it.oofs(0)].Set(tdata[0]); size_t i=1, ii=1; @@ -3117,9 +3154,7 @@ template POCKETFFT_NOINLINE void general_c2r( tdatav[i][j] = in[it.iofs(j,ii)].r; } plan->exec(tdatav, fct, false); - for (size_t i=0; i0) @@ -3139,8 +3174,7 @@ template POCKETFFT_NOINLINE void general_c2r( tdata[i] = in[it.iofs(ii)].r; } plan->exec(tdata, fct, false); - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( if (r2c && (!forward)) for (size_t i=2; i0) @@ -3192,9 +3222,7 @@ template POCKETFFT_NOINLINE void general_r( auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( if (r2c && (!forward)) for (size_t i=2; i Date: Sat, 3 Aug 2019 20:29:47 +0100 Subject: [PATCH 5/9] Refactor n-dimensional general_x functions --- pocketfft_hdronly.h | 239 +++++++++++++++++--------------------------- 1 file changed, 91 insertions(+), 148 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 305fa4d..895ee0d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2895,24 +2895,29 @@ template void copy_output(const multi_iter &it, dst[it.oofs(i)] = src[i]; } -template POCKETFFT_NOINLINE void general_c( - const cndarr> &in, ndarr> &out, - const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS) +template struct add_vec { using type = vtype_t; }; +template struct add_vec> + { using type = cmplx>; }; +template using add_vec_t = typename add_vec::type; + +template +POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, + const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec) { - shared_ptr> plan; + shared_ptr plan; for (size_t iax=0; iax::val; + constexpr auto vlen = VLEN::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); + plan = get_plan(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { - auto storage = alloc_tmp(in.shape(), len, sizeof(cmplx)); + auto storage = alloc_tmp(in.shape(), len, sizeof(T)); const auto &tin(iax==0? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS @@ -2920,27 +2925,43 @@ template POCKETFFT_NOINLINE void general_c( while (it.remaining()>=vlen) { it.advance(vlen); - auto tdatav = reinterpret_cast> *>(storage.data()); - copy_input(it, tin, tdatav); - plan->exec (tdatav, fct, forward); - copy_output(it, tdatav, out); + auto tdatav = reinterpret_cast *>(storage.data()); + exec(it, in, out, tdatav, *plan, fct); } #endif while (it.remaining()>0) { it.advance(1); - auto buf = it.stride_out() == sizeof(cmplx) ? - &out[it.oofs(0)] : reinterpret_cast *>(storage.data()); - - copy_input(it, tin, buf); - plan->exec (buf, fct, forward); - copy_output(it, buf, out); + auto buf = it.stride_out() == sizeof(T) ? + &out[it.oofs(0)] : reinterpret_cast(storage.data()); + exec(it, in, out, buf, *plan, fct); } } // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes + fct = T0(1); // factor has been applied, use 1 for remaining axes } } +struct ExecC2C + { + bool forward; + + template void operator () ( + const multi_iter &it, const cndarr> &in, + ndarr> &out, T * buf, const pocketfft_c &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, forward); + copy_output(it, buf, out); + } + }; + +template void general_c( + const cndarr> &in, ndarr> &out, + const shape_t &axes, bool forward, T fct, size_t nthreads) + { + general_nd>(in, out, axes, fct, nthreads, ExecC2C{forward}); + } + template void copy_hartley(const multi_iter &it, const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) { @@ -2972,94 +2993,46 @@ template void copy_hartley(const multi_iter &it, dst[it.oofs(i1)] = src[i]; } +struct ExecHartley + { + template void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, + T * buf, const pocketfft_r &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, true); + copy_hartley(it, buf, out); + } + }; + template POCKETFFT_NOINLINE void general_hartley( const cndarr &in, ndarr &out, const shape_t &axes, T fct, - size_t POCKETFFT_NTHREADS) + size_t nthreads) { - shared_ptr> plan; + general_nd>(in, out, axes, fct, nthreads, ExecHartley{}); + } - for (size_t iax=0; iax::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); +struct ExecDcst + { + bool ortho; + int type; + bool cosine; -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - copy_input(it, tin, tdatav); - plan->exec(tdatav, fct, true); - copy_hartley(it, tdatav, out); - } -#endif - while (it.remaining()>0) - { - it.advance(1); - auto tdata = reinterpret_cast(storage.data()); - copy_input(it, tin, tdata); - plan->exec(tdata, fct, true); - copy_hartley(it, tdata, out); - } -} // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes + template + void operator () (const multi_iter &it, const cndarr &in, + ndarr &out, T * buf, const Tplan &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, ortho, type, cosine); + copy_output(it, buf, out); } - } + }; template POCKETFFT_NOINLINE void general_dcst( const cndarr &in, ndarr &out, const shape_t &axes, - T fct, bool ortho, int type, bool cosine, size_t POCKETFFT_NTHREADS) + T fct, bool ortho, int type, bool cosine, size_t nthreads) { - shared_ptr plan; - - for (size_t iax=0; iax::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan(len); - -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - copy_input(it, tin, tdatav); - plan->exec(tdatav, fct, ortho, type, cosine); - copy_output(it, tdatav, out); - } -#endif - while (it.remaining()>0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] - : reinterpret_cast(storage.data()); - - copy_input(it, tin, buf); - plan->exec(buf, fct, ortho, type, cosine); - copy_output(it, buf, out); - } -} // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes - } + general_nd(in, out, axes, fct, nthreads, ExecDcst{ortho, type, cosine}); } template POCKETFFT_NOINLINE void general_r2c( @@ -3179,62 +3152,32 @@ template POCKETFFT_NOINLINE void general_c2r( } // end of parallel region } -template POCKETFFT_NOINLINE void general_r( - const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, - bool forward, T fct, size_t POCKETFFT_NTHREADS) +struct ExecR2R { - shared_ptr> plan; + bool r2c, forward; - for (size_t iax=0; iax void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, T * buf, + const pocketfft_r &plan, T0 fct) const { - constexpr auto vlen = VLEN::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); - -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - copy_input(it, tin, tdatav); - if ((!r2c) && forward) - for (size_t i=2; iexec (tdatav, fct, forward); - if (r2c && (!forward)) - for (size_t i=2; i0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? - &out[it.oofs(0)] : reinterpret_cast(storage.data()); - - copy_input(it, tin, buf); - if ((!r2c) && forward) - for (size_t i=2; iexec(buf, fct, forward); - if (r2c && (!forward)) - for (size_t i=2; i POCKETFFT_NOINLINE void general_r( + const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, + bool forward, T fct, size_t nthreads) + { + general_nd>(in, out, axes, fct, nthreads, + ExecR2R{r2c, forward}); } #undef POCKETFFT_NTHREADS -- GitLab From 1971f449e91330b4e92565c527caae832ad8815d Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 4 Aug 2019 01:10:08 +0100 Subject: [PATCH 6/9] Remove specific general_ functions {c, r, hartley & dcst} --- pocketfft_hdronly.h | 50 +++++++++++---------------------------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 895ee0d..7afc983 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2955,13 +2955,6 @@ struct ExecC2C } }; -template void general_c( - const cndarr> &in, ndarr> &out, - const shape_t &axes, bool forward, T fct, size_t nthreads) - { - general_nd>(in, out, axes, fct, nthreads, ExecC2C{forward}); - } - template void copy_hartley(const multi_iter &it, const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) { @@ -3005,13 +2998,6 @@ struct ExecHartley } }; -template POCKETFFT_NOINLINE void general_hartley( - const cndarr &in, ndarr &out, const shape_t &axes, T fct, - size_t nthreads) - { - general_nd>(in, out, axes, fct, nthreads, ExecHartley{}); - } - struct ExecDcst { bool ortho; @@ -3028,13 +3014,6 @@ struct ExecDcst } }; -template POCKETFFT_NOINLINE void general_dcst( - const cndarr &in, ndarr &out, const shape_t &axes, - T fct, bool ortho, int type, bool cosine, size_t nthreads) - { - general_nd(in, out, axes, fct, nthreads, ExecDcst{ortho, type, cosine}); - } - template POCKETFFT_NOINLINE void general_r2c( const cndarr &in, ndarr> &out, size_t axis, bool forward, T fct, size_t POCKETFFT_NTHREADS) @@ -3172,14 +3151,6 @@ struct ExecR2R } }; -template POCKETFFT_NOINLINE void general_r( - const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, - bool forward, T fct, size_t nthreads) - { - general_nd>(in, out, axes, fct, nthreads, - ExecR2R{r2c, forward}); - } - #undef POCKETFFT_NTHREADS template void c2c(const shape_t &shape, const stride_t &stride_in, @@ -3191,7 +3162,7 @@ template void c2c(const shape_t &shape, const stride_t &stride_in, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr> ain(data_in, shape, stride_in); ndarr> aout(data_out, shape, stride_out); - general_c(ain, aout, axes, forward, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, ExecC2C{forward}); } template void dct(const shape_t &shape, @@ -3203,12 +3174,13 @@ template void dct(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, true}; if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); } template void dst(const shape_t &shape, @@ -3220,12 +3192,13 @@ template void dst(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, false}; if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); } template void r2c(const shape_t &shape_in, @@ -3309,7 +3282,8 @@ template void r2r_fftpack(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_r(ain, aout, axes, real2hermitian, forward, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); } template void r2r_separable_hartley(const shape_t &shape, @@ -3320,7 +3294,7 @@ template void r2r_separable_hartley(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_hartley(ain, aout, axes, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}); } } // namespace detail -- GitLab From ac891a3056b935185205626a6da9998e51dae8fb Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 4 Aug 2019 02:34:43 +0100 Subject: [PATCH 7/9] Fix hartley transforms by disallowing inplace FFT output --- pocketfft_hdronly.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 7afc983..0b955ba 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2902,7 +2902,8 @@ template using add_vec_t = typename add_vec::type; template POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, - const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec) + const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec, + const bool allow_inplace=true) { shared_ptr plan; @@ -2932,7 +2933,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, while (it.remaining()>0) { it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? + auto buf = allow_inplace && it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); exec(it, in, out, buf, *plan, fct); } @@ -2979,8 +2980,8 @@ template void copy_hartley(const multi_iter &it, size_t i=1, i1=1, i2=it.length_out()-1; for (i=1; i void r2r_separable_hartley(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}); + general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}, + false); } } // namespace detail -- GitLab From 15278280a70f11f1a0ecb0f17851f5dd13d3614c Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sun, 4 Aug 2019 15:29:35 +0200 Subject: [PATCH 8/9] fix --- pocketfft_hdronly.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 0b955ba..d1eec5d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2927,7 +2927,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - exec(it, in, out, tdatav, *plan, fct); + exec(it, tin, out, tdatav, *plan, fct); } #endif while (it.remaining()>0) @@ -2935,7 +2935,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, it.advance(1); auto buf = allow_inplace && it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); - exec(it, in, out, buf, *plan, fct); + exec(it, tin, out, buf, *plan, fct); } } // end of parallel region fct = T0(1); // factor has been applied, use 1 for remaining axes -- GitLab From 8aa2d5cd467187ec895a45498164aa27608dde41 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 4 Aug 2019 16:57:20 +0100 Subject: [PATCH 9/9] xfail long double tests on WSL and platforms without extended precision --- test.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index c718c86..e33837b 100644 --- a/test.py +++ b/test.py @@ -58,9 +58,18 @@ def irfft_scipy(a, axis, inorm=0, out=None, nthreads=1): tol = {np.float32: 6e-7, np.float64: 1.5e-15, np.longfloat: 1e-18} ctype = {np.float32: np.complex64, np.float64: np.complex128, np.longfloat: np.longcomplex} +import platform +on_wsl = "microsoft" in platform.uname()[3].lower() +true_long_double = (np.longfloat != np.float64 and not on_wsl) +dtypes = [np.float32, np.float64, + pytest.param(np.longfloat, marks=pytest.mark.xfail( + not true_long_double, + reason="Long double doesn't offer more precision"))] + + @pmp("len", len1D) @pmp("inorm", [0, 1, 2]) -@pmp("dtype", [np.float32, np.float64, np.longfloat]) +@pmp("dtype", dtypes) def test1D(len, inorm, dtype): a = np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j a = a.astype(ctype[dtype]) @@ -199,7 +208,7 @@ def test_genuine_hartley_2D(shp, axes): @pmp("len", len1D) @pmp("inorm", [0, 1]) # inorm==2 not needed, tested via inverse @pmp("type", [1, 2, 3, 4]) -@pmp("dtype", [np.float32, np.float64, np.longfloat]) +@pmp("dtype", dtypes) def testdcst1D(len, inorm, type, dtype): a = (np.random.rand(len)-0.5).astype(dtype) eps = tol[dtype] -- GitLab