diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 67087e691572f1a4d18518dbee171516c8a52a2d..34a129f69b0150beafb53ada1f85b8f625f45677 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -502,10 +502,6 @@ struct util // hack to avoid duplicate symbols #endif }; -#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] -#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] -#define WA(x,i) wa[(i)-1+(x)*(ido-1)] - // // complex FFTPACK transforms // @@ -531,6 +527,13 @@ template void pass2 (size_t ido, size_t l1, { constexpr size_t cdim=2; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void pass2 (size_t ido, size_t l1, } } -#define PREP3(idx) \ +#define POCKETFFT_PREP3(idx) \ T t0 = CC(idx,0,k), t1, t2; \ PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; -#define PARTSTEP3a(u1,u2,twr,twi) \ +#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ { \ T ca,cb; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } -#define PARTSTEP3b(u1,u2,twr,twi) \ +#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ { \ T ca,cb,da,db; \ ca=t0+t1*twr; \ @@ -577,36 +580,50 @@ template void pass3 (size_t ido, size_t l1, constexpr T0 tw1r=-0.5, tw1i= (bwd ? 1: -1) * T0(0.8660254037844386467637231707529362L); + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void pass4 (size_t ido, size_t l1, const T * RESTRICT cc, T * RESTRICT ch, const cmplx * RESTRICT wa) { constexpr size_t cdim=4; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void pass4 (size_t ido, size_t l1, } } -#define PREP5(idx) \ +#define POCKETFFT_PREP5(idx) \ T t0 = CC(idx,0,k), t1, t2, t3, t4; \ PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; -#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ +#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ @@ -661,7 +678,7 @@ template void pass4 (size_t ido, size_t l1, PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } -#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ +#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb,da,db; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ @@ -681,35 +698,42 @@ template void pass5 (size_t ido, size_t l1, tw2r= T0(-0.8090169943749474241022934171828191L), tw2i= (bwd ? 1: -1) * T0(0.5877852522924731291687059546390728L); + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void pass5 (size_t ido, size_t l1, CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; -#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ +#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ { \ T ca,cb; \ ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ @@ -726,12 +750,12 @@ template void pass5 (size_t ido, size_t l1, cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ PMC(out1,out2,ca,cb); \ } -#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ - PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) -#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ +#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ { \ T da,db; \ - PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } @@ -747,37 +771,44 @@ template void pass7(size_t ido, size_t l1, tw3r= T0(-0.9009688679024191262361023195074451L), tw3i= (bwd ? 1 : -1) * T0(0.433883739117558120475768332848359L); + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void ROTX45(T &a) { @@ -803,6 +834,13 @@ template void pass8 (size_t ido, size_t l1, { constexpr size_t cdim=8; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void pass8 (size_t ido, size_t l1, } -#define PREP11(idx) \ +#define POCKETFFT_PREP11(idx) \ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ @@ -923,10 +961,17 @@ template void pass11 (size_t ido, size_t l1, tw5r= T0(-0.9594929736144973898903680570663277L), tw5i= (bwd ? 1 : -1) * T0(0.2817325568414296977114179153466169L); + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto WA = [wa, ido](size_t x, size_t i) + { return wa[i-1+x*(ido-1)]; }; + if (ido==1) for (size_t k=0; k void pass11 (size_t ido, size_t l1, for (size_t k=0; k void pass11 (size_t ido, size_t l1, } for (size_t i=1; i void pass11 (size_t ido, size_t l1, #undef PARTSTEP11 #undef PARTSTEP11a0 #undef PARTSTEP11a -#undef PREP11 - -#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))] -#define CX2(a,b) cc[(a)+idl1*(b)] -#define CH2(a,b) ch[(a)+idl1*(b)] +#undef POCKETFFT_PREP11 template void passg (size_t ido, size_t ip, size_t l1, T * RESTRICT cc, T * RESTRICT ch, const cmplx * RESTRICT wa, @@ -973,6 +1014,17 @@ template void passg (size_t ido, size_t ip, size_t ipph = (ip+1)/2; size_t idl1 = ido*l1; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T& + { return cc[a+ido*(b+l1*c)]; }; + auto CX2 = [cc, idl1](size_t a, size_t b) -> T& + { return cc[a+idl1*b]; }; + auto CH2 = [ch, idl1](size_t a, size_t b) -> const T& + { return ch[a+idl1*b]; }; + arr> wal(ip); wal[0] = cmplx(1., 0.); for (size_t i=1; i void passg (size_t ido, size_t ip, } } -#undef CH2 -#undef CX2 -#undef CX - template void pass_all(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } @@ -1114,10 +1162,6 @@ template void pass_all(T c[], T0 fct) c[i] *= fct; } -#undef WA -#undef CC -#undef CH - public: template void forward(T c[], T0 fct) { pass_all(c, fct); } @@ -1220,7 +1264,6 @@ template class rfftp void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } -#define WA(x,i) wa[(i)+(x)*(ido-1)] template inline void PM(T &a, T &b, T c, T d) { a=c+d; b=c-d; } @@ -1229,14 +1272,17 @@ template inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) { a=c*e+d*f; b=c*f-d*e; } -#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] -#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] - template void radf2 (size_t ido, size_t l1, const T * RESTRICT cc, T * RESTRICT ch, const T0 * RESTRICT wa) { constexpr size_t cdim=2; + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+cdim*c)]; }; + for (size_t k=0; k void radf2 (size_t ido, size_t l1, } // a2=a+b; b2=i*(b-a); -#define REARRANGE(rx, ix, ry, iy) \ +#define POCKETFFT_REARRANGE(rx, ix, ry, iy) \ {\ auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \ rx=t1; ix=t3; ry=t4; iy=t2; \ @@ -1270,6 +1316,12 @@ template void radf3(size_t ido, size_t l1, constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+cdim*c)]; }; + for (size_t k=0; k void radf3(size_t ido, size_t l1, T di2, di3, dr2, dr3; MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2 - REARRANGE(dr2, di2, dr3, di3); + POCKETFFT_REARRANGE(dr2, di2, dr3, di3); CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add CH(i ,0,k) = CC(i ,k,0)+di2; T tr2 = CC(i-1,k,0)+taur*dr2; // c add @@ -1303,6 +1355,12 @@ template void radf4(size_t ido, size_t l1, constexpr size_t cdim=4; constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+cdim*c)]; }; + for (size_t k=0; k void radf5(size_t ido, size_t l1, tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+cdim*c)]; }; + for (size_t k=0; k void radf5(size_t ido, size_t l1, MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3)); MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4)); - REARRANGE(dr2, di2, dr5, di5); - REARRANGE(dr3, di3, dr4, di4); + POCKETFFT_REARRANGE(dr2, di2, dr5, di5); + POCKETFFT_REARRANGE(dr3, di3, dr4, di4); CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3; CH(i ,0,k)=CC(i ,k,0)+di2+di3; T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3; @@ -1386,13 +1450,8 @@ template void radf5(size_t ido, size_t l1, } } -#undef CC -#undef CH -#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))] -#define C2(a,b) cc[(a)+idl1*(b)] -#define CH2(a,b) ch[(a)+idl1*(b)] -#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] -#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] +#undef POCKETFFT_REARRANGE + template void radfg(size_t ido, size_t ip, size_t l1, T * RESTRICT cc, T * RESTRICT ch, const T0 * RESTRICT wa, const T0 * RESTRICT csarr) @@ -1401,6 +1460,17 @@ template void radfg(size_t ido, size_t ip, size_t l1, size_t ipph=(ip+1)/2; size_t idl1 = ido*l1; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1] (size_t a, size_t b) -> T& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1] (size_t a, size_t b) -> T& + { return ch[a+idl1*b]; }; + if (ido>1) { for (size_t j=1, jc=ip-1; j void radfg(size_t ido, size_t ip, size_t l1, } } } -#undef C1 -#undef C2 -#undef CH2 - -#undef CH -#undef CC -#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] -#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] template void radb2(size_t ido, size_t l1, const T * RESTRICT cc, T * RESTRICT ch, const T0 * RESTRICT wa) { constexpr size_t cdim=2; + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + for (size_t k=0; k void radb3(size_t ido, size_t l1, constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + for (size_t k=0; k void radb4(size_t ido, size_t l1, constexpr size_t cdim=4; constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + for (size_t k=0; k void radb5(size_t ido, size_t l1, tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); + auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + for (size_t k=0; k void radb5(size_t ido, size_t l1, } } -#undef CH -#undef CC -#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] -#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] -#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))] -#define C2(a,b) cc[(a)+idl1*(b)] -#define CH2(a,b) ch[(a)+idl1*(b)] - template void radbg(size_t ido, size_t ip, size_t l1, T * RESTRICT cc, T * RESTRICT ch, const T0 * RESTRICT wa, const T0 * RESTRICT csarr) @@ -1712,6 +1790,17 @@ template void radbg(size_t ido, size_t ip, size_t l1, size_t ipph=(ip+1)/ 2; size_t idl1 = ido*l1; + auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1](size_t a, size_t b) -> T& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1](size_t a, size_t b) -> T& + { return ch[a+idl1*b]; }; + for (size_t k=0; k void radbg(size_t ido, size_t ip, size_t l1, } } } -#undef C1 -#undef C2 -#undef CH2 - -#undef CH -#undef CC - -#undef WA template void copy_and_norm(T *c, T *p1, size_t n, T0 fct) {