diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 5acd74dc57b1e0d60fd23c4915a4a5bbf3fc102b..1a193939e1fa3cca93facdaeab21ef663853d939 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -438,105 +438,103 @@ template<typename T> class sincos_2pibyn { private: using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; - arr<T> data; + arr<cmplx<T>> data; POCKETFFT_NOINLINE void calc_first_octant(size_t den, - T * POCKETFFT_RESTRICT res) + cmplx<T> * POCKETFFT_RESTRICT res) { size_t n = (den+4)>>3; if (n==0) return; - res[0]=1.; res[1]=0.; + res[0].Set(1.,0.); if (n==1) return; size_t l1 = size_t(sqrt(n)); - arr<Thigh> tmp(2*l1); + arr<cmplx<Thigh>> tmp(l1); for (size_t i=1; i<l1; ++i) { - sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[2*i]); - res[2*i ] = T(tmp[2*i]+1); - res[2*i+1] = T(tmp[2*i+1]); + sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[i].r); + res[i].Set(T(tmp[i].r)+1,T(tmp[i].i)); } size_t start=l1; while(start<n) { - Thigh cs[2]; - sincosm1pi0((Thigh(2*start))/Thigh(den),cs); - res[2*start] = T(cs[0]+1); - res[2*start+1] = T(cs[1]); + cmplx<Thigh> cs; + sincosm1pi0((Thigh(2*start))/Thigh(den),&cs.r); + res[start].Set(T(cs.r+1), T(cs.i)); size_t end = l1; if (start+end>n) end = n-start; for (size_t i=1; i<end; ++i) { - Thigh csx[2]={tmp[2*i], tmp[2*i+1]}; - res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1); - res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]); + cmplx<Thigh> csx=tmp[i]; + res[start+i].Set(T(((cs.r*csx.r - cs.i*csx.i + cs.r) + csx.r) + 1), + T((cs.r*csx.i + cs.i*csx.r) + cs.i + csx.i)); } start += l1; } } - void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res) + void calc_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) { - T * POCKETFFT_RESTRICT p = res+n; + cmplx<T> * POCKETFFT_RESTRICT p = res+n/2; // n is always even here calc_first_octant(n<<1, p); size_t ndone=(n+2)>>2; - size_t i=0, idx1=0, idx2=2*ndone-2; - for (; i+1<ndone; i+=2, idx1+=2, idx2-=2) + size_t i=0, idx1=0, idx2=ndone-1; + for (; i+1<ndone; i+=2, ++idx1, --idx2) { - res[idx1] = p[2*i ]; res[idx1+1] = p[2*i+1]; - res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2]; + res[idx1] = p[i]; + res[idx2].Set(p[i+1].i, p[i+1].r); } if (i!=ndone) - { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; } + res[idx1] = p[i]; } - void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res) + void calc_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) { int ndone=int(n+1)>>1; - T * p = res+n-1; + cmplx<T> * p = res+(n-1)/2; // n is always odd here calc_first_octant(n<<2, p); int i4=0, in=int(n), i=0; for (; i4<=in-i4; ++i, i4+=4) // octant 0 - { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; } + res[i] = p[i4]; for (; i4-in <= 0; ++i, i4+=4) // octant 1 - { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; } + { auto xm = in-i4; res[i].Set(p[xm].i, p[xm].r); } for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 - { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; } + { auto xm = i4-in; res[i].Set(-p[xm].i, p[xm].r); } for (; i<ndone; ++i, i4+=4) // octant 3 - { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; } + { auto xm = 2*in-i4; res[i].Set(-p[xm].r, p[xm].i); } } - void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res) + void fill_first_quadrant(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) { constexpr T hsqt2 = T(0.707106781186547524400844362104849L); size_t quart = n>>2; if ((n&7)==0) - res[quart] = res[quart+1] = hsqt2; - for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2) - { res[j] = res[i+1]; res[j+1] = res[i]; } + res[quart/2].Set(hsqt2, hsqt2); + for (size_t i=1, j=quart-1; 2*i<quart; ++i, --j) + { res[j].Set(res[i].i, res[i].r); } } - POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res) + POCKETFFT_NOINLINE void fill_first_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) { size_t half = n>>1; if ((n&3)==0) - for (size_t i=0; i<half; i+=2) - { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; } + for (size_t i=0; i<half/2; ++i) + res[i+half/2].Set(-res[i].i, res[i].r); else - for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2) - { res[j] = -res[i]; res[j+1] = res[i+1]; } + for (size_t i=1, j=half-1; 2*i<half; ++i, --j) + res[j].Set(-res[i].r, res[i].i); } - void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res) + void fill_second_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) { if ((n&1)==0) - for (size_t i=0; i<n; ++i) - res[i+n] = -res[i]; + for (size_t i=0; i<n/2; ++i) + res[i+n/2].Set(-res[i].r, -res[i].i); else - for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2) - { res[j] = res[i]; res[j+1] = -res[i+1]; } + for (size_t i=1, j=n-1; 2*i<n; ++i, --j) + res[j].Set(res[i].r, -res[i].i); } - POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res) + POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, cmplx<T> * POCKETFFT_RESTRICT res) { if ((n&3)==0) { @@ -555,16 +553,13 @@ template<typename T> class sincos_2pibyn public: POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half) - : data(2*n) + : data(n) { sincos_2pibyn_half(n, data.data()); if (!half) fill_second_half(n, data.data()); } - T operator[](size_t idx) const { return data[idx]; } - const T *rdata() const { return data; } - const cmplx<T> *cdata() const - { return reinterpret_cast<const cmplx<T> *>(data.data()); } + cmplx<T> operator[](size_t idx) const { return data[idx]; } }; struct util // hack to avoid duplicate symbols @@ -1618,8 +1613,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const void comp_twiddle() { - sincos_2pibyn<T0> twid(length, false); - auto twiddle = twid.cdata(); + sincos_2pibyn<T0> twiddle(length, false); size_t l1=1; size_t memofs=0; for (size_t k=0; k<fact.size(); ++k) @@ -2439,8 +2433,8 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, for (size_t j=1; j<ip; ++j) for (size_t i=1; i<=(ido-1)/2; ++i) { - fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; - fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; + fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r; + fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i; } } if (ip>5) // special factors required by *g functions @@ -2450,10 +2444,10 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, fact[k].tws[1] = 0.; for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2) { - fact[k].tws[i ] = twid[i*(length/ip)]; - fact[k].tws[i+1] = twid[i*(length/ip)+1]; - fact[k].tws[ic] = twid[i*(length/ip)]; - fact[k].tws[ic+1] = -twid[i*(length/ip)+1]; + fact[k].tws[i ] = twid[i/2*(length/ip)].r; + fact[k].tws[i+1] = twid[i/2*(length/ip)].i; + fact[k].tws[ic] = twid[i/2*(length/ip)].r; + fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i; } } l1*=ip; @@ -2521,8 +2515,7 @@ template<typename T0> class fftblue bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ - sincos_2pibyn<T0> tmp_(2*n, false); - auto tmp = tmp_.cdata(); + sincos_2pibyn<T0> tmp(2*n, false); bk[0].Set(1, 0); size_t coeff=0;