diff --git a/bench.py b/bench.py index 85a34348ee8bb77ea0483f5b3d198eb0a33a26d7..e403ee10e8ec9063ed590b938812ee3a5314819f 100644 --- a/bench.py +++ b/bench.py @@ -1,6 +1,5 @@ import numpy as np import pypocketfft -import pyfftw from time import time import matplotlib.pyplot as plt import math diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index f0a03e0d7e40a0827a0704f20ccb646f0281e627..8c59c48c0e192484f82f887233d6be47e9f01c1b 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -266,305 +266,78 @@ template<bool fwd, typename T> void ROTX90(cmplx<T> &a) // // twiddle factor section // - -/** Approximate sin(pi*a) within the range [-0.25, 0.25] */ -inline float sinpi0(float a) - { - // adapted from https://stackoverflow.com/questions/42792939/ - float s = a * a; - float r = -5.957031250000000000e-01f; - r = fma (r, s, 2.550399541854858398e+00f); - r = fma (r, s, -5.167724132537841797e+00f); - r = (a * s) * r; - return fma (a, 3.141592741012573242e+00f, r); - } - -/** Approximate sin(pi*a) within the range [-0.25, 0.25] */ -inline double sinpi0(double a) - { - // adapted from https://stackoverflow.com/questions/42792939/ - double s = a * a; - double r = 4.6151442520157035e-4; - r = fma (r, s, -7.3700183130883555e-3); - r = fma (r, s, 8.2145868949323936e-2); - r = fma (r, s, -5.9926452893214921e-1); - r = fma (r, s, 2.5501640398732688e+0); - r = fma (r, s, -5.1677127800499516e+0); - s = s * a; - r = r * s; - return fma (a, 3.1415926535897931e+0, r); - } - -/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ -inline float cosm1pi0(float a) - { - // adapted from https://stackoverflow.com/questions/42792939/ - float s = a * a; - float r = 2.313842773437500000e-01f; - r = fmaf (r, s, -1.335021972656250000e+00f); - r = fmaf (r, s, 4.058703899383544922e+00f); - r = fmaf (r, s, -4.934802055358886719e+00f); - return r*s; - } - -/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ -inline double cosm1pi0(double a) - { - // adapted from https://stackoverflow.com/questions/42792939/ - double s = a * a; - double r = -1.0369917389758117e-4; - r = fma (r, s, 1.9294935641298806e-3); - r = fma (r, s, -2.5806887942825395e-2); - r = fma (r, s, 2.3533063028328211e-1); - r = fma (r, s, -1.3352627688538006e+0); - r = fma (r, s, 4.0587121264167623e+0); - r = fma (r, s, -4.9348022005446790e+0); - return r*s; - } - -template <typename T> void sincosm1pi0(T a_, T *POCKETFFT_RESTRICT res) - { - if (sizeof(T)>sizeof(double)) // don't have the code for long double - { - constexpr T pi = T(3.141592653589793238462643383279502884197L); - auto s = sin(pi*a_); - res[1] = s; - res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1); - return; - } - res[0] = T(cosm1pi0(double(a_))); - res[1] = T(sinpi0(double(a_))); - } - -template <typename T> T sinpi(T a) - { - // reduce argument to primary approximation interval (-0.25, 0.25) - auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding - auto i = (int64_t)r; - auto t = fma (T(-0.5), r, a); - - switch (i%4) - { - case 0: - return sinpi0(t); - case 1: case -3: - return cosm1pi0(t) + T(1.); - case 2: case -2: - return T(0.) - sinpi0(t); - case 3: case -1: - return T(-1.) - cosm1pi0(t); - } - throw runtime_error("cannot happen"); - } - -template <typename T> T cospi(T a) - { - // reduce argument to primary approximation interval (-0.25, 0.25) - auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding - auto i = (int64_t)r; - auto t = fma (T(-0.5), r, a); - - switch (i%4) - { - case 0: - return cosm1pi0(t) + T(1.); - case 1: case -3: - return T(0.) - sinpi0(t); - case 2: case -2: - return T(-1.) - cosm1pi0(t); - case 3: case -1: - return sinpi0(t); - } - throw runtime_error("cannot happen"); - } - -inline long double cospi(long double a) - { - constexpr auto pi = 3.141592653589793238462643383279502884197L; - return sizeof(long double) > sizeof(double) ? cos(a * pi) : - static_cast<long double>(cospi(static_cast<double>(a))); - } - -inline long double sinpi(long double a) - { - constexpr auto pi = 3.141592653589793238462643383279502884197L; - return sizeof(long double) > sizeof(double) ? sin(a * pi) : - static_cast<long double>(sinpi(static_cast<double>(a))); - } - -template <typename T> void sincospi(T a, T *POCKETFFT_RESTRICT res) - { - // reduce argument to primary approximation interval (-0.25, 0.25) - auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding - auto i = (int64_t)r; - auto t = fma (T(-0.5), r, a); - - auto c=cosm1pi0(t)+T(1.); - auto s=sinpi0(t); - - // map results according to quadrant - if (i & 2) - { - s = T(0.)-s; - c = T(0.)-c; - } - if (i & 1) - { - swap(s, c); - c = T(0.)-c; - } - res[0]=c; - res[1]=s; - } - -inline void sincospi(long double a, long double *POCKETFFT_RESTRICT res) - { - if (sizeof(long double) > sizeof(double)) - { - constexpr auto pi = 3.141592653589793238462643383279502884197L; - res[0] = cos(pi * a); - res[1] = sin(pi * a); - } - else - { - double sincos[2]; - sincospi(static_cast<double>(a), sincos); - res[0] = static_cast<long double>(sincos[0]); - res[1] = static_cast<long double>(sincos[1]); - } - } - template<typename T> class sincos_2pibyn { private: using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; - arr<T> data; - - POCKETFFT_NOINLINE void calc_first_octant(size_t den, - T * POCKETFFT_RESTRICT res) - { - size_t n = (den+4)>>3; - if (n==0) return; - res[0]=1.; res[1]=0.; - if (n==1) return; - size_t l1 = size_t(sqrt(n)); - arr<Thigh> tmp(2*l1); - for (size_t i=1; i<l1; ++i) - { - sincosm1pi0(Thigh(2*i)/Thigh(den),&tmp[2*i]); - res[2*i ] = T(tmp[2*i]+1); - res[2*i+1] = T(tmp[2*i+1]); - } - size_t start=l1; - while(start<n) - { - Thigh cs[2]; - sincosm1pi0((Thigh(2*start))/Thigh(den),cs); - res[2*start] = T(cs[0]+1); - res[2*start+1] = T(cs[1]); - size_t end = l1; - if (start+end>n) end = n-start; - for (size_t i=1; i<end; ++i) - { - Thigh csx[2]={tmp[2*i], tmp[2*i+1]}; - res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1); - res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]); - } - start += l1; - } - } + size_t N, mask, shift; + arr<cmplx<Thigh>> v1, v2; - void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res) + static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang) { - T * POCKETFFT_RESTRICT p = res+n; - calc_first_octant(n<<1, p); - size_t ndone=(n+2)>>2; - size_t i=0, idx1=0, idx2=2*ndone-2; - for (; i+1<ndone; i+=2, idx1+=2, idx2-=2) + x<<=3; + if (x<4*n) // first half { - res[idx1] = p[2*i ]; res[idx1+1] = p[2*i+1]; - res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2]; + if (x<2*n) // first quadrant + { + if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), sin(Thigh(x)*ang)); + return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), cos(Thigh(2*n-x)*ang)); + } + else // second quadrant + { + x-=2*n; + if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), cos(Thigh(x)*ang)); + return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), sin(Thigh(2*n-x)*ang)); + } } - if (i!=ndone) - { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; } - } - - void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res) - { - int ndone=int(n+1)>>1; - T * p = res+n-1; - calc_first_octant(n<<2, p); - int i4=0, in=int(n), i=0; - for (; i4<=in-i4; ++i, i4+=4) // octant 0 - { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; } - for (; i4-in <= 0; ++i, i4+=4) // octant 1 - { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; } - for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 - { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; } - for (; i<ndone; ++i, i4+=4) // octant 3 - { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; } - } - - void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res) - { - constexpr T hsqt2 = T(0.707106781186547524400844362104849L); - size_t quart = n>>2; - if ((n&7)==0) - res[quart] = res[quart+1] = hsqt2; - for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2) - { res[j] = res[i+1]; res[j+1] = res[i]; } - } - - POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res) - { - size_t half = n>>1; - if ((n&3)==0) - for (size_t i=0; i<half; i+=2) - { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; } - else - for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2) - { res[j] = -res[i]; res[j+1] = res[i+1]; } - } - - void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res) - { - if ((n&1)==0) - for (size_t i=0; i<n; ++i) - res[i+n] = -res[i]; else - for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2) - { res[j] = res[i]; res[j+1] = -res[i+1]; } - } - - POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res) - { - if ((n&3)==0) { - calc_first_octant(n, res); - fill_first_quadrant(n, res); - fill_first_half(n, res); - } - else if ((n&1)==0) - { - calc_first_quadrant(n, res); - fill_first_half(n, res); + x=8*n-x; + if (x<2*n) // third quadrant + { + if (x<n) return cmplx<Thigh>(cos(Thigh(x)*ang), -sin(Thigh(x)*ang)); + return cmplx<Thigh>(sin(Thigh(2*n-x)*ang), -cos(Thigh(2*n-x)*ang)); + } + else // fourth quadrant + { + x-=2*n; + if (x<n) return cmplx<Thigh>(-sin(Thigh(x)*ang), -cos(Thigh(x)*ang)); + return cmplx<Thigh>(-cos(Thigh(2*n-x)*ang), -sin(Thigh(2*n-x)*ang)); + } } - else - calc_first_half(n, res); } public: - POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half) - : data(2*n) - { - sincos_2pibyn_half(n, data.data()); - if (!half) fill_second_half(n, data.data()); + POCKETFFT_NOINLINE sincos_2pibyn(size_t n) + : N(n) + { + constexpr auto pi = 3.141592653589793238462643383279502884197L; + Thigh ang = Thigh(0.25L*pi/n); + size_t nval = (n+2)/2; + shift = 1; + while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift; + mask = (size_t(1)<<shift)-1; + v1.resize(mask+1); + v1[0].Set(Thigh(1), Thigh(0)); + for (size_t i=1; i<v1.size(); ++i) + v1[i]=calc(i,n,ang); + v2.resize((nval+mask)/(mask+1)); + v2[0].Set(Thigh(1), Thigh(0)); + for (size_t i=1; i<v2.size(); ++i) + v2[i]=calc(i*(mask+1),n,ang); + } + + cmplx<T> operator[](size_t idx) const + { + if (2*idx<=N) + { + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + idx = N-idx; + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r)); } - - T operator[](size_t idx) const { return data[idx]; } - const T *rdata() const { return data; } - const cmplx<T> *cdata() const - { return reinterpret_cast<const cmplx<T> *>(data.data()); } }; struct util // hack to avoid duplicate symbols @@ -940,12 +713,10 @@ template<bool fwd, typename T> void pass2 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=2; - auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+2*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -989,14 +760,13 @@ template<bool fwd, typename T> void pass3 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=3; constexpr T0 tw1r=-0.5, tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L); auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+3*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -1029,12 +799,10 @@ template<bool fwd, typename T> void pass4 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=4; - auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+4*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -1105,7 +873,6 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=5; constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L), tw2r= T0(-0.8090169943749474241022934171828191L), @@ -1113,8 +880,8 @@ template<bool fwd, typename T> void pass5 (size_t ido, size_t l1, auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+5*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -1177,7 +944,6 @@ template<bool fwd, typename T> void pass7(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=7; constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L), tw2r= T0(-0.2225209339563144042889025644967948L), @@ -1187,8 +953,8 @@ template<bool fwd, typename T> void pass7(size_t ido, size_t l1, auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+7*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -1245,12 +1011,10 @@ template<bool fwd, typename T> void pass8 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=8; - auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+8*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -1360,7 +1124,6 @@ template<bool fwd, typename T> void pass11 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx<T0> * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=11; constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L), tw2r= T0(0.4154150130018864255292741492296232L), @@ -1374,8 +1137,8 @@ template<bool fwd, typename T> void pass11 (size_t ido, size_t l1, auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+11*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; @@ -1618,8 +1381,7 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const void comp_twiddle() { - sincos_2pibyn<T0> twid(length, false); - auto twiddle = twid.cdata(); + sincos_2pibyn<T0> twiddle(length); size_t l1=1; size_t memofs=0; for (size_t k=0; k<fact.size(); ++k) @@ -1682,13 +1444,11 @@ template<typename T> void radf2 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=2; - auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; - auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& - { return ch[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+2*c)]; }; for (size_t k=0; k<l1; k++) PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1)); @@ -1721,14 +1481,13 @@ template<typename T> void radf3(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; - auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& - { return ch[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+3*c)]; }; for (size_t k=0; k<l1; k++) { @@ -1761,14 +1520,13 @@ template<typename T> void radf4(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=4; constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; - auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& - { return ch[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+4*c)]; }; for (size_t k=0; k<l1; k++) { @@ -1809,7 +1567,6 @@ template<typename T> void radf5(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), ti11= T0(0.9510565162951535721164393333793821L), tr12= T0(-0.8090169943749474241022934171828191L), @@ -1818,8 +1575,8 @@ template<typename T> void radf5(size_t ido, size_t l1, auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; - auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& - { return ch[a+ido*(b+cdim*c)]; }; + auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T& + { return ch[a+ido*(b+5*c)]; }; for (size_t k=0; k<l1; k++) { @@ -2008,11 +1765,9 @@ template<typename T> void radb2(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=2; - auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+2*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; @@ -2040,12 +1795,11 @@ template<typename T> void radb3(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+3*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; @@ -2081,12 +1835,11 @@ template<typename T> void radb4(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=4; constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+4*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; @@ -2134,15 +1887,14 @@ template<typename T> void radb5(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) const { - constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), ti11= T0(0.9510565162951535721164393333793821L), tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; - auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& - { return cc[a+ido*(b+cdim*c)]; }; + auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& + { return cc[a+ido*(b+5*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; @@ -2427,7 +2179,7 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, void comp_twiddle() { - sincos_2pibyn<T0> twid(length, true); + sincos_2pibyn<T0> twid(length); size_t l1=1; T0 *ptr=mem.data(); for (size_t k=0; k<fact.size(); ++k) @@ -2439,8 +2191,8 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, for (size_t j=1; j<ip; ++j) for (size_t i=1; i<=(ido-1)/2; ++i) { - fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; - fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; + fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r; + fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i; } } if (ip>5) // special factors required by *g functions @@ -2450,10 +2202,10 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1, fact[k].tws[1] = 0.; for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2) { - fact[k].tws[i ] = twid[i*(length/ip)]; - fact[k].tws[i+1] = twid[i*(length/ip)+1]; - fact[k].tws[ic] = twid[i*(length/ip)]; - fact[k].tws[ic+1] = -twid[i*(length/ip)+1]; + fact[k].tws[i ] = twid[i/2*(length/ip)].r; + fact[k].tws[i+1] = twid[i/2*(length/ip)].i; + fact[k].tws[ic] = twid[i/2*(length/ip)].r; + fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i; } } l1*=ip; @@ -2498,8 +2250,14 @@ template<typename T0> class fftblue plan.exec (akf.data(),1.,true); /* do the convolution */ - for (size_t m=0; m<n2; ++m) + akf[0] = akf[0].template special_mul<!fwd>(bkf[0]); + for (size_t m=1; m<(n2+1)/2; ++m) + { akf[m] = akf[m].template special_mul<!fwd>(bkf[m]); + akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]); + } + if ((n2&1)==0) + akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]); /* inverse FFT */ plan.exec (akf.data(),1.,false); @@ -2511,12 +2269,11 @@ template<typename T0> class fftblue public: POCKETFFT_NOINLINE fftblue(size_t length) - : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2), + : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1), bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ - sincos_2pibyn<T0> tmp_(2*n, false); - auto tmp = tmp_.cdata(); + sincos_2pibyn<T0> tmp(2*n); bk[0].Set(1, 0); size_t coeff=0; @@ -2528,13 +2285,16 @@ template<typename T0> class fftblue } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ + arr<cmplx<T0>> tbkf(n2); T0 xn2 = T0(1)/T0(n2); - bkf[0] = bk[0]*xn2; + tbkf[0] = bk[0]*xn2; for (size_t m=1; m<n; ++m) - bkf[m] = bkf[n2-m] = bk[m]*xn2; + tbkf[m] = tbkf[n2-m] = bk[m]*xn2; for (size_t m=n;m<=(n2-n);++m) - bkf[m].Set(0.,0.); - plan.exec(bkf,1.,true); + tbkf[m].Set(0.,0.); + plan.exec(tbkf.data(),1.,true); + for (size_t i=0; i<n2/2+1; ++i) + bkf[i] = tbkf[i]; } template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const @@ -2712,9 +2472,9 @@ template<typename T0> class T_dcst23 POCKETFFT_NOINLINE T_dcst23(size_t length) : fftplan(length), twiddle(length) { - const auto oo2n = T0(0.5)/T0(length); + sincos_2pibyn<T0> tw(4*length); for (size_t i=0; i<length; ++i) - twiddle[i] = cospi(oo2n*T0(i+1)); + twiddle[i] = tw[i+1].r; } template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, @@ -2787,20 +2547,17 @@ template<typename T0> class T_dcst4 rfft((N&1)? new pocketfft_r<T0>(N) : nullptr), C2((N&1) ? 0 : N/2) { - const auto oon = -T0(1.)/T0(N); if ((N&1)==0) + { + sincos_2pibyn<T0> tw(16*N); for (size_t i=0; i<N/2; ++i) - { - T0 sincos[2]; - sincospi(oon*(T0(i)+T0(0.125)), sincos); - C2[i].Set(sincos[0], sincos[1]); - } + C2[i] = conj(tw[8*i+1]); + } } template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/, int /*type*/, bool cosine) const { - constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t n2 = N/2; if (!cosine) for (size_t k=0, kc=N-1; k<n2; ++k, --kc) @@ -2827,7 +2584,11 @@ template<typename T0> class T_dcst4 } rfft->exec(y.data(), fct, true); { - auto SGN = [sqrt2](size_t i) {return (i&2) ? -sqrt2 : sqrt2;}; + auto SGN = [](size_t i) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + return (i&2) ? -sqrt2 : sqrt2; + }; c[n2] = y[0]*SGN(n2+1); size_t i=0, i1=1, k=1; for (; k<n2; ++i, ++i1, k+=2) diff --git a/stress.py b/stress.py index b6c23ddeb62d7b9d41452287ed95afe99af179bd..e7526d5c614f538c33e0e5a7f4bca1a876be9349 100644 --- a/stress.py +++ b/stress.py @@ -2,8 +2,8 @@ import numpy as np import pypocketfft -def _l2error(a, b): - return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2)) +def _l2error(a, b, axes): + return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))/np.log2(np.max([2,np.prod(np.take(a.shape,axes))])) def fftn(a, axes=None, inorm=0, out=None, nthreads=1): @@ -52,85 +52,89 @@ def test(err): a_32 = a.astype(np.complex64) b = ifftn(fftn(a, axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "cmax", _l2error(a, b)) + err = update_err(err, "cmax", _l2error(a, b, axes)) b = ifftn(fftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "cmax", _l2error(a.real, b)) + err = update_err(err, "cmax", _l2error(a.real, b, axes)) b = fftn(ifftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "cmax", _l2error(a.real, b)) - b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, - lastsize=lastsize, nthreads=nthreads) - err = update_err(err, "rmax", _l2error(a.real, b)) + err = update_err(err, "cmax", _l2error(a.real, b, axes)) b = ifftn(fftn(a.astype(np.complex64), axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b)) + err = update_err(err, "cmaxf", _l2error(a.astype(np.complex64), b, axes)) + b = irfftn(rfftn(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, + lastsize=lastsize, nthreads=nthreads) + err = update_err(err, "rmax", _l2error(a.real, b, axes)) b = irfftn(rfftn(a.real.astype(np.float32), axes=axes, nthreads=nthreads), axes=axes, inorm=2, lastsize=lastsize, nthreads=nthreads) - err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b)) + err = update_err(err, "rmaxf", _l2error(a.real.astype(np.float32), b, axes)) b = pypocketfft.separable_hartley( pypocketfft.separable_hartley(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "hmax", _l2error(a.real, b)) + err = update_err(err, "hmax", _l2error(a.real, b, axes)) b = pypocketfft.genuine_hartley( pypocketfft.genuine_hartley(a.real, axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "hmax", _l2error(a.real, b)) + err = update_err(err, "hmax", _l2error(a.real, b, axes)) b = pypocketfft.separable_hartley( pypocketfft.separable_hartley( a.real.astype(np.float32), axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b)) + err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes)) b = pypocketfft.genuine_hartley( pypocketfft.genuine_hartley(a.real.astype(np.float32), axes=axes, nthreads=nthreads), axes=axes, inorm=2, nthreads=nthreads) - err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b)) + err = update_err(err, "hmaxf", _l2error(a.real.astype(np.float32), b, axes)) if all(a.shape[i] > 1 for i in axes): b = pypocketfft.dct( pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=1), axes=axes, type=1, nthreads=nthreads, inorm=2) - err = update_err(err, "c1max", _l2error(a.real, b)) + err = update_err(err, "c1max", _l2error(a.real, b, axes)) b = pypocketfft.dct( pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=1), axes=axes, type=1, nthreads=nthreads, inorm=2) - err = update_err(err, "c1maxf", _l2error(a_32.real, b)) + err = update_err(err, "c1maxf", _l2error(a_32.real, b, axes)) b = pypocketfft.dct( pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=2), axes=axes, type=3, nthreads=nthreads, inorm=2) - err = update_err(err, "c23max", _l2error(a.real, b)) + err = update_err(err, "c23max", _l2error(a.real, b, axes)) b = pypocketfft.dct( pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=2), axes=axes, type=3, nthreads=nthreads, inorm=2) - err = update_err(err, "c23maxf", _l2error(a_32.real, b)) + err = update_err(err, "c23maxf", _l2error(a_32.real, b, axes)) b = pypocketfft.dct( pypocketfft.dct(a.real, axes=axes, nthreads=nthreads, type=4), axes=axes, type=4, nthreads=nthreads, inorm=2) - err = update_err(err, "c4max", _l2error(a.real, b)) + err = update_err(err, "c4max", _l2error(a.real, b, axes)) b = pypocketfft.dct( pypocketfft.dct(a_32.real, axes=axes, nthreads=nthreads, type=4), axes=axes, type=4, nthreads=nthreads, inorm=2) - err = update_err(err, "c4maxf", _l2error(a_32.real, b)) + err = update_err(err, "c4maxf", _l2error(a_32.real, b, axes)) + b = pypocketfft.dst( + pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=1), + axes=axes, type=1, nthreads=nthreads, inorm=2) + err = update_err(err, "s1max", _l2error(a.real, b, axes)) b = pypocketfft.dst( pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=1), axes=axes, type=1, nthreads=nthreads, inorm=2) - err = update_err(err, "s1maxf", _l2error(a_32.real, b)) + err = update_err(err, "s1maxf", _l2error(a_32.real, b, axes)) b = pypocketfft.dst( pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=2), axes=axes, type=3, nthreads=nthreads, inorm=2) - err = update_err(err, "s23max", _l2error(a.real, b)) + err = update_err(err, "s23max", _l2error(a.real, b, axes)) b = pypocketfft.dst( pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=2), axes=axes, type=3, nthreads=nthreads, inorm=2) - err = update_err(err, "s23maxf", _l2error(a_32.real, b)) + err = update_err(err, "s23maxf", _l2error(a_32.real, b, axes)) b = pypocketfft.dst( pypocketfft.dst(a.real, axes=axes, nthreads=nthreads, type=4), axes=axes, type=4, nthreads=nthreads, inorm=2) - err = update_err(err, "s4max", _l2error(a.real, b)) + err = update_err(err, "s4max", _l2error(a.real, b, axes)) b = pypocketfft.dst( pypocketfft.dst(a_32.real, axes=axes, nthreads=nthreads, type=4), axes=axes, type=4, nthreads=nthreads, inorm=2) - err = update_err(err, "s4maxf", _l2error(a_32.real, b)) + err = update_err(err, "s4maxf", _l2error(a_32.real, b, axes)) err = dict()