diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index d1eec5d1c9f16f905f589f416dd0bcdce4de2e82..dcf40a43bc5b71cc45bb3aa96cca880714a4ea09 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -258,50 +258,177 @@ template void ROTX90(cmplx &a) // twiddle factor section // +/** Approximate sin(pi*a) within the range [-0.25, 0.25] */ +inline float sinpi0(float a) + { + // adapted from https://stackoverflow.com/questions/42792939/ + float s = a * a; + float r = -5.957031250000000000e-01f; + r = fma (r, s, 2.550399541854858398e+00f); + r = fma (r, s, -5.167724132537841797e+00f); + r = (a * s) * r; + return fma (a, 3.141592741012573242e+00f, r); + } + +/** Approximate sin(pi*a) within the range [-0.25, 0.25] */ +inline double sinpi0(double a) + { + // adapted from https://stackoverflow.com/questions/42792939/ + double s = a * a; + double r = 4.6151442520157035e-4; + r = fma (r, s, -7.3700183130883555e-3); + r = fma (r, s, 8.2145868949323936e-2); + r = fma (r, s, -5.9926452893214921e-1); + r = fma (r, s, 2.5501640398732688e+0); + r = fma (r, s, -5.1677127800499516e+0); + s = s * a; + r = r * s; + return fma (a, 3.1415926535897931e+0, r); + } + +/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ +inline float cosm1pi0(float a) + { + // adapted from https://stackoverflow.com/questions/42792939/ + float s = a * a; + float r = 2.313842773437500000e-01f; + r = fmaf (r, s, -1.335021972656250000e+00f); + r = fmaf (r, s, 4.058703899383544922e+00f); + r = fmaf (r, s, -4.934802055358886719e+00f); + return r*s; + } + +/** Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ +inline double cosm1pi0(double a) + { + // adapted from https://stackoverflow.com/questions/42792939/ + double s = a * a; + double r = -1.0369917389758117e-4; + r = fma (r, s, 1.9294935641298806e-3); + r = fma (r, s, -2.5806887942825395e-2); + r = fma (r, s, 2.3533063028328211e-1); + r = fma (r, s, -1.3352627688538006e+0); + r = fma (r, s, 4.0587121264167623e+0); + r = fma (r, s, -4.9348022005446790e+0); + return r*s; + } + +template void sincosm1pi0(T a_, T *POCKETFFT_RESTRICT res) + { + if (sizeof(T)>sizeof(double)) // don't have the code for long double + { + constexpr T pi = T(3.141592653589793238462643383279502884197L); + auto s = sin(pi*a_); + res[1] = s; + res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1); + return; + } + res[0] = T(cosm1pi0(double(a_))); + res[1] = T(sinpi0(double(a_))); + } + +template T sinpi(T a) + { + // reduce argument to primary approximation interval (-0.25, 0.25) + auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding + auto i = (int64_t)r; + auto t = fma (T(-0.5), r, a); + + switch (i%4) + { + case 0: + return sinpi0(t); + case 1: case -3: + return cosm1pi0(t) + T(1.); + case 2: case -2: + return T(0.) - sinpi0(t); + case 3: case -1: + return T(-1.) - cosm1pi0(t); + } + } + +template T cospi(T a) + { + // reduce argument to primary approximation interval (-0.25, 0.25) + auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding + auto i = (int64_t)r; + auto t = fma (T(-0.5), r, a); + + switch (i%4) + { + case 0: + return cosm1pi0(t) + T(1.); + case 1: case -3: + return T(0.) - sinpi0(t); + case 2: case -2: + return T(-1.) - cosm1pi0(t); + case 3: case -1: + return sinpi0(t); + } + } + +inline long double cospi(long double a) + { + constexpr auto pi = 3.141592653589793238462643383279502884197L; + return sizeof(long double) > sizeof(double) ? cos(a * pi) : + static_cast(cospi(static_cast(a))); + } + +inline long double sinpi(long double a) + { + constexpr auto pi = 3.141592653589793238462643383279502884197L; + return sizeof(long double) > sizeof(double) ? sin(a * pi) : + static_cast(sinpi(static_cast(a))); + } + +template void sincospi(T a, T *POCKETFFT_RESTRICT res) + { + // reduce argument to primary approximation interval (-0.25, 0.25) + auto r = nearbyint (a + a); // must use IEEE-754 "to nearest" rounding + auto i = (int64_t)r; + auto t = fma (T(-0.5), r, a); + + auto c=cosm1pi0(t)+T(1.); + auto s=sinpi0(t); + + // map results according to quadrant + if (i & 2) + { + s = T(0.)-s; + c = T(0.)-c; + } + if (i & 1) + { + swap(s, c); + c = T(0.)-c; + } + res[0]=c; + res[1]=s; + } + +inline void sincospi(long double a, long double *POCKETFFT_RESTRICT res) + { + if (sizeof(long double) > sizeof(double)) + { + constexpr auto pi = 3.141592653589793238462643383279502884197L; + res[0] = cos(pi * a); + res[1] = sin(pi * a); + } + else + { + double sincos[2]; + sincospi(static_cast(a), sincos); + res[0] = static_cast(sincos[0]); + res[1] = static_cast(sincos[1]); + } + } + template class sincos_2pibyn { private: using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; arr data; - void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res) - { - if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double - { - constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L); - auto s = sin(pi*a_); - res[1] = s; - res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1); - return; - } - // adapted from https://stackoverflow.com/questions/42792939/ - // CAUTION: this function only works for arguments in the range - // [-0.25; 0.25]! - double a = double(a_); - double s = a * a; - /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ - double r = -1.0369917389758117e-4; - r = fma (r, s, 1.9294935641298806e-3); - r = fma (r, s, -2.5806887942825395e-2); - r = fma (r, s, 2.3533063028328211e-1); - r = fma (r, s, -1.3352627688538006e+0); - r = fma (r, s, 4.0587121264167623e+0); - r = fma (r, s, -4.9348022005446790e+0); - double c = r*s; - /* Approximate sin(pi*x) for x in [-0.25,0.25] */ - r = 4.6151442520157035e-4; - r = fma (r, s, -7.3700183130883555e-3); - r = fma (r, s, 8.2145868949323936e-2); - r = fma (r, s, -5.9926452893214921e-1); - r = fma (r, s, 2.5501640398732688e+0); - r = fma (r, s, -5.1677127800499516e+0); - s = s * a; - r = r * s; - s = fma (a, 3.1415926535897931e+0, r); - res[0] = c; - res[1] = s; - } - POCKETFFT_NOINLINE void calc_first_octant(size_t den, T * POCKETFFT_RESTRICT res) { @@ -313,7 +440,7 @@ template class sincos_2pibyn arr tmp(2*l1); for (size_t i=1; i class sincos_2pibyn while(start class T_dcst23 POCKETFFT_NOINLINE T_dcst23(size_t length) : fftplan(length), twiddle(length) { - constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); + const auto oo2n = T0(0.5)/T0(length); for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, @@ -2446,12 +2573,13 @@ template class T_dcst4 rfft((N&1)? new pocketfft_r(N) : nullptr), C2((N&1) ? 0 : N/2) { - constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); + const auto oon = -T0(1.)/T0(N); if ((N&1)==0) for (size_t i=0; i