diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 5c552ef546200af6d53e5f0dc6961a23a0ce1fc8..e821e6220950b24dea662774243940247931b1aa 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -37,6 +37,8 @@ namespace pocketfft { using shape_t = std::vector; using stride_t = std::vector; +constexpr bool FORWARD = true, + BACKWARD = false; namespace detail { @@ -157,13 +159,29 @@ template void ROTM90(cmplx &a) template class sincos_2pibyn { private: + template struct TypeSelector + {}; + template struct TypeSelector + { using type = Ta; }; + template struct TypeSelector + { using type = Tb; }; + + using Thigh = typename TypeSelectorsizeof(double))>::type; arr data; // adapted from https://stackoverflow.com/questions/42792939/ // CAUTION: this function only works for arguments in the range // [-0.25; 0.25]! - void my_sincosm1pi (double a, double *restrict res) + void my_sincosm1pi (Thigh a, Thigh *restrict res) { + if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double + { + Thigh pi = Thigh(3.141592653589793238462643383279502884197L); + res[1] = sin(pi*a); + auto s = res[1]; + res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1); + return; + } double s = a * a; /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ double r = -1.0369917389758117e-4; @@ -195,25 +213,25 @@ template class sincos_2pibyn res[0]=1.; res[1]=0.; if (n==1) return; size_t l1 = size_t(sqrt(n)); - arr tmp(2*l1); + arr tmp(2*l1); for (size_t i=1; in) end = n-start; for (size_t i=1; i class sincos_2pibyn void fill_first_quadrant(size_t n, T * restrict res) { - const double hsqt2 = 0.707106781186547524400844362104849; + constexpr Thigh hsqt2 = Thigh(0.707106781186547524400844362104849L); size_t quart = n>>2; if ((n&7)==0) res[quart] = res[quart+1] = hsqt2; @@ -337,7 +355,7 @@ struct util // hack to avoid duplicate symbols static NOINLINE double cost_guess (size_t n) { - const double lfp=1.1; // penalty for non-hardcoded larger factors + constexpr double lfp=1.1; // penalty for non-hardcoded larger factors size_t ni=n; double result=0.; while ((n&1)==0) @@ -388,9 +406,8 @@ struct util // hack to avoid duplicate symbols throw runtime_error("stride dimension mismatch"); for (auto shp: shape) if (shp<1) throw runtime_error("zero extent detected"); - if (inplace) - for (size_t i=0; i class cfftp { private: - struct fctdata { size_t fct; @@ -489,7 +505,8 @@ template void pass3 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=3; - constexpr T0 tw1r=-0.5, tw1i= (bwd ? 1.: -1.) * 0.86602540378443864676; + constexpr T0 tw1r=-0.5, + tw1i= (bwd ? 1: -1) * T0(0.8660254037844386467637231707529362L); if (ido==1) for (size_t k=0; k void pass5 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=5; - constexpr T0 tw1r= 0.3090169943749474241, - tw1i= (bwd ? 1.: -1.) * 0.95105651629515357212, - tw2r= -0.8090169943749474241, - tw2i= (bwd ? 1.: -1.) * 0.58778525229247312917; + constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), + tw1i= (bwd ? 1: -1) * T0(0.9510565162951535721164393333793821L), + tw2r= T0(-0.8090169943749474241022934171828191L), + tw2i= (bwd ? 1: -1) * T0(0.5877852522924731291687059546390728L); if (ido==1) for (size_t k=0; k void pass7(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=7; - constexpr T0 tw1r= 0.623489801858733530525, - tw1i= (bwd ? 1. : -1.) * 0.7818314824680298087084, - tw2r= -0.222520933956314404289, - tw2i= (bwd ? 1. : -1.) * 0.9749279121818236070181, - tw3r= -0.9009688679024191262361, - tw3i= (bwd ? 1. : -1.) * 0.4338837391175581204758; + constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), + tw1i= (bwd ? 1 : -1) * T0(0.7818314824680298087084445266740578L), + tw2r= T0(-0.2225209339563144042889025644967948L), + tw2i= (bwd ? 1 : -1) * T0(0.9749279121818236070181316829939312L), + tw3r= T0(-0.9009688679024191262361023195074451L), + tw3i= (bwd ? 1 : -1) * T0(0.433883739117558120475768332848359L); if (ido==1) for (size_t k=0; k void pass7(size_t ido, size_t l1, template void pass11 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { - const size_t cdim=11; - const T0 tw1r = 0.8412535328311811688618, - tw1i = (bwd ? 1. : -1.) * 0.5406408174555975821076, - tw2r = 0.4154150130018864255293, - tw2i = (bwd ? 1. : -1.) * 0.9096319953545183714117, - tw3r = -0.1423148382732851404438, - tw3i = (bwd ? 1. : -1.) * 0.9898214418809327323761, - tw4r = -0.6548607339452850640569, - tw4i = (bwd ? 1. : -1.) * 0.755749574354258283774, - tw5r = -0.9594929736144973898904, - tw5i = (bwd ? 1. : -1.) * 0.2817325568414296977114; + constexpr size_t cdim=11; + constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), + tw1i= (bwd ? 1 : -1) * T0(0.5406408174555975821076359543186917L), + tw2r= T0(0.4154150130018864255292741492296232L), + tw2i= (bwd ? 1 : -1) * T0(0.9096319953545183714117153830790285L), + tw3r= T0(-0.1423148382732851404437926686163697L), + tw3i= (bwd ? 1 : -1) * T0(0.9898214418809327323760920377767188L), + tw4r= T0(-0.6548607339452850640569250724662936L), + tw4i= (bwd ? 1 : -1) * T0(0.7557495743542582837740358439723444L), + tw5r= T0(-0.9594929736144973898903680570663277L), + tw5i= (bwd ? 1 : -1) * T0(0.2817325568414296977114179153466169L); if (ido==1) for (size_t k=0; k void radf3(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; - constexpr T0 taur=-0.5, taui=0.86602540378443864676; + constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); for (size_t k=0; k void radf4(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; - constexpr T0 hsqt2=0.70710678118654752440; + constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); for (size_t k=0; k void radf5(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; - constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212, - tr12=-0.8090169943749474241, ti12=0.58778525229247312917; + constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), + ti11= T0(0.9510565162951535721164393333793821L), + tr12= T0(-0.8090169943749474241022934171828191L), + ti12= T0(0.5877852522924731291687059546390728L); for (size_t k=0; k void radb3(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; - constexpr T0 taur=-0.5, taui=0.86602540378443864676; + constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); for (size_t k=0; k void radb4(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; - constexpr T0 sqrt2=1.41421356237309504880; + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); for (size_t k=0; k void radb5(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; - constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212, - tr12=-0.8090169943749474241, ti12=0.58778525229247312917; + constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), + ti11= T0(0.9510565162951535721164393333793821L), + tr12= T0(-0.8090169943749474241022934171828191L), + ti12= T0(0.5877852522924731291687059546390728L); for (size_t k=0; k class fftblue } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ - T0 xn2 = 1./n2; + T0 xn2 = T0(1)/n2; bkf[0] = bk[0]*xn2; for (size_t m=1; m class multi_iter for (int i=pos.size()-1; i>=0; --i) { if (i==int(idim)) continue; - ++pos[i]; p_ii += iarr.stride(i); p_oi += oarr.stride(i); - if (pos[i] < iarr.shape(i)) + if (++pos[i] < iarr.shape(i)) return; pos[i] = 0; p_ii -= iarr.shape(i)*iarr.stride(i); @@ -2076,7 +2096,12 @@ template class multi_iter }; #if defined(HAVE_VECSUPPORT) -template struct VTYPE{}; +template struct VTYPE {}; +template<> struct VTYPE + { + using type = long double __attribute__ ((vector_size (sizeof(long double)))); + static constexpr int vlen=1; + }; template<> struct VTYPE { using type = double __attribute__ ((vector_size (VBYTELEN))); @@ -2088,10 +2113,7 @@ template<> struct VTYPE static constexpr int vlen=VBYTELEN/sizeof(float); }; #else -template struct VTYPE{}; -template<> struct VTYPE - { static constexpr int vlen=1; }; -template<> struct VTYPE +template struct VTYPE { static constexpr int vlen=1; }; #endif @@ -2393,68 +2415,70 @@ template void c2c(const shape_t &shape, const stride_t &stride_in, general_c(ain, aout, axes, forward, fct); } -template void r2c(const shape_t &shape, const stride_t &stride_in, - const stride_t &stride_out, size_t axis, const T *data_in, - std::complex *data_out, T fct) +template void r2c(const shape_t &shape_in, + const stride_t &stride_in, const stride_t &stride_out, size_t axis, + const T *data_in, std::complex *data_out, T fct) { using namespace detail; - util::sanity_check(shape, stride_in, stride_out, false, axis); - ndarr ain(data_in, shape, stride_in); - ndarr> aout(data_out, shape, stride_out); + util::sanity_check(shape_in, stride_in, stride_out, false, axis); + ndarr ain(data_in, shape_in, stride_in); + ndarr> aout(data_out, shape_in, stride_out); // FIXME general_r2c(ain, aout, axis, fct); } -template void r2c(const shape_t &shape, const stride_t &stride_in, - const stride_t &stride_out, const shape_t &axes, const T *data_in, - std::complex *data_out, T fct) +template void r2c(const shape_t &shape_in, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + const T *data_in, std::complex *data_out, T fct) { using namespace detail; - util::sanity_check(shape, stride_in, stride_out, false, axes); - r2c(shape, stride_in, stride_out, axes.back(), data_in, data_out, fct); + util::sanity_check(shape_in, stride_in, stride_out, false, axes); + r2c(shape_in, stride_in, stride_out, axes.back(), data_in, data_out, fct); if (axes.size()==1) return; - shape_t shape_out(shape); - shape_out[axes.back()] = shape[axes.back()]/2 + 1; + shape_t shape_out(shape_in); + shape_out[axes.back()] = shape_in[axes.back()]/2 + 1; auto newaxes = shape_t{axes.begin(), --axes.end()}; c2c(shape_out, stride_out, stride_out, newaxes, true, data_out, data_out, T(1)); } -template void c2r(const shape_t &shape, size_t new_size, +template void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, size_t axis, const std::complex *data_in, T *data_out, T fct) { using namespace detail; - util::sanity_check(shape, stride_in, stride_out, false, axis); - shape_t shape_out(shape); - shape_out[axis] = new_size; - ndarr> ain(data_in, shape, stride_in); + util::sanity_check(shape_out, stride_in, stride_out, false, axis); + shape_t shape_in(shape_out); + shape_in[axis] = shape_out[axis]/2 + 1; + ndarr> ain(data_in, shape_in, stride_in); ndarr aout(data_out, shape_out, stride_out); general_c2r(ain, aout, axis, fct); } -template void c2r(const shape_t &shape, size_t new_size, +template void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const std::complex *data_in, T *data_out, T fct) { using namespace detail; if (axes.size()==1) { - c2r(shape, new_size, stride_in, stride_out, axes[0], - data_in, data_out, fct); + c2r(shape_out, stride_in, stride_out, axes[0], data_in, data_out, fct); return; } - util::sanity_check(shape, stride_in, stride_out, false, axes); - auto nval = util::prod(shape); - stride_t stride_inter(shape.size()); + util::sanity_check(shape_out, stride_in, stride_out, false, axes); + auto shape_in = shape_out; + shape_in[axes.back()] = shape_out[axes.back()]/2 + 1; + auto nval = util::prod(shape_in); + stride_t stride_inter(shape_in.size()); stride_inter.back() = sizeof(cmplx); - for (int i=shape.size()-2; i>=0; --i) - stride_inter[i] = stride_inter[i+1]*shape[i+1]; + for (int i=shape_in.size()-2; i>=0; --i) + stride_inter[i] = stride_inter[i+1]*shape_in[i+1]; arr> tmp(nval); auto newaxes = shape_t({axes.begin(), --axes.end()}); - c2c(shape, stride_in, stride_inter, newaxes, false, data_in, tmp.data(), T(1)); - c2r(shape, new_size, stride_inter, stride_out, axes.back(), - tmp.data(), data_out, fct); + c2c(shape_in, stride_in, stride_inter, newaxes, false, data_in, tmp.data(), + T(1)); + c2r(shape_out, stride_inter, stride_out, axes.back(), tmp.data(), data_out, + fct); } template void r2r_fftpack(const shape_t &shape,