diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index e6bfd7360f0c0fb60a3004a85ce497b0cae66f0b..d1eec5d1c9f16f905f589f416dd0bcdce4de2e82 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2812,6 +2812,8 @@ template<> struct VTYPE { using type = long double __attribute__ ((vector_size (VLEN::val*sizeof(long double)))); }; + +template using vtype_t = typename VTYPE::type; #endif template arr alloc_tmp(const shape_t &shape, @@ -2842,187 +2844,177 @@ template arr alloc_tmp(const shape_t &shape, #define POCKETFFT_NTHREADS #endif -template POCKETFFT_NOINLINE void general_c( - const cndarr> &in, ndarr> &out, - const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS) +template void copy_input(const multi_iter &it, + const cndarr> &src, cmplx> *POCKETFFT_RESTRICT dst) { - shared_ptr> plan; + for (size_t i=0; i::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); +template void copy_input(const multi_iter &it, + const cndarr &src, vtype_t *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i(in.shape(), len, sizeof(cmplx)); - const auto &tin(iax==0? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - using vtype = typename VTYPE::type; - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec (tdatav, fct, forward); - for (size_t i=0; i0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(cmplx) ? - &out[it.oofs(0)] : reinterpret_cast *>(storage.data()); +template void copy_input(const multi_iter &it, + const cndarr &src, T *POCKETFFT_RESTRICT dst) + { + if (dst == &src[it.iofs(0)]) return; // in-place + for (size_t i=0; iexec (buf, fct, forward); - if (buf != &out[it.oofs(0)]) - for (size_t i=0; i void copy_output(const multi_iter &it, + const cmplx> *POCKETFFT_RESTRICT src, ndarr> &dst) + { + for (size_t i=0; i POCKETFFT_NOINLINE void general_hartley( - const cndarr &in, ndarr &out, const shape_t &axes, T fct, - size_t POCKETFFT_NTHREADS) +template void copy_output(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + if (src == &dst[it.oofs(0)]) return; // in-place + for (size_t i=0; i struct add_vec { using type = vtype_t; }; +template struct add_vec> + { using type = cmplx>; }; +template using add_vec_t = typename add_vec::type; + +template +POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, + const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec, + const bool allow_inplace=true) { - shared_ptr> plan; + shared_ptr plan; for (size_t iax=0; iax::val; + constexpr auto vlen = VLEN::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); + plan = get_plan(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); + auto storage = alloc_tmp(in.shape(), len, sizeof(T)); + const auto &tin(iax==0? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdatav, fct, true); - for (size_t j=0; j *>(storage.data()); + exec(it, tin, out, tdatav, *plan, fct); } #endif while (it.remaining()>0) { it.advance(1); - auto tdata = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdata, fct, true); - // Hartley order - out[it.oofs(0)] = tdata[0]; - size_t i=1, i1=1, i2=len-1; - for (i=1; i(storage.data()); + exec(it, tin, out, buf, *plan, fct); } } // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes + fct = T0(1); // factor has been applied, use 1 for remaining axes } } -template POCKETFFT_NOINLINE void general_dcst( - const cndarr &in, ndarr &out, const shape_t &axes, - T fct, bool ortho, int type, bool cosine, size_t POCKETFFT_NTHREADS) +struct ExecC2C { - shared_ptr plan; + bool forward; - for (size_t iax=0; iax void operator () ( + const multi_iter &it, const cndarr> &in, + ndarr> &out, T * buf, const pocketfft_c &plan, T0 fct) const { - constexpr auto vlen = VLEN::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan(len); + copy_input(it, in, buf); + plan.exec(buf, fct, forward); + copy_output(it, buf, out); + } + }; -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - using vtype = typename VTYPE::type; - it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdatav, fct, ortho, type, cosine); - for (size_t i=0; i0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] - : reinterpret_cast(storage.data()); +template void copy_hartley(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t j=0; jexec(buf, fct, ortho, type, cosine); - if (buf != &out[it.oofs(0)]) - for (size_t i=0; i void copy_hartley(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + dst[it.oofs(0)] = src[0]; + size_t i=1, i1=1, i2=it.length_out()-1; + for (i=1; i void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, + T * buf, const pocketfft_r &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, true); + copy_hartley(it, buf, out); + } + }; + +struct ExecDcst + { + bool ortho; + int type; + bool cosine; + + template + void operator () (const multi_iter &it, const cndarr &in, + ndarr &out, T * buf, const Tplan &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, ortho, type, cosine); + copy_output(it, buf, out); + } + }; + template POCKETFFT_NOINLINE void general_r2c( const cndarr &in, ndarr> &out, size_t axis, bool forward, T fct, size_t POCKETFFT_NTHREADS) @@ -3040,12 +3032,9 @@ template POCKETFFT_NOINLINE void general_r2c( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); - for (size_t i=0; i *>(storage.data()); + copy_input(it, in, tdatav); plan->exec(tdatav, fct, true); for (size_t j=0; j POCKETFFT_NOINLINE void general_r2c( { it.advance(1); auto tdata = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdata, fct, true); out[it.oofs(0)].Set(tdata[0]); size_t i=1, ii=1; @@ -3100,9 +3088,8 @@ template POCKETFFT_NOINLINE void general_c2r( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t j=0; j POCKETFFT_NOINLINE void general_c2r( tdatav[i][j] = in[it.iofs(j,ii)].r; } plan->exec(tdatav, fct, false); - for (size_t i=0; i0) @@ -3142,78 +3127,30 @@ template POCKETFFT_NOINLINE void general_c2r( tdata[i] = in[it.iofs(ii)].r; } plan->exec(tdata, fct, false); - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( - const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, - bool forward, T fct, size_t POCKETFFT_NTHREADS) +struct ExecR2R { - shared_ptr> plan; + bool r2c, forward; - for (size_t iax=0; iax void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, T * buf, + const pocketfft_r &plan, T0 fct) const { - constexpr auto vlen = VLEN::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); - -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - using vtype = typename VTYPE::type; - it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); - for (size_t i=0; iexec (tdatav, fct, forward); - if (r2c && (!forward)) - for (size_t i=2; i0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? - &out[it.oofs(0)] : reinterpret_cast(storage.data()); - - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; iexec(buf, fct, forward); - if (r2c && (!forward)) - for (size_t i=2; i void c2c(const shape_t &shape, const stride_t &stride_in, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr> ain(data_in, shape, stride_in); ndarr> aout(data_out, shape, stride_out); - general_c(ain, aout, axes, forward, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, ExecC2C{forward}); } template void dct(const shape_t &shape, @@ -3238,12 +3175,13 @@ template void dct(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, true}; if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); } template void dst(const shape_t &shape, @@ -3255,12 +3193,13 @@ template void dst(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, false}; if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); } template void r2c(const shape_t &shape_in, @@ -3344,7 +3283,8 @@ template void r2r_fftpack(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_r(ain, aout, axes, real2hermitian, forward, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); } template void r2r_separable_hartley(const shape_t &shape, @@ -3355,7 +3295,8 @@ template void r2r_separable_hartley(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_hartley(ain, aout, axes, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}, + false); } } // namespace detail diff --git a/test.py b/test.py index c718c8698f6e657028d17b8c615c578a09b00e25..e33837b9ab13417e562590539dafe25bf509dfa4 100644 --- a/test.py +++ b/test.py @@ -58,9 +58,18 @@ def irfft_scipy(a, axis, inorm=0, out=None, nthreads=1): tol = {np.float32: 6e-7, np.float64: 1.5e-15, np.longfloat: 1e-18} ctype = {np.float32: np.complex64, np.float64: np.complex128, np.longfloat: np.longcomplex} +import platform +on_wsl = "microsoft" in platform.uname()[3].lower() +true_long_double = (np.longfloat != np.float64 and not on_wsl) +dtypes = [np.float32, np.float64, + pytest.param(np.longfloat, marks=pytest.mark.xfail( + not true_long_double, + reason="Long double doesn't offer more precision"))] + + @pmp("len", len1D) @pmp("inorm", [0, 1, 2]) -@pmp("dtype", [np.float32, np.float64, np.longfloat]) +@pmp("dtype", dtypes) def test1D(len, inorm, dtype): a = np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j a = a.astype(ctype[dtype]) @@ -199,7 +208,7 @@ def test_genuine_hartley_2D(shp, axes): @pmp("len", len1D) @pmp("inorm", [0, 1]) # inorm==2 not needed, tested via inverse @pmp("type", [1, 2, 3, 4]) -@pmp("dtype", [np.float32, np.float64, np.longfloat]) +@pmp("dtype", dtypes) def testdcst1D(len, inorm, type, dtype): a = (np.random.rand(len)-0.5).astype(dtype) eps = tol[dtype]