From cb72d6026521c760b1344f948e262252d85afc14 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sat, 3 Aug 2019 17:17:44 +0100 Subject: [PATCH 1/7] Use alias templates for vtype --- pocketfft_hdronly.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index e6bfd73..b1d5a4d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2812,6 +2812,8 @@ template<> struct VTYPE { using type = long double __attribute__ ((vector_size (VLEN::val*sizeof(long double)))); }; + +template using vtype_t = typename VTYPE::type; #endif template arr alloc_tmp(const shape_t &shape, @@ -2866,9 +2868,8 @@ template POCKETFFT_NOINLINE void general_c( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); + auto tdatav = reinterpret_cast> *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_hartley( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_dcst( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_r2c( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i POCKETFFT_NOINLINE void general_c2r( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t j=0; j POCKETFFT_NOINLINE void general_r( if (vlen>1) while (it.remaining()>=vlen) { - using vtype = typename VTYPE::type; it.advance(vlen); - auto tdatav = reinterpret_cast(storage.data()); + auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; i Date: Sat, 3 Aug 2019 17:57:40 +0100 Subject: [PATCH 2/7] Isolate copying into its own function --- pocketfft_hdronly.h | 178 +++++++++++++++++++++++++------------------- 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index b1d5a4d..305fa4d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2844,6 +2844,57 @@ template arr alloc_tmp(const shape_t &shape, #define POCKETFFT_NTHREADS #endif +template void copy_input(const multi_iter &it, + const cndarr> &src, cmplx> *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i void copy_input(const multi_iter &it, + const cndarr &src, vtype_t *POCKETFFT_RESTRICT dst) + { + for (size_t i=0; i void copy_input(const multi_iter &it, + const cndarr &src, T *POCKETFFT_RESTRICT dst) + { + if (dst == &src[it.iofs(0)]) return; // in-place + for (size_t i=0; i void copy_output(const multi_iter &it, + const cmplx> *POCKETFFT_RESTRICT src, ndarr> &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t i=0; i void copy_output(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + if (src == &dst[it.oofs(0)]) return; // in-place + for (size_t i=0; i POCKETFFT_NOINLINE void general_c( const cndarr> &in, ndarr> &out, const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS) @@ -2870,16 +2921,9 @@ template POCKETFFT_NOINLINE void general_c( { it.advance(vlen); auto tdatav = reinterpret_cast> *>(storage.data()); - for (size_t i=0; iexec (tdatav, fct, forward); - for (size_t i=0; i0) @@ -2888,19 +2932,46 @@ template POCKETFFT_NOINLINE void general_c( auto buf = it.stride_out() == sizeof(cmplx) ? &out[it.oofs(0)] : reinterpret_cast *>(storage.data()); - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; iexec (buf, fct, forward); - if (buf != &out[it.oofs(0)]) - for (size_t i=0; i void copy_hartley(const multi_iter &it, + const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) + { + for (size_t j=0; j void copy_hartley(const multi_iter &it, + const T *POCKETFFT_RESTRICT src, ndarr &dst) + { + dst[it.oofs(0)] = src[0]; + size_t i=1, i1=1, i2=it.length_out()-1; + for (i=1; i POCKETFFT_NOINLINE void general_hartley( const cndarr &in, ndarr &out, const shape_t &axes, T fct, size_t POCKETFFT_NTHREADS) @@ -2927,41 +2998,18 @@ template POCKETFFT_NOINLINE void general_hartley( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec(tdatav, fct, true); - for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdata, fct, true); - // Hartley order - out[it.oofs(0)] = tdata[0]; - size_t i=1, i1=1, i2=len-1; - for (i=1; i POCKETFFT_NOINLINE void general_dcst( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec(tdatav, fct, ortho, type, cosine); - for (size_t i=0; i0) @@ -3009,13 +3053,9 @@ template POCKETFFT_NOINLINE void general_dcst( auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; iexec(buf, fct, ortho, type, cosine); - if (buf != &out[it.oofs(0)]) - for (size_t i=0; i POCKETFFT_NOINLINE void general_r2c( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; iexec(tdatav, fct, true); for (size_t j=0; j POCKETFFT_NOINLINE void general_r2c( { it.advance(1); auto tdata = reinterpret_cast(storage.data()); - for (size_t i=0; iexec(tdata, fct, true); out[it.oofs(0)].Set(tdata[0]); size_t i=1, ii=1; @@ -3117,9 +3154,7 @@ template POCKETFFT_NOINLINE void general_c2r( tdatav[i][j] = in[it.iofs(j,ii)].r; } plan->exec(tdatav, fct, false); - for (size_t i=0; i0) @@ -3139,8 +3174,7 @@ template POCKETFFT_NOINLINE void general_c2r( tdata[i] = in[it.iofs(ii)].r; } plan->exec(tdata, fct, false); - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( if (r2c && (!forward)) for (size_t i=2; i0) @@ -3192,9 +3222,7 @@ template POCKETFFT_NOINLINE void general_r( auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); - if (buf != &tin[it.iofs(0)]) - for (size_t i=0; i POCKETFFT_NOINLINE void general_r( if (r2c && (!forward)) for (size_t i=2; i Date: Sat, 3 Aug 2019 20:29:47 +0100 Subject: [PATCH 3/7] Refactor n-dimensional general_x functions --- pocketfft_hdronly.h | 239 +++++++++++++++++--------------------------- 1 file changed, 91 insertions(+), 148 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 305fa4d..895ee0d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2895,24 +2895,29 @@ template void copy_output(const multi_iter &it, dst[it.oofs(i)] = src[i]; } -template POCKETFFT_NOINLINE void general_c( - const cndarr> &in, ndarr> &out, - const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS) +template struct add_vec { using type = vtype_t; }; +template struct add_vec> + { using type = cmplx>; }; +template using add_vec_t = typename add_vec::type; + +template +POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, + const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec) { - shared_ptr> plan; + shared_ptr plan; for (size_t iax=0; iax::val; + constexpr auto vlen = VLEN::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); + plan = get_plan(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { - auto storage = alloc_tmp(in.shape(), len, sizeof(cmplx)); + auto storage = alloc_tmp(in.shape(), len, sizeof(T)); const auto &tin(iax==0? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS @@ -2920,27 +2925,43 @@ template POCKETFFT_NOINLINE void general_c( while (it.remaining()>=vlen) { it.advance(vlen); - auto tdatav = reinterpret_cast> *>(storage.data()); - copy_input(it, tin, tdatav); - plan->exec (tdatav, fct, forward); - copy_output(it, tdatav, out); + auto tdatav = reinterpret_cast *>(storage.data()); + exec(it, in, out, tdatav, *plan, fct); } #endif while (it.remaining()>0) { it.advance(1); - auto buf = it.stride_out() == sizeof(cmplx) ? - &out[it.oofs(0)] : reinterpret_cast *>(storage.data()); - - copy_input(it, tin, buf); - plan->exec (buf, fct, forward); - copy_output(it, buf, out); + auto buf = it.stride_out() == sizeof(T) ? + &out[it.oofs(0)] : reinterpret_cast(storage.data()); + exec(it, in, out, buf, *plan, fct); } } // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes + fct = T0(1); // factor has been applied, use 1 for remaining axes } } +struct ExecC2C + { + bool forward; + + template void operator () ( + const multi_iter &it, const cndarr> &in, + ndarr> &out, T * buf, const pocketfft_c &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, forward); + copy_output(it, buf, out); + } + }; + +template void general_c( + const cndarr> &in, ndarr> &out, + const shape_t &axes, bool forward, T fct, size_t nthreads) + { + general_nd>(in, out, axes, fct, nthreads, ExecC2C{forward}); + } + template void copy_hartley(const multi_iter &it, const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) { @@ -2972,94 +2993,46 @@ template void copy_hartley(const multi_iter &it, dst[it.oofs(i1)] = src[i]; } +struct ExecHartley + { + template void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, + T * buf, const pocketfft_r &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, true); + copy_hartley(it, buf, out); + } + }; + template POCKETFFT_NOINLINE void general_hartley( const cndarr &in, ndarr &out, const shape_t &axes, T fct, - size_t POCKETFFT_NTHREADS) + size_t nthreads) { - shared_ptr> plan; + general_nd>(in, out, axes, fct, nthreads, ExecHartley{}); + } - for (size_t iax=0; iax::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); +struct ExecDcst + { + bool ortho; + int type; + bool cosine; -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - copy_input(it, tin, tdatav); - plan->exec(tdatav, fct, true); - copy_hartley(it, tdatav, out); - } -#endif - while (it.remaining()>0) - { - it.advance(1); - auto tdata = reinterpret_cast(storage.data()); - copy_input(it, tin, tdata); - plan->exec(tdata, fct, true); - copy_hartley(it, tdata, out); - } -} // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes + template + void operator () (const multi_iter &it, const cndarr &in, + ndarr &out, T * buf, const Tplan &plan, T0 fct) const + { + copy_input(it, in, buf); + plan.exec(buf, fct, ortho, type, cosine); + copy_output(it, buf, out); } - } + }; template POCKETFFT_NOINLINE void general_dcst( const cndarr &in, ndarr &out, const shape_t &axes, - T fct, bool ortho, int type, bool cosine, size_t POCKETFFT_NTHREADS) + T fct, bool ortho, int type, bool cosine, size_t nthreads) { - shared_ptr plan; - - for (size_t iax=0; iax::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan(len); - -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - copy_input(it, tin, tdatav); - plan->exec(tdatav, fct, ortho, type, cosine); - copy_output(it, tdatav, out); - } -#endif - while (it.remaining()>0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? &out[it.oofs(0)] - : reinterpret_cast(storage.data()); - - copy_input(it, tin, buf); - plan->exec(buf, fct, ortho, type, cosine); - copy_output(it, buf, out); - } -} // end of parallel region - fct = T(1); // factor has been applied, use 1 for remaining axes - } + general_nd(in, out, axes, fct, nthreads, ExecDcst{ortho, type, cosine}); } template POCKETFFT_NOINLINE void general_r2c( @@ -3179,62 +3152,32 @@ template POCKETFFT_NOINLINE void general_c2r( } // end of parallel region } -template POCKETFFT_NOINLINE void general_r( - const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, - bool forward, T fct, size_t POCKETFFT_NTHREADS) +struct ExecR2R { - shared_ptr> plan; + bool r2c, forward; - for (size_t iax=0; iax void operator () ( + const multi_iter &it, const cndarr &in, ndarr &out, T * buf, + const pocketfft_r &plan, T0 fct) const { - constexpr auto vlen = VLEN::val; - size_t len=in.shape(axes[iax]); - if ((!plan) || (len!=plan->length())) - plan = get_plan>(len); - -#ifdef POCKETFFT_OPENMP -#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) -#endif -{ - auto storage = alloc_tmp(in.shape(), len, sizeof(T)); - const auto &tin(iax==0 ? in : out); - multi_iter it(tin, out, axes[iax]); -#ifndef POCKETFFT_NO_VECTORS - if (vlen>1) - while (it.remaining()>=vlen) - { - it.advance(vlen); - auto tdatav = reinterpret_cast *>(storage.data()); - copy_input(it, tin, tdatav); - if ((!r2c) && forward) - for (size_t i=2; iexec (tdatav, fct, forward); - if (r2c && (!forward)) - for (size_t i=2; i0) - { - it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? - &out[it.oofs(0)] : reinterpret_cast(storage.data()); - - copy_input(it, tin, buf); - if ((!r2c) && forward) - for (size_t i=2; iexec(buf, fct, forward); - if (r2c && (!forward)) - for (size_t i=2; i POCKETFFT_NOINLINE void general_r( + const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, + bool forward, T fct, size_t nthreads) + { + general_nd>(in, out, axes, fct, nthreads, + ExecR2R{r2c, forward}); } #undef POCKETFFT_NTHREADS -- GitLab From 1971f449e91330b4e92565c527caae832ad8815d Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 4 Aug 2019 01:10:08 +0100 Subject: [PATCH 4/7] Remove specific general_ functions {c, r, hartley & dcst} --- pocketfft_hdronly.h | 50 +++++++++++---------------------------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 895ee0d..7afc983 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2955,13 +2955,6 @@ struct ExecC2C } }; -template void general_c( - const cndarr> &in, ndarr> &out, - const shape_t &axes, bool forward, T fct, size_t nthreads) - { - general_nd>(in, out, axes, fct, nthreads, ExecC2C{forward}); - } - template void copy_hartley(const multi_iter &it, const vtype_t *POCKETFFT_RESTRICT src, ndarr &dst) { @@ -3005,13 +2998,6 @@ struct ExecHartley } }; -template POCKETFFT_NOINLINE void general_hartley( - const cndarr &in, ndarr &out, const shape_t &axes, T fct, - size_t nthreads) - { - general_nd>(in, out, axes, fct, nthreads, ExecHartley{}); - } - struct ExecDcst { bool ortho; @@ -3028,13 +3014,6 @@ struct ExecDcst } }; -template POCKETFFT_NOINLINE void general_dcst( - const cndarr &in, ndarr &out, const shape_t &axes, - T fct, bool ortho, int type, bool cosine, size_t nthreads) - { - general_nd(in, out, axes, fct, nthreads, ExecDcst{ortho, type, cosine}); - } - template POCKETFFT_NOINLINE void general_r2c( const cndarr &in, ndarr> &out, size_t axis, bool forward, T fct, size_t POCKETFFT_NTHREADS) @@ -3172,14 +3151,6 @@ struct ExecR2R } }; -template POCKETFFT_NOINLINE void general_r( - const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, - bool forward, T fct, size_t nthreads) - { - general_nd>(in, out, axes, fct, nthreads, - ExecR2R{r2c, forward}); - } - #undef POCKETFFT_NTHREADS template void c2c(const shape_t &shape, const stride_t &stride_in, @@ -3191,7 +3162,7 @@ template void c2c(const shape_t &shape, const stride_t &stride_in, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr> ain(data_in, shape, stride_in); ndarr> aout(data_out, shape, stride_out); - general_c(ain, aout, axes, forward, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, ExecC2C{forward}); } template void dct(const shape_t &shape, @@ -3203,12 +3174,13 @@ template void dct(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, true}; if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); } template void dst(const shape_t &shape, @@ -3220,12 +3192,13 @@ template void dst(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); + const ExecDcst exec{ortho, type, false}; if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); else - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, exec); } template void r2c(const shape_t &shape_in, @@ -3309,7 +3282,8 @@ template void r2r_fftpack(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_r(ain, aout, axes, real2hermitian, forward, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); } template void r2r_separable_hartley(const shape_t &shape, @@ -3320,7 +3294,7 @@ template void r2r_separable_hartley(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_hartley(ain, aout, axes, fct, nthreads); + general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}); } } // namespace detail -- GitLab From ac891a3056b935185205626a6da9998e51dae8fb Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 4 Aug 2019 02:34:43 +0100 Subject: [PATCH 5/7] Fix hartley transforms by disallowing inplace FFT output --- pocketfft_hdronly.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 7afc983..0b955ba 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2902,7 +2902,8 @@ template using add_vec_t = typename add_vec::type; template POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, - const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec) + const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec, + const bool allow_inplace=true) { shared_ptr plan; @@ -2932,7 +2933,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, while (it.remaining()>0) { it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? + auto buf = allow_inplace && it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); exec(it, in, out, buf, *plan, fct); } @@ -2979,8 +2980,8 @@ template void copy_hartley(const multi_iter &it, size_t i=1, i1=1, i2=it.length_out()-1; for (i=1; i void r2r_separable_hartley(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); - general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}); + general_nd>(ain, aout, axes, fct, nthreads, ExecHartley{}, + false); } } // namespace detail -- GitLab From 15278280a70f11f1a0ecb0f17851f5dd13d3614c Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sun, 4 Aug 2019 15:29:35 +0200 Subject: [PATCH 6/7] fix --- pocketfft_hdronly.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 0b955ba..d1eec5d 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2927,7 +2927,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); - exec(it, in, out, tdatav, *plan, fct); + exec(it, tin, out, tdatav, *plan, fct); } #endif while (it.remaining()>0) @@ -2935,7 +2935,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr &in, ndarr &out, it.advance(1); auto buf = allow_inplace && it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast(storage.data()); - exec(it, in, out, buf, *plan, fct); + exec(it, tin, out, buf, *plan, fct); } } // end of parallel region fct = T0(1); // factor has been applied, use 1 for remaining axes -- GitLab From 8aa2d5cd467187ec895a45498164aa27608dde41 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 4 Aug 2019 16:57:20 +0100 Subject: [PATCH 7/7] xfail long double tests on WSL and platforms without extended precision --- test.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index c718c86..e33837b 100644 --- a/test.py +++ b/test.py @@ -58,9 +58,18 @@ def irfft_scipy(a, axis, inorm=0, out=None, nthreads=1): tol = {np.float32: 6e-7, np.float64: 1.5e-15, np.longfloat: 1e-18} ctype = {np.float32: np.complex64, np.float64: np.complex128, np.longfloat: np.longcomplex} +import platform +on_wsl = "microsoft" in platform.uname()[3].lower() +true_long_double = (np.longfloat != np.float64 and not on_wsl) +dtypes = [np.float32, np.float64, + pytest.param(np.longfloat, marks=pytest.mark.xfail( + not true_long_double, + reason="Long double doesn't offer more precision"))] + + @pmp("len", len1D) @pmp("inorm", [0, 1, 2]) -@pmp("dtype", [np.float32, np.float64, np.longfloat]) +@pmp("dtype", dtypes) def test1D(len, inorm, dtype): a = np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j a = a.astype(ctype[dtype]) @@ -199,7 +208,7 @@ def test_genuine_hartley_2D(shp, axes): @pmp("len", len1D) @pmp("inorm", [0, 1]) # inorm==2 not needed, tested via inverse @pmp("type", [1, 2, 3, 4]) -@pmp("dtype", [np.float32, np.float64, np.longfloat]) +@pmp("dtype", dtypes) def testdcst1D(len, inorm, type, dtype): a = (np.random.rand(len)-0.5).astype(dtype) eps = tol[dtype] -- GitLab