diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 7afc983cb08f8d41e6dc8786d637f6c1097e17bc..0b955ba5407b7efdd4e0d9bb3ba6ad7150aad18b 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2902,7 +2902,8 @@ template <typename T> using add_vec_t = typename add_vec<T>::type; template<typename Tplan, typename T, typename T0, typename Exec> POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out, - const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec) + const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec, + const bool allow_inplace=true) { shared_ptr<Tplan> plan; @@ -2932,7 +2933,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out, while (it.remaining()>0) { it.advance(1); - auto buf = it.stride_out() == sizeof(T) ? + auto buf = allow_inplace && it.stride_out() == sizeof(T) ? &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data()); exec(it, in, out, buf, *plan, fct); } @@ -2979,8 +2980,8 @@ template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it, size_t i=1, i1=1, i2=it.length_out()-1; for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2) { - dst[it.oofs(i1)] = src[i]+src[i+1]; - dst[it.oofs(i2)] = src[i]-src[i+1]; + dst[it.oofs(i1)] = src[i]+src[i+1]; + dst[it.oofs(i2)] = src[i]-src[i+1]; } if (i<it.length_out()) dst[it.oofs(i1)] = src[i]; @@ -3294,7 +3295,8 @@ template<typename T> void r2r_separable_hartley(const shape_t &shape, util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr<T> ain(data_in, shape, stride_in); ndarr<T> aout(data_out, shape, stride_out); - general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{}); + general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{}, + false); } } // namespace detail