diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 7afc983cb08f8d41e6dc8786d637f6c1097e17bc..0b955ba5407b7efdd4e0d9bb3ba6ad7150aad18b 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -2902,7 +2902,8 @@ template <typename T> using add_vec_t = typename add_vec<T>::type;
 
 template<typename Tplan, typename T, typename T0, typename Exec>
 POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
-  const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec)
+  const shape_t &axes, T0 fct, size_t POCKETFFT_NTHREADS, const Exec & exec,
+  const bool allow_inplace=true)
   {
   shared_ptr<Tplan> plan;
 
@@ -2932,7 +2933,7 @@ POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
     while (it.remaining()>0)
       {
       it.advance(1);
-      auto buf = it.stride_out() == sizeof(T) ?
+      auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
         &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
       exec(it, in, out, buf, *plan, fct);
       }
@@ -2979,8 +2980,8 @@ template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
   size_t i=1, i1=1, i2=it.length_out()-1;
   for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
     {
-      dst[it.oofs(i1)] = src[i]+src[i+1];
-      dst[it.oofs(i2)] = src[i]-src[i+1];
+    dst[it.oofs(i1)] = src[i]+src[i+1];
+    dst[it.oofs(i2)] = src[i]-src[i+1];
     }
   if (i<it.length_out())
     dst[it.oofs(i1)] = src[i];
@@ -3294,7 +3295,8 @@ template<typename T> void r2r_separable_hartley(const shape_t &shape,
   util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
   cndarr<T> ain(data_in, shape, stride_in);
   ndarr<T> aout(data_out, shape, stride_out);
-  general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{});
+  general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{},
+    false);
   }
 
 } // namespace detail