diff --git a/bench_nd.py b/bench_nd.py
index 24329070eb17819ac018af19aeca8f26ffa66807..d770605599e84b291b38297774d6a07c876c7684 100644
--- a/bench_nd.py
+++ b/bench_nd.py
@@ -5,6 +5,8 @@ import pypocketfft
 from time import time
 import matplotlib.pyplot as plt
 
+def _l2error(a,b):
+    return np.sqrt(np.sum(np.abs(a-b)**2)/np.sum(np.abs(a)**2))
 
 def bench_nd_fftn(ndim, nmax, ntry, tp, nrepeat, filename=""):
     res=[]
@@ -23,6 +25,8 @@ def bench_nd_fftn(ndim, nmax, ntry, tp, nrepeat, filename=""):
             b=pypocketfft.fftn(a)
             t1=time()
             tmin_pp = min(tmin_pp,t1-t0)
+        a2=pypocketfft.ifftn(b,fct=1./a.size)
+        assert(_l2error(a,a2)<(2e-15 if tp=='c16' else 6e-7))
         res.append(tmin_pp/tmin_np)
     plt.title("t(pypocketfft / numpy 1.17), {}D, {}, max_extent={}".format(ndim, str(tp), nmax))
     plt.xlabel("time ratio")
diff --git a/pocketfft.cc b/pocketfft.cc
index 25bdf2265c3bc37013eb6b7d6a098b8ed1f8d8cf..26966bf557d82ef355af71a9c42f5cc3dd247bd9 100644
--- a/pocketfft.cc
+++ b/pocketfft.cc
@@ -2060,27 +2060,43 @@ template<> struct VTYPE<float>
 #endif
 
 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
-  size_t tmpsize, size_t elemsize)
+  size_t axsize, size_t elemsize)
   {
 #ifdef HAVE_VECSUPPORT
   using vtype = typename VTYPE<T>::type;
   constexpr int vlen = sizeof(vtype)/sizeof(T);
-  return arr<char>(tmpsize*elemsize*vlen);
+#else
+  constexpr int vlen = 1;
 #endif
+  size_t fullsize=1;
+  size_t ndim = shape.size();
+  for (size_t i=0; i<ndim; ++i)
+    fullsize*=shape[i];
+  auto othersize = fullsize/axsize;
+  auto tmpsize = axsize*((othersize>vlen) ? vlen : 1);
   return arr<char>(tmpsize*elemsize);
   }
 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
   const shape_t &axes, size_t elemsize)
   {
+#ifdef HAVE_VECSUPPORT
+  using vtype = typename VTYPE<T>::type;
+  constexpr int vlen = sizeof(vtype)/sizeof(T);
+#else
+  constexpr int vlen = 1;
+#endif
   size_t fullsize=1;
   size_t ndim = shape.size();
   for (size_t i=0; i<ndim; ++i)
     fullsize*=shape[i];
   size_t tmpsize=0;
   for (size_t i=0; i<axes.size(); ++i)
-    tmpsize = max(tmpsize, shape[axes[i]]);
-
-  return alloc_tmp<T>(shape, tmpsize, elemsize);
+    {
+    auto axsize = shape[axes[i]];
+    auto othersize = fullsize/axsize;
+    tmpsize = max(tmpsize, axsize*((othersize>vlen) ? vlen : 1));
+    }
+  return arr<char>(tmpsize*elemsize);
   }
 
 template<typename T> void pocketfft_general_c(const shape_t &shape,
@@ -2413,26 +2429,27 @@ shape_t makeaxes(const py::array &in, py::object axes)
   }
 
 template<typename T> py::array xfftn_internal(const py::array &in,
-  const shape_t &axes, double fct, bool fwd)
+  const shape_t &axes, double fct, bool inplace, bool fwd)
   {
   auto dims(copy_shape(in));
-  py::array res = py::array_t<complex<T>>(dims);
+  py::array res = inplace ? in : py::array_t<complex<T>>(dims);
   auto s_i(copy_strides(in)), s_o(copy_strides(res));
   pocketfft_general_c<T>(dims, s_i, s_o, axes, fwd, (const cmplx<T> *)in.data(),
     (cmplx<T> *)res.mutable_data(), fct);
   return res;
   }
 
-py::array xfftn(const py::array &a, py::object axes, double fct, bool fwd)
+py::array xfftn(const py::array &a, py::object axes, double fct, bool inplace,
+  bool fwd)
   {
   return tcheck(a, c128, c64) ?
-    xfftn_internal<double>(a, makeaxes(a, axes), fct, fwd) :
-    xfftn_internal<float> (a, makeaxes(a, axes), fct, fwd);
+    xfftn_internal<double>(a, makeaxes(a, axes), fct, inplace, fwd) :
+    xfftn_internal<float> (a, makeaxes(a, axes), fct, inplace, fwd);
   }
-py::array fftn(const py::array &a, py::object axes, double fct)
-  { return xfftn(a, axes, fct, true); }
-py::array ifftn(const py::array &a, py::object axes, double fct)
-  { return xfftn(a, axes, fct, false); }
+py::array fftn(const py::array &a, py::object axes, double fct, bool inplace)
+  { return xfftn(a, axes, fct, inplace, true); }
+py::array ifftn(const py::array &a, py::object axes, double fct, bool inplace)
+  { return xfftn(a, axes, fct, inplace, false); }
 
 template<typename T> py::array rfftn_internal(const py::array &in,
   py::object axes_, T fct)
@@ -2467,7 +2484,7 @@ template<typename T> py::array irfftn_internal(const py::array &in,
     shape_t axes2(axes.size()-1);
     for (size_t i=0; i<axes2.size(); ++i)
       axes2[i] = axes[i];
-    inter = xfftn_internal<T>(in, axes2, 1., false);
+    inter = xfftn_internal<T>(in, axes2, 1., false, false);
     }
   else
     inter = in;
@@ -2493,20 +2510,22 @@ py::array irfftn(const py::array &in, py::object axes_, size_t lastsize,
   }
 
 template<typename T> py::array hartley_internal(const py::array &in,
-  py::object axes_, double fct)
+  py::object axes_, double fct, bool inplace)
   {
   auto axes(makeaxes(in, axes_));
   auto dims(copy_shape(in));
-  py::array res = py::array_t<T>(dims);
+  py::array res = inplace ? in : py::array_t<T>(dims);
   auto s_i(copy_strides(in)), s_o(copy_strides(res));
   pocketfft_general_hartley<T>(dims, s_i, s_o, axes, (const T *)in.data(),
     (T *)res.mutable_data(), 1.);
   return res;
   }
-py::array hartley(const py::array &in, py::object axes_, double fct)
+py::array hartley(const py::array &in, py::object axes_, double fct,
+  bool inplace)
   {
-  return tcheck(in, f64, f32) ? hartley_internal<double>(in, axes_, fct)
-                              : hartley_internal<float> (in, axes_, fct);
+  return tcheck(in, f64, f32) ?
+    hartley_internal<double>(in, axes_, fct, inplace) :
+    hartley_internal<float> (in, axes_, fct, inplace);
   }
 
 template<typename T>py::array complex2hartley(const py::array &in,
@@ -2589,6 +2608,9 @@ axes : list of integers
     If not set, all axes will be transformed.
 fct : float
     Normalization factor
+inplace : bool
+    if False, returns the esult in a new array and leaves the input unchanged.
+    if True, stores the result in the input array and returns a handle to it.
 
 Returns
 -------
@@ -2607,6 +2629,9 @@ axes : list of integers
     If not set, all axes will be transformed.
 fct : float
     Normalization factor
+inplace : bool
+    if False, returns the esult in a new array and leaves the input unchanged.
+    if True, stores the result in the input array and returns a handle to it.
 
 Returns
 -------
@@ -2670,6 +2695,9 @@ axes : list of integers
     If not set, all axes will be transformed.
 fct : float
     Normalization factor
+inplace : bool
+    if False, returns the esult in a new array and leaves the input unchanged.
+    if True, stores the result in the input array and returns a handle to it.
 
 Returns
 -------
@@ -2684,11 +2712,15 @@ PYBIND11_MODULE(pypocketfft, m)
   using namespace pybind11::literals;
 
   m.doc() = pypocketfft_DS;
-  m.def("fftn",&fftn, fftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.);
-  m.def("ifftn",&ifftn, ifftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.);
+  m.def("fftn",&fftn, fftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.,
+    "inplace"_a=false);
+  m.def("ifftn",&ifftn, ifftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.,
+    "inplace"_a=false);
   m.def("rfftn",&rfftn, rfftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.);
-  m.def("irfftn",&irfftn, irfftn_DS, "a"_a, "axes"_a=py::none(), "lastsize"_a=0, "fct"_a=1.);
-  m.def("hartley",&hartley, hartley_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.);
+  m.def("irfftn",&irfftn, irfftn_DS, "a"_a, "axes"_a=py::none(), "lastsize"_a=0,
+    "fct"_a=1.);
+  m.def("hartley",&hartley, hartley_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.,
+    "inplace"_a=false);
   m.def("hartley2",&hartley2, "a"_a, "axes"_a=py::none(), "fct"_a=1.);
   m.def("complex2hartley",&mycomplex2hartley, "in"_a, "tmp"_a, "axes"_a);
   }
diff --git a/test.py b/test.py
index fa04db7bfd3ac1a19d34c3a948902fbf8d364b83..20815c4a91b0c3372f56f0525c09d0c85af88b74 100644
--- a/test.py
+++ b/test.py
@@ -20,8 +20,14 @@ def test1D(len):
     b=a.astype(np.complex64)
     assert(_l2error(a, pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./len))<1.5e-15)
     assert(_l2error(a.real, pypocketfft.irfftn(pypocketfft.rfftn(a.real),fct=1./len,lastsize=len))<1e-15)
+    tmp=a.copy()
+    assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp)
+    assert(_l2error(tmp, a)<1.5e-15)
     assert(_l2error(b, pypocketfft.ifftn(pypocketfft.fftn(b),fct=1./len))<5e-7)
     assert(_l2error(b.real, pypocketfft.irfftn(pypocketfft.rfftn(b.real),fct=1./len,lastsize=len))<5e-7)
+    tmp=b.copy()
+    assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./len, inplace=True) is tmp)
+    assert(_l2error(tmp, b)<5e-7)
 
 @pmp("shp", shapes)
 def test_fftn(shp):
@@ -62,6 +68,9 @@ def test_identity(shp):
     a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
     vmax = np.max(np.abs(a))
     assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1e-15*vmax)
+    tmp=a.copy()
+    assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./a.size, inplace=True) is tmp)
+    assert(_l2error(tmp, a)<1.5e-15*vmax)
     a=a.astype(np.complex64)
     assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<5e-7*vmax)