Commit 32cb03a0 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

avoid some unnecessary copying

parent 09034339
......@@ -26,7 +26,7 @@ def bench_nd_fftn(ndim, nmax, ntry, tp, nrepeat, filename=""):
t1=time()
tmin_pp = min(tmin_pp,t1-t0)
a2=pypocketfft.ifftn(b,fct=1./a.size)
assert(_l2error(a,a2)<(2e-15 if tp=='c16' else 6e-7))
assert(_l2error(a,a2)<(2.5e-15 if tp=='c16' else 6e-7))
res.append(tmin_pp/tmin_np)
plt.title("t(pypocketfft / numpy 1.17), {}D, {}, max_extent={}".format(ndim, str(tp), nmax))
plt.xlabel("time ratio")
......
......@@ -2061,13 +2061,13 @@ template<> struct VTYPE<float>
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
{
using type = double;
constexpr int vlen=1;
using type = double __attribute__ ((vector_size (8)));
static constexpr int vlen=1;
};
template<> struct VTYPE<float>
{
using type = float;
constexpr int vlen=1;
using type = float __attribute__ ((vector_size (4)));
static constexpr int vlen=1;
};
#endif
......@@ -2146,6 +2146,18 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
data_out[p_o[j]+i*it_out.stride()].Set(tdatav[i].r[j],tdatav[i].i[j]);
}
while (it_in.remaining()>0)
{
if ((data_in==data_out) && it_in.stride()==1) // fully in-place
forward ? plan->forward((T *)(data_in+it_in.offset()), fct)
: plan->backward((T *)(data_in+it_in.offset()), fct);
else if (it_out.stride()==1) // compute FFT in output location
{
for (size_t i=0; i<it_out.length(); ++i)
data_out[it_out.offset()+i] = data_in[it_in.offset()+i*it_in.stride()];
forward ? plan->forward((T *)(data_out+it_out.offset()), fct)
: plan->backward((T *)(data_out+it_out.offset()), fct);
}
else
{
for (size_t i=0; i<it_in.length(); ++i)
tdata[i] = data_in[it_in.offset() + i*it_in.stride()];
......@@ -2153,6 +2165,7 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
: plan->backward((T *)tdata, fct);
for (size_t i=0; i<it_out.length(); ++i)
data_out[it_out.offset()+i*it_out.stride()] = tdata[i];
}
it_in.advance();
it_out.advance();
}
......@@ -2658,7 +2671,7 @@ np.ndarray (np.float32 or np.float64)
entries.
)DELIM";
const char *hartley_DS = R"DELIM(Performs a Hartley FFT.
const char *hartley_DS = R"DELIM(Performs a Hartley transform.
For every requested axis, a 1D forward Fourier transform is carried out,
and the sum of real and imaginary parts of the result is stored in the output
array.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment