Commit 32cb03a0 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

avoid some unnecessary copying

parent 09034339
...@@ -26,7 +26,7 @@ def bench_nd_fftn(ndim, nmax, ntry, tp, nrepeat, filename=""): ...@@ -26,7 +26,7 @@ def bench_nd_fftn(ndim, nmax, ntry, tp, nrepeat, filename=""):
t1=time() t1=time()
tmin_pp = min(tmin_pp,t1-t0) tmin_pp = min(tmin_pp,t1-t0)
a2=pypocketfft.ifftn(b,fct=1./a.size) a2=pypocketfft.ifftn(b,fct=1./a.size)
assert(_l2error(a,a2)<(2e-15 if tp=='c16' else 6e-7)) assert(_l2error(a,a2)<(2.5e-15 if tp=='c16' else 6e-7))
res.append(tmin_pp/tmin_np) res.append(tmin_pp/tmin_np)
plt.title("t(pypocketfft / numpy 1.17), {}D, {}, max_extent={}".format(ndim, str(tp), nmax)) plt.title("t(pypocketfft / numpy 1.17), {}D, {}, max_extent={}".format(ndim, str(tp), nmax))
plt.xlabel("time ratio") plt.xlabel("time ratio")
......
...@@ -2061,13 +2061,13 @@ template<> struct VTYPE<float> ...@@ -2061,13 +2061,13 @@ template<> struct VTYPE<float>
template<typename T> struct VTYPE{}; template<typename T> struct VTYPE{};
template<> struct VTYPE<double> template<> struct VTYPE<double>
{ {
using type = double; using type = double __attribute__ ((vector_size (8)));
constexpr int vlen=1; static constexpr int vlen=1;
}; };
template<> struct VTYPE<float> template<> struct VTYPE<float>
{ {
using type = float; using type = float __attribute__ ((vector_size (4)));
constexpr int vlen=1; static constexpr int vlen=1;
}; };
#endif #endif
...@@ -2147,12 +2147,25 @@ template<typename T> void pocketfft_general_c(const shape_t &shape, ...@@ -2147,12 +2147,25 @@ template<typename T> void pocketfft_general_c(const shape_t &shape,
} }
while (it_in.remaining()>0) while (it_in.remaining()>0)
{ {
for (size_t i=0; i<it_in.length(); ++i) if ((data_in==data_out) && it_in.stride()==1) // fully in-place
tdata[i] = data_in[it_in.offset() + i*it_in.stride()]; forward ? plan->forward((T *)(data_in+it_in.offset()), fct)
forward ? plan->forward((T *)tdata, fct) : plan->backward((T *)(data_in+it_in.offset()), fct);
: plan->backward((T *)tdata, fct); else if (it_out.stride()==1) // compute FFT in output location
for (size_t i=0; i<it_out.length(); ++i) {
data_out[it_out.offset()+i*it_out.stride()] = tdata[i]; for (size_t i=0; i<it_out.length(); ++i)
data_out[it_out.offset()+i] = data_in[it_in.offset()+i*it_in.stride()];
forward ? plan->forward((T *)(data_out+it_out.offset()), fct)
: plan->backward((T *)(data_out+it_out.offset()), fct);
}
else
{
for (size_t i=0; i<it_in.length(); ++i)
tdata[i] = data_in[it_in.offset() + i*it_in.stride()];
forward ? plan->forward((T *)tdata, fct)
: plan->backward((T *)tdata, fct);
for (size_t i=0; i<it_out.length(); ++i)
data_out[it_out.offset()+i*it_out.stride()] = tdata[i];
}
it_in.advance(); it_in.advance();
it_out.advance(); it_out.advance();
} }
...@@ -2658,7 +2671,7 @@ np.ndarray (np.float32 or np.float64) ...@@ -2658,7 +2671,7 @@ np.ndarray (np.float32 or np.float64)
entries. entries.
)DELIM"; )DELIM";
const char *hartley_DS = R"DELIM(Performs a Hartley FFT. const char *hartley_DS = R"DELIM(Performs a Hartley transform.
For every requested axis, a 1D forward Fourier transform is carried out, For every requested axis, a 1D forward Fourier transform is carried out,
and the sum of real and imaginary parts of the result is stored in the output and the sum of real and imaginary parts of the result is stored in the output
array. array.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment