Commit 65e1c0de authored by Martin Reinecke's avatar Martin Reinecke

Merge branch 'ducc0' of gitlab.mpcdf.mpg.de:mtr/ducc into ducc0

parents d15ceb46 bef610d2
Pipeline #94011 passed with stages
in 21 minutes
......@@ -16,7 +16,7 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Copyright (C) 2019-2020 Max-Planck-Society
/* Copyright (C) 2019-2021 Max-Planck-Society
Author: Martin Reinecke */
#ifndef DUCC0_CMPLX_H
......@@ -43,6 +43,7 @@ template<typename T> struct Cmplx {
r = tmp;
return *this;
}
Cmplx conj() const { return {r, -i}; }
template<typename T2>Cmplx &operator+= (const Cmplx<T2> &other)
{ r+=other.r; i+=other.i; return *this; }
template<typename T2>Cmplx &operator-= (const Cmplx<T2> &other)
......
/*
This file is part of pocketfft.
Copyright (C) 2010-2020 Max-Planck-Society
Copyright (C) 2010-2021 Max-Planck-Society
Copyright (C) 2019 Peter Bell
For the odd-sized DCT-IV transforms:
......@@ -138,26 +138,34 @@ template<typename T0> class T_dct1
DUCC0_NOINLINE T_dct1(size_t length)
: fftplan(2*(length-1)) {}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho,
int /*type*/, bool /*cosine*/) const
{
constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
size_t N=fftplan.length(), n=N/2+1;
if (ortho)
{ c[0]*=sqrt2; c[n-1]*=sqrt2; }
aligned_array<T> tmp(N);
auto tmp=&buf[0];
tmp[0] = c[0];
for (size_t i=1; i<n; ++i)
tmp[i] = tmp[N-i] = c[i];
fftplan.exec(tmp.data(), fct, true);
c[0] = tmp[0];
auto res = fftplan.exec(tmp, &buf[N], fct, true);
c[0] = res[0];
for (size_t i=1; i<n; ++i)
c[i] = tmp[2*i-1];
c[i] = res[2*i-1];
if (ortho)
{ c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
return c;
}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
int /*type*/, bool /*cosine*/) const
{
aligned_array<T> buf(bufsize());
exec(c, buf.data(), fct, ortho, 1, true);
}
size_t length() const { return fftplan.length()/2+1; }
size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); }
};
template<typename T0> class T_dst1
......@@ -169,20 +177,28 @@ template<typename T0> class T_dst1
DUCC0_NOINLINE T_dst1(size_t length)
: fftplan(2*(length+1)) {}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct,
template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct,
bool /*ortho*/, int /*type*/, bool /*cosine*/) const
{
size_t N=fftplan.length(), n=N/2-1;
aligned_array<T> tmp(N);
auto tmp = &buf[0];
tmp[0] = tmp[n+1] = c[0]*0;
for (size_t i=0; i<n; ++i)
{ tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
fftplan.exec(tmp.data(), fct, true);
auto res = fftplan.exec(tmp, buf+N, fct, true);
for (size_t i=0; i<n; ++i)
c[i] = -tmp[2*i+2];
c[i] = -res[2*i+2];
return c;
}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct,
bool /*ortho*/, int /*type*/, bool /*cosine*/) const
{
aligned_array<T> buf(bufsize());
exec(c, buf.data(), fct, true, 1, false);
}
size_t length() const { return fftplan.length()/2-1; }
size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); }
};
template<typename T0> class T_dcst23
......@@ -200,7 +216,7 @@ template<typename T0> class T_dcst23
twiddle[i] = tw[i+1].r;
}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho,
int type, bool cosine) const
{
constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
......@@ -215,15 +231,16 @@ template<typename T0> class T_dcst23
if ((N&1)==0) c[N-1]*=2;
for (size_t k=1; k<N-1; k+=2)
MPINPLACE(c[k+1], c[k]);
fftplan.exec(c, fct, false);
auto res = fftplan.exec(c, buf, fct, false);
c[0] = res[0];
for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
{
T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k];
T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc];
T t1 = twiddle[k-1]*res[kc]+twiddle[kc-1]*res[k];
T t2 = twiddle[k-1]*res[k]-twiddle[kc-1]*res[kc];
c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
}
if ((N&1)==0)
c[NS2] *= twiddle[NS2-1];
c[NS2] = res[NS2]*twiddle[NS2-1];
if (!cosine)
for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
std::swap(c[k], c[kc]);
......@@ -243,16 +260,26 @@ template<typename T0> class T_dcst23
}
if ((N&1)==0)
c[NS2] *= 2*twiddle[NS2-1];
fftplan.exec(c, fct, true);
auto res = fftplan.exec(c, buf, fct, true);
if (res != c) // FIXME: not yet optimal
memcpy(c, res, N*sizeof(T));
for (size_t k=1; k<N-1; k+=2)
MPINPLACE(c[k], c[k+1]);
if (!cosine)
for (size_t k=1; k<N; k+=2)
c[k] = -c[k];
}
return c;
}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
int type, bool cosine) const
{
aligned_array<T> buf(bufsize());
exec(c, &buf[0], fct, ortho, type, cosine);
}
size_t length() const { return fftplan.length(); }
size_t bufsize() const { return fftplan.bufsize(); }
};
template<typename T0> class T_dcst4
......@@ -278,7 +305,7 @@ template<typename T0> class T_dcst4
}
}
template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct,
template<typename T> DUCC0_NOINLINE T *exec(T c[], T /*buf*/[], T0 fct,
bool /*ortho*/, int /*type*/, bool cosine) const
{
size_t n2 = N/2;
......@@ -350,9 +377,17 @@ template<typename T0> class T_dcst4
if (!cosine)
for (size_t k=1; k<N; k+=2)
c[k] = -c[k];
return c;
}
template<typename T> DUCC0_NOINLINE T *exec(T c[], T0 fct,
bool /*ortho*/, int /*type*/, bool cosine) const
{
exec(c, nullptr, fct, true, 4, cosine);
}
size_t length() const { return N; }
size_t bufsize() const { return 0; }
};
......@@ -556,9 +591,19 @@ template<typename T, typename T0> DUCC0_NOINLINE aligned_array<T> alloc_tmp
{
auto othersize = info.size()/axsize;
constexpr auto vlen = native_simd<T0>::size();
// FIXME: when switching to C++20, use bit_floor(othersize)
return aligned_array<T>(axsize*std::min(vlen, othersize));
}
template<typename T, typename T0> DUCC0_NOINLINE aligned_array<T> alloc_tmp
(const fmav_info &info, size_t axsize, size_t bufsize)
{
auto othersize = info.size()/axsize;
constexpr auto vlen = native_simd<T0>::size();
// FIXME: when switching to C++20, use bit_floor(othersize)
return aligned_array<T>((axsize+bufsize)*std::min(vlen, othersize));
}
template <typename Tsimd, typename Titer> DUCC0_NOINLINE void copy_input(const Titer &it,
const fmav<Cmplx<typename Tsimd::Ts>> &src, Cmplx<Tsimd> *DUCC0_RESTRICT dst)
{
......@@ -733,8 +778,8 @@ template <typename T, size_t vlen> using add_vec_t = typename add_vec<T, vlen>::
template<typename Tplan, typename T, typename T0, typename Exec>
DUCC0_NOINLINE void general_nd(const fmav<T> &in, fmav<T> &out,
const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
const bool allow_inplace=true)
const shape_t &axes, T0 fct, size_t nthreads, const Exec &exec,
const bool /*allow_inplace*/=true)
{
std::unique_ptr<Tplan> plan;
......@@ -748,7 +793,7 @@ DUCC0_NOINLINE void general_nd(const fmav<T> &in, fmav<T> &out,
util::thread_count(nthreads, in, axes[iax], native_simd<T0>::size()),
[&](Scheduler &sched) {
constexpr auto vlen = native_simd<T0>::size();
auto storage = alloc_tmp<T,T0>(in, len);
auto storage = alloc_tmp<T,T0>(in, len, plan->bufsize());
const auto &tin(iax==0? in : out);
multi_iter<vlen> it(tin, out, axes[iax], sched.num_threads(), sched.thread_num());
#ifndef DUCC0_NO_SIMD
......@@ -777,9 +822,7 @@ DUCC0_NOINLINE void general_nd(const fmav<T> &in, fmav<T> &out,
while (it.remaining()>0)
{
it.advance(1);
auto buf = allow_inplace && it.stride_out() == 1 ?
&out.vraw(it.oofs(0)) : reinterpret_cast<T *>(storage.data());
exec(it, tin, out, buf, *plan, fct);
exec(it, tin, out, storage.data(), *plan, fct);
}
}); // end of parallel region
fct = T0(1); // factor has been applied, use 1 for remaining axes
......@@ -794,9 +837,10 @@ struct ExecC2C
const Titer &it, const fmav<Cmplx<T0>> &in,
fmav<Cmplx<T0>> &out, T *buf, const pocketfft_c<T0> &plan, T0 fct) const
{
copy_input(it, in, buf);
plan.exec(buf, fct, forward);
copy_output(it, buf, out);
T *buf1=buf, *buf2=buf+plan.bufsize();
copy_input(it, in, buf2);
auto res = plan.exec(buf2, buf1, fct, forward);
copy_output(it, res, out);
}
};
......@@ -892,11 +936,12 @@ struct ExecHartley
{
template <typename T0, typename T, typename Titer> DUCC0_NOINLINE void operator () (
const Titer &it, const fmav<T0> &in, fmav<T0> &out,
T * buf, const pocketfft_r<T0> &plan, T0 fct) const
T *buf, const pocketfft_r<T0> &plan, T0 fct) const
{
copy_input(it, in, buf);
plan.exec(buf, fct, true);
copy_hartley(it, buf, out);
T *buf1=buf, *buf2=buf+plan.bufsize();
copy_input(it, in, buf2);
auto res = plan.exec(buf2, buf1, fct, true);
copy_hartley(it, res, out);
}
};
......@@ -910,9 +955,10 @@ struct ExecDcst
DUCC0_NOINLINE void operator () (const Titer &it, const fmav<T0> &in,
fmav <T0> &out, T * buf, const Tplan &plan, T0 fct) const
{
copy_input(it, in, buf);
plan.exec(buf, fct, ortho, type, cosine);
copy_output(it, buf, out);
T *buf1=buf, *buf2=buf+plan.bufsize();
copy_input(it, in, buf2);
auto res = plan.exec(buf2, buf1, fct, ortho, type, cosine);
copy_output(it, res, out);
}
};
......@@ -926,7 +972,7 @@ template<typename T> DUCC0_NOINLINE void general_r2c(
util::thread_count(nthreads, in, axis, native_simd<T>::size()),
[&](Scheduler &sched) {
constexpr auto vlen = native_simd<T>::size();
auto storage = alloc_tmp<T,T>(in, len);
auto storage = alloc_tmp<T,T>(in, len, plan->bufsize());
multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
#ifndef DUCC0_NO_SIMD
if constexpr (vlen>1)
......@@ -1029,7 +1075,7 @@ template<typename T> DUCC0_NOINLINE void general_c2r(
util::thread_count(nthreads, in, axis, native_simd<T>::size()),
[&](Scheduler &sched) {
constexpr auto vlen = native_simd<T>::size();
auto storage = alloc_tmp<T,T>(out, len);
auto storage = alloc_tmp<T,T>(out, len, plan->bufsize());
multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
#ifndef DUCC0_NO_SIMD
if constexpr (vlen>1)
......@@ -1156,18 +1202,19 @@ struct ExecR2R
bool r2c, forward;
template <typename T0, typename T, typename Titer> DUCC0_NOINLINE void operator () (
const Titer &it, const fmav<T0> &in, fmav<T0> &out, T * buf,
const Titer &it, const fmav<T0> &in, fmav<T0> &out, T *buf,
const pocketfft_r<T0> &plan, T0 fct) const
{
copy_input(it, in, buf);
T *buf1=buf, *buf2=buf+plan.bufsize();
copy_input(it, in, buf2);
if ((!r2c) && forward)
for (size_t i=2; i<it.length_out(); i+=2)
buf[i] = -buf[i];
plan.exec(buf, fct, r2c);
buf2[i] = -buf2[i];
auto res = plan.exec(buf2, buf1, fct, r2c);
if (r2c && (!forward))
for (size_t i=2; i<it.length_out(); i+=2)
buf[i] = -buf[i];
copy_output(it, buf, out);
res[i] = -res[i];
copy_output(it, res, out);
}
};
......
/*
This file is part of pocketfft.
Copyright (C) 2010-2020 Max-Planck-Society
Copyright (C) 2010-2021 Max-Planck-Society
Copyright (C) 2019 Peter Bell
Authors: Martin Reinecke, Peter Bell
......@@ -672,11 +672,6 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
{ return ch[a+idl1*b]; };
aligned_array<Cmplx<T0>> wal(ip);
wal[0] = Cmplx<T0>(1., 0.);
for (size_t i=1; i<ip; ++i)
wal[i]=Cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
for (size_t k=0; k<l1; ++k)
for (size_t i=0; i<ido; ++i)
CH(i,k,0) = CC(i,0,k);
......@@ -697,10 +692,12 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
// j=0
for (size_t ik=0; ik<idl1; ++ik)
{
CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
auto wal = fwd ? csarr[ l].conj() : csarr[ l];
auto wal2 = fwd ? csarr[2*l].conj() : csarr[2*l];
CX2(ik,l ).r = CH2(ik,0).r+wal.r*CH2(ik,1).r+wal2.r*CH2(ik,2).r;
CX2(ik,l ).i = CH2(ik,0).i+wal.r*CH2(ik,1).i+wal2.r*CH2(ik,2).i;
CX2(ik,lc).r =-wal.i*CH2(ik,ip-1).i-wal2.i*CH2(ik,ip-2).i;
CX2(ik,lc).i = wal.i*CH2(ik,ip-1).r+wal2.i*CH2(ik,ip-2).r;
}
size_t iwal=2*l;
......@@ -708,9 +705,9 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
for (; j<ipph-1; j+=2, jc-=2)
{
iwal+=l; if (iwal>ip) iwal-=ip;
Cmplx<T0> xwal=wal[iwal];
Cmplx<T0> xwal=fwd ? csarr[iwal].conj() : csarr[iwal];
iwal+=l; if (iwal>ip) iwal-=ip;
Cmplx<T0> xwal2=wal[iwal];
Cmplx<T0> xwal2=fwd ? csarr[iwal].conj() : csarr[iwal];
for (size_t ik=0; ik<idl1; ++ik)
{
CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
......@@ -722,7 +719,7 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
for (; j<ipph; ++j, --jc)
{
iwal+=l; if (iwal>ip) iwal-=ip;
Cmplx<T0> xwal=wal[iwal];
Cmplx<T0> xwal=fwd ? csarr[iwal].conj() : csarr[iwal];
for (size_t ik=0; ik<idl1; ++ik)
{
CX2(ik,l).r += CH2(ik,j).r*xwal.r;
......@@ -761,12 +758,11 @@ template<bool fwd, typename T> void passg (size_t ido, size_t ip,
}
}
template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
template<bool fwd, typename T> T *pass_all(T c[], T ch[], T0 fct) const
{
if (length==1) { c[0]*=fct; return; }
if (length==1) { c[0]*=fct; return c; }
size_t l1=1;
aligned_array<T> ch(length);
T *p1=c, *p2=ch.data();
T *p1=c, *p2=ch;
for(size_t k1=0; k1<fact.size(); k1++)
{
......@@ -795,6 +791,17 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
std::swap(p1,p2);
l1=l2;
}
if (fct!=1.)
for (size_t i=0; i<length; ++i)
p1[i] *= fct;
return p1;
}
template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
{
if (length==1) { c[0]*=fct; return; }
aligned_array<T> ch(length);
auto p1 = pass_all<fwd>(c, ch.data(), T0(1));
if (p1!=c)
{
if (fct!=1.)
......@@ -812,6 +819,8 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
public:
template<typename T> void exec(T c[], T0 fct, bool fwd) const
{ fwd ? pass_all<true>(c, fct) : pass_all<false>(c, fct); }
template<typename T> T *exec(T c[], T ch[], T0 fct, bool fwd) const
{ return fwd ? pass_all<true>(c, ch, fct) : pass_all<false>(c, ch, fct); }
private:
DUCC0_NOINLINE void factorize()
......@@ -885,6 +894,8 @@ template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
mem.resize(twsize());
comp_twiddle();
}
size_t bufsize() const { return length; }
};
//
......@@ -1566,12 +1577,11 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
}
public:
template<typename T> void exec(T c[], T0 fct, bool r2hc) const
template<typename T> T *exec(T c[], T ch[], T0 fct, bool r2hc) const
{
if (length==1) { c[0]*=fct; return; }
if (length==1) { c[0]*=fct; return c; }
size_t n=length, nf=fact.size();
aligned_array<T> ch(n);
T *p1=c, *p2=ch.data();
T *p1=c, *p2=ch;
if (r2hc)
for(size_t k1=0, l1=n; k1<nf;++k1)
......@@ -1610,8 +1620,16 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
std::swap (p1,p2);
l1*=ip;
}
copy_and_norm(p1, p1, length, fct);
return p1;
}
copy_and_norm(c,p1,n,fct);
template<typename T> void exec(T c[], T0 fct, bool r2hc) const
{
if (length==1) { c[0]*=fct; return; }
aligned_array<T> ch(length);
auto p1 = exec(c, ch.data(), T0(1), r2hc);
copy_and_norm(c,p1,length,fct);
}
private:
......@@ -1694,6 +1712,8 @@ template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
mem.resize(twsize());
comp_twiddle();
}
size_t bufsize() const { return length; }
};
//
......@@ -1708,9 +1728,10 @@ template<typename T0> class fftblue
aligned_array<Cmplx<T0>> mem;
Cmplx<T0> *bk, *bkf;
template<bool fwd, typename T> void fft(Cmplx<T> c[], T0 fct) const
template<bool fwd, typename T> void fft(Cmplx<T> c[], Cmplx<T> buf[], T0 fct) const
{
aligned_array<Cmplx<T>> akf(n2);
auto akf = &buf[0];
auto akf2 = &buf[n2];
/* initialize a_k and FFT it */
for (size_t m=0; m<n; ++m)
......@@ -1719,24 +1740,30 @@ template<typename T0> class fftblue
for (size_t m=n; m<n2; ++m)
akf[m]=zero;
plan.exec (akf.data(),1.,true);
auto res = plan.exec (akf,akf2,T0(1),true);
/* do the convolution */
akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
res[0] = res[0].template special_mul<!fwd>(bkf[0]);
for (size_t m=1; m<(n2+1)/2; ++m)
{
akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
res[m] = res[m].template special_mul<!fwd>(bkf[m]);
res[n2-m] = res[n2-m].template special_mul<!fwd>(bkf[m]);
}
if ((n2&1)==0)
akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);
res[n2/2] = res[n2/2].template special_mul<!fwd>(bkf[n2/2]);
/* inverse FFT */
plan.exec (akf.data(),1.,false);
res = plan.exec (res, (res==akf) ? akf2 : akf, T0(1), false);
/* multiply by b_k */
for (size_t m=0; m<n; ++m)
c[m] = akf[m].template special_mul<fwd>(bk[m])*fct;
c[m] = res[m].template special_mul<fwd>(bk[m])*fct;
}
template<bool fwd, typename T> void fft(Cmplx<T> c[], T0 fct) const
{
aligned_array<Cmplx<T>> buf(2*n2);
fft<fwd>(c, buf.data(), fct);
}
public:
......@@ -1769,34 +1796,49 @@ template<typename T0> class fftblue
bkf[i] = tbkf[i];
}
size_t bufsize_c() const { return 2*n2; }
size_t bufsize_r() const { return 2*bufsize_c(); }
template<typename T> void exec(Cmplx<T> c[], T0 fct, bool fwd) const
{ fwd ? fft<true>(c,fct) : fft<false>(c,fct); }
template<typename T> Cmplx<T> *exec(Cmplx<T> c[], Cmplx<T> *buf, T0 fct, bool fwd) const
{
fwd ? fft<true>(c,buf,fct) : fft<false>(c,buf,fct);
return c;
}
template<typename T> void exec_r(T c[], T0 fct, bool fwd)
template<typename T> T *exec_r(T c[], T buf[], T0 fct, bool fwd)
{
aligned_array<Cmplx<T>> tmp(n);
auto tmp = reinterpret_cast<Cmplx<T> *>(&buf[2*n2]);
auto buf2 = reinterpret_cast<Cmplx<T> *>(&buf[0]);
if (fwd)
{
auto zero = T0(0)*c[0];
for (size_t m=0; m<n; ++m)
tmp[m].Set(c[m], zero);
fft<true>(tmp.data(),fct);
fft<true>(tmp,buf2,fct);
c[0] = tmp[0].r;
memcpy (reinterpret_cast<void *>(c+1),
reinterpret_cast<void *>(tmp.data()+1), (n-1)*sizeof(T));
memcpy (reinterpret_cast<void *>(&c[1]),
reinterpret_cast<void *>(&tmp[1]), (n-1)*sizeof(T));
}
else
{
tmp[0].Set(c[0],c[0]*0);
memcpy (reinterpret_cast<void *>(tmp.data()+1),
memcpy (reinterpret_cast<void *>(&tmp[1]),
reinterpret_cast<void *>(c+1), (n-1)*sizeof(T));
if ((n&1)==0) tmp[n/2].i=T0(0)*c[0];
for (size_t m=1; 2*m<n; ++m)
tmp[n-m].Set(tmp[m].r, -tmp[m].i);
fft<false>(tmp.data(),fct);
fft<false>(tmp,buf2,fct);
for (size_t m=0; m<n; ++m)
c[m] = tmp[m].r;
}
return c;
}
template<typename T> void exec_r(T c[], T0 fct, bool fwd)
{
aligned_array<T> buf(4*n2);
exec_r(c, buf.data(), fct, fwd);
}
};
......@@ -1833,8 +1875,12 @@ template<typename T0> class pocketfft_c
template<typename T> DUCC0_NOINLINE void exec(Cmplx<T> c[], T0 fct, bool fwd) const
{ packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); }
template<typename T> DUCC0_NOINLINE Cmplx<T> *exec(Cmplx<T> c[], Cmplx<T> buf[], T0 fct, bool fwd) const
{ return packplan ? packplan->exec(c,buf,fct,fwd) : blueplan->exec(c,buf,fct,fwd); }
size_t length() const { return len; }
size_t bufsize() const
{ return packplan ? packplan->bufsize() : blueplan->bufsize_c(); }