Commit 2fd77f20 authored by Martin Reinecke's avatar Martin Reinecke

synchronnize

parent 8d142ce6
......@@ -20,6 +20,9 @@
#include <stdexcept>
#include <memory>
#include <vector>
#if defined(_WIN32)
#include <malloc.h>
#endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
......@@ -38,6 +41,21 @@ using stride_t = vector<ptrdiff_t>;
namespace detail {
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=64;
#elif (defined(__AVX__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=32;
#elif (defined(__SSE2__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=16;
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif
template<typename T> struct arr
{
private:
......@@ -51,9 +69,12 @@ template<typename T> struct arr
void *res = malloc(num*sizeof(T));
if (!res) throw bad_alloc();
#else
#if 0
#if __cplusplus >= 201703L
void *res = aligned_alloc(64,num*sizeof(T));
if (!res) throw bad_alloc();
#elif defined(_WIN32)
void *res = _aligned_malloc(num*sizeof(T), 64);
if (!res) throw bad_alloc();
#else
void *res(nullptr);
if (posix_memalign(&res, 64, num*sizeof(T))!=0)
......@@ -172,7 +193,7 @@ template<typename T> class sincos_2pibyn
if (n==0) return;
res[0]=1.; res[1]=0.;
if (n==1) return;
size_t l1=(size_t)sqrt(n);
size_t l1 = size_t(sqrt(n));
arr<double> tmp(2*l1);
for (size_t i=1; i<l1; ++i)
{
......@@ -320,60 +341,71 @@ template<typename T> class sincos_2pibyn
{ return reinterpret_cast<const cmplx<T> *>(data.data()); }
};
NOINLINE size_t largest_prime_factor (size_t n)
struct util // hack to avoid duplicate symbols
{
size_t res=1;
while ((n&1)==0)
{ res=2; n>>=1; }
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
static NOINLINE size_t largest_prime_factor (size_t n)
{
res=x;
n/=x;
limit=size_t(sqrt(n+0.01));
}
if (n>1) res=n;
size_t res=1;
while ((n&1)==0)
{ res=2; n>>=1; }
return res;
}
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
{
res=x;
n/=x;
limit=size_t(sqrt(n+0.01));
}
if (n>1) res=n;
NOINLINE double cost_guess (size_t n)
{
const double lfp=1.1; // penalty for non-hardcoded larger factors
size_t ni=n;
double result=0.;
while ((n&1)==0)
{ result+=2; n>>=1; }
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
return res;
}
static NOINLINE double cost_guess (size_t n)
{
result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
n/=x;
limit=size_t(sqrt(n+0.01));
const double lfp=1.1; // penalty for non-hardcoded larger factors
size_t ni=n;
double result=0.;
while ((n&1)==0)
{ result+=2; n>>=1; }
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
{
result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
n/=x;
limit=size_t(sqrt(n+0.01));
}
if (n>1) result+=(n<=5) ? n : lfp*n;
return result*ni;
}
if (n>1) result+=(n<=5) ? n : lfp*n;
return result*ni;
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
static NOINLINE size_t good_size(size_t n)
{
if (n<=12) return n;
size_t bestfac=2*n;
for (size_t f2=1; f2<bestfac; f2*=2)
for (size_t f23=f2; f23<bestfac; f23*=3)
for (size_t f235=f23; f235<bestfac; f235*=5)
for (size_t f2357=f235; f2357<bestfac; f2357*=7)
for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
if (f235711>=n) bestfac=f235711;
return bestfac;
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
NOINLINE size_t good_size(size_t n)
{
if (n<=12) return n;
size_t bestfac=2*n;
for (size_t f2=1; f2<bestfac; f2*=2)
for (size_t f23=f2; f23<bestfac; f23*=3)
for (size_t f235=f23; f235<bestfac; f235*=5)
for (size_t f2357=f235; f2357<bestfac; f2357*=7)
for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
if (f235711>=n) bestfac=f235711;
return bestfac;
}
static size_t prod(const shape_t &shape)
{
size_t res=1;
for (auto sz: shape)
res*=sz;
return res;
}
};
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
......@@ -397,12 +429,12 @@ template<typename T0> class cfftp
size_t length, nfct;
arr<cmplx<T0>> mem;
fctdata fct[NFCT];
fctdata fact[NFCT];
void add_factor(size_t factor)
{
if (nfct>=NFCT) throw runtime_error("too many prime factors");
fct[nfct++].fct = factor;
fact[nfct++].fct = factor;
}
template<bool bwd, typename T> void pass2 (size_t ido, size_t l1,
......@@ -844,33 +876,33 @@ template<bool bwd, typename T> void passg (size_t ido, size_t ip,
#undef CX2
#undef CX
template<bool bwd, typename T> void pass_all(T c[], T0 fact)
template<bool bwd, typename T> void pass_all(T c[], T0 fct)
{
if (length==1) { c[0]*=fact; return; }
if (length==1) { c[0]*=fct; return; }
size_t l1=1;
arr<T> ch(length);
T *p1=c, *p2=ch.data();
for(size_t k1=0; k1<nfct; k1++)
{
size_t ip=fct[k1].fct;
size_t ip=fact[k1].fct;
size_t l2=ip*l1;
size_t ido = length/l2;
if (ip==4)
pass4<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass4<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==2)
pass2<bwd>(ido, l1, p1, p2, fct[k1].tw);
pass2<bwd>(ido, l1, p1, p2, fact[k1].tw);
else if(ip==3)
pass3<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass3<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==5)
pass5<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass5<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==7)
pass7<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass7<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==11)
pass11<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass11<bwd> (ido, l1, p1, p2, fact[k1].tw);
else
{
passg<bwd>(ido, ip, l1, p1, p2, fct[k1].tw, fct[k1].tws);
passg<bwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
swap(p1,p2);
}
swap(p1,p2);
......@@ -878,16 +910,16 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
}
if (p1!=c)
{
if (fact!=1.)
if (fct!=1.)
for (size_t i=0; i<length; ++i)
c[i] = ch[i]*fact;
c[i] = ch[i]*fct;
else
memcpy (c,p1,length*sizeof(T));
}
else
if (fact!=1.)
if (fct!=1.)
for (size_t i=0; i<length; ++i)
c[i] *= fact;
c[i] *= fct;
}
#undef WA
......@@ -913,9 +945,9 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
len>>=1;
// factor 2 should be at the front of the factor list
add_factor(2);
swap(fct[0].fct, fct[nfct-1].fct);
swap(fact[0].fct, fact[nfct-1].fct);
}
size_t maxl=(size_t)(sqrt((double)len))+1;
size_t maxl = size_t(sqrt(double(len)))+1;
for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2)
if ((len%divisor)==0)
{
......@@ -924,7 +956,7 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
add_factor(divisor);
len/=divisor;
}
maxl=size_t(sqrt((double)len))+1;
maxl=size_t(sqrt(double(len)))+1;
}
if (len>1) add_factor(len);
}
......@@ -934,7 +966,7 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
size_t twsize=0, l1=1;
for (size_t k=0; k<nfct; ++k)
{
size_t ip=fct[k].fct, ido= length/(l1*ip);
size_t ip=fact[k].fct, ido= length/(l1*ip);
twsize+=(ip-1)*(ido-1);
if (ip>11)
twsize+=ip;
......@@ -951,18 +983,18 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
size_t memofs=0;
for (size_t k=0; k<nfct; ++k)
{
size_t ip=fct[k].fct, ido=length/(l1*ip);
fct[k].tw=mem.data()+memofs;
size_t ip=fact[k].fct, ido=length/(l1*ip);
fact[k].tw=mem.data()+memofs;
memofs+=(ip-1)*(ido-1);
for (size_t j=1; j<ip; ++j)
for (size_t i=1; i<ido; ++i)
fct[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
if (ip>11)
{
fct[k].tws=mem.data()+memofs;
fact[k].tws=mem.data()+memofs;
memofs+=ip;
for (size_t j=0; j<ip; ++j)
fct[k].tws[j] = twiddle[j*l1*ido];
fact[k].tws[j] = twiddle[j*l1*ido];
}
l1*=ip;
}
......@@ -995,12 +1027,12 @@ template<typename T0> class rfftp
size_t length, nfct;
arr<T0> mem;
fctdata fct[NFCT];
fctdata fact[NFCT];
void add_factor(size_t factor)
{
if (nfct>=NFCT) throw runtime_error("too many prime factors");
fct[nfct++].fct = factor;
fact[nfct++].fct = factor;
}
#define WA(x,i) wa[(i)+(x)*(ido-1)]
......@@ -1632,9 +1664,9 @@ template<typename T> void copy_and_norm(T *c, T *p1, size_t n, T0 fct)
public:
template<typename T> void forward(T c[], T0 fact)
template<typename T> void forward(T c[], T0 fct)
{
if (length==1) { c[0]*=fact; return; }
if (length==1) { c[0]*=fct; return; }
size_t n=length;
size_t l1=n, nf=nfct;
arr<T> ch(n);
......@@ -1643,30 +1675,30 @@ template<typename T> void forward(T c[], T0 fact)
for(size_t k1=0; k1<nf;++k1)
{
size_t k=nf-k1-1;
size_t ip=fct[k].fct;
size_t ip=fact[k].fct;
size_t ido=n / l1;
l1 /= ip;
if(ip==4)
radf4(ido, l1, p1, p2, fct[k].tw);
radf4(ido, l1, p1, p2, fact[k].tw);
else if(ip==2)
radf2(ido, l1, p1, p2, fct[k].tw);
radf2(ido, l1, p1, p2, fact[k].tw);
else if(ip==3)
radf3(ido, l1, p1, p2, fct[k].tw);
radf3(ido, l1, p1, p2, fact[k].tw);
else if(ip==5)
radf5(ido, l1, p1, p2, fct[k].tw);
radf5(ido, l1, p1, p2, fact[k].tw);
else
{
radfg(ido, ip, l1, p1, p2, fct[k].tw, fct[k].tws);
radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
swap (p1,p2);
}
swap (p1,p2);
}
copy_and_norm(c,p1,n,fact);
copy_and_norm(c,p1,n,fct);
}
template<typename T> void backward(T c[], T0 fact)
template<typename T> void backward(T c[], T0 fct)
{
if (length==1) { c[0]*=fact; return; }
if (length==1) { c[0]*=fct; return; }
size_t n=length;
size_t l1=1, nf=nfct;
arr<T> ch(n);
......@@ -1674,22 +1706,22 @@ template<typename T> void backward(T c[], T0 fact)
for(size_t k=0; k<nf; k++)
{
size_t ip = fct[k].fct,
size_t ip = fact[k].fct,
ido= n/(ip*l1);
if(ip==4)
radb4(ido, l1, p1, p2, fct[k].tw);
radb4(ido, l1, p1, p2, fact[k].tw);
else if(ip==2)
radb2(ido, l1, p1, p2, fct[k].tw);
radb2(ido, l1, p1, p2, fact[k].tw);
else if(ip==3)
radb3(ido, l1, p1, p2, fct[k].tw);
radb3(ido, l1, p1, p2, fact[k].tw);
else if(ip==5)
radb5(ido, l1, p1, p2, fct[k].tw);
radb5(ido, l1, p1, p2, fact[k].tw);
else
radbg(ido, ip, l1, p1, p2, fct[k].tw, fct[k].tws);
radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
swap (p1,p2);
l1*=ip;
}
copy_and_norm(c,p1,n,fact);
copy_and_norm(c,p1,n,fct);
}
private:
......@@ -1705,9 +1737,9 @@ void factorize()
len>>=1;
// factor 2 should be at the front of the factor list
add_factor(2);
swap(fct[0].fct, fct[nfct-1].fct);
swap(fact[0].fct, fact[nfct-1].fct);
}
size_t maxl=(size_t)(sqrt((double)len))+1;
size_t maxl = size_t(sqrt(double(len)))+1;
for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2)
if ((len%divisor)==0)
{
......@@ -1716,7 +1748,7 @@ void factorize()
add_factor(divisor);
len/=divisor;
}
maxl=(size_t)(sqrt((double)len))+1;
maxl=size_t(sqrt(double(len)))+1;
}
if (len>1) add_factor(len);
}
......@@ -1726,7 +1758,7 @@ size_t twsize() const
size_t twsz=0, l1=1;
for (size_t k=0; k<nfct; ++k)
{
size_t ip=fct[k].fct, ido=length/(l1*ip);
size_t ip=fact[k].fct, ido=length/(l1*ip);
twsz+=(ip-1)*(ido-1);
if (ip>5) twsz+=2*ip;
l1*=ip;
......@@ -1741,28 +1773,28 @@ void comp_twiddle()
T0 *ptr=mem.data();
for (size_t k=0; k<nfct; ++k)
{
size_t ip=fct[k].fct, ido=length/(l1*ip);
size_t ip=fact[k].fct, ido=length/(l1*ip);
if (k<nfct-1) // last factor doesn't need twiddles
{
fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
for (size_t j=1; j<ip; ++j)
for (size_t i=1; i<=(ido-1)/2; ++i)
{
fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
}
}
if (ip>5) // special factors required by *g functions
{
fct[k].tws=ptr; ptr+=2*ip;
fct[k].tws[0] = 1.;
fct[k].tws[1] = 0.;
fact[k].tws=ptr; ptr+=2*ip;
fact[k].tws[0] = 1.;
fact[k].tws[1] = 0.;
for (size_t i=1; i<=(ip>>1); ++i)
{
fct[k].tws[2*i ] = twid[2*i*(length/ip)];
fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)];
fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
fact[k].tws[2*i ] = twid[2*i*(length/ip)];
fact[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
fact[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)];
fact[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
}
}
l1*=ip;
......@@ -1822,7 +1854,7 @@ template<typename T0> class fftblue
public:
NOINLINE fftblue(size_t length)
: n(length), n2(good_size(n*2-1)), plan(n2), mem(n+n2),
: n(length), n2(util::good_size(n*2-1)), plan(n2), mem(n+n2),
bk(mem.data()), bkf(mem.data()+n)
{
/* initialize b_k */
......@@ -1858,7 +1890,8 @@ template<typename T0> class fftblue
{
arr<cmplx<T>> tmp(n);
tmp[0].Set(c[0],c[0]*0);
memcpy ((void *)(tmp.data()+1),(void *)(c+1), (n-1)*sizeof(T));
memcpy (reinterpret_cast<void *>(tmp.data()+1),
reinterpret_cast<void *>(c+1), (n-1)*sizeof(T));
if ((n&1)==0) tmp[n/2].i=c[0]*0.;
for (size_t m=1; 2*m<n; ++m)
tmp[n-m].Set(tmp[m].r, -tmp[m].i);
......@@ -1894,13 +1927,13 @@ template<typename T0> class pocketfft_c
: len(length)
{
if (length==0) throw runtime_error("zero-length FFT requested");
if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length)))
{
packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
return;
}
double comp1 = cost_guess(length);
double comp2 = 2*cost_guess(good_size(2*length-1));
double comp1 = util::cost_guess(length);
double comp2 = 2*util::cost_guess(util::good_size(2*length-1));
comp2*=1.5; /* fudge factor that appears to give good overall performance */
if (comp2<comp1) // use Bluestein
blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
......@@ -1908,16 +1941,16 @@ template<typename T0> class pocketfft_c
packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
}
template<typename T> NOINLINE void backward(T c[], T0 fct)
template<typename T> NOINLINE void backward(cmplx<T> c[], T0 fct)
{
packplan ? packplan->backward((cmplx<T> *)c,fct)
: blueplan->backward((cmplx<T> *)c,fct);
packplan ? packplan->backward(c,fct)
: blueplan->backward(c,fct);
}
template<typename T> NOINLINE void forward(T c[], T0 fct)
template<typename T> NOINLINE void forward(cmplx<T> c[], T0 fct)
{
packplan ? packplan->forward((cmplx<T> *)c,fct)
: blueplan->forward((cmplx<T> *)c,fct);
packplan ? packplan->forward(c,fct)
: blueplan->forward(c,fct);
}
size_t length() const { return len; }
......@@ -1939,13 +1972,13 @@ template<typename T0> class pocketfft_r
: len(length)
{
if (length==0) throw runtime_error("zero-length FFT requested");
if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length)))
{
packplan=unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
return;
}
double comp1 = 0.5*cost_guess(length);
double comp2 = 2*cost_guess(good_size(2*length-1));
double comp1 = 0.5*util::cost_guess(length);
double comp2 = 2*util::cost_guess(util::good_size(2*length-1));
comp2*=1.5; /* fudge factor that appears to give good overall performance */
if (comp2<comp1) // use Bluestein
blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
......@@ -1972,14 +2005,6 @@ template<typename T0> class pocketfft_r
// multi-D infrastructure
//
size_t prod(const shape_t &shape)
{
size_t res=1;
for (auto sz: shape)
res*=sz;
return res;
}
template<typename T> class ndarr
{
private:
......@@ -1996,7 +2021,7 @@ template<typename T> class ndarr
: d(reinterpret_cast<char *>(data_)),
cd(reinterpret_cast<const char *>(data_)), shp(shape_), str(stride_) {}
size_t ndim() const { return shp.size(); }
size_t size() const { return prod(shp); }
size_t size() const { return util::prod(shp); }
const shape_t &shape() const { return shp; }
size_t shape(size_t i) const { return shp[i]; }
const stride_t &stride() const { return str; }
......@@ -2067,20 +2092,6 @@ template<size_t N, typename Ti, typename To> class multi_iter
bool contiguous_out() const { return str_o==sizeof(To); }
};
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=64;
#elif (defined(__AVX__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=32;
#elif (defined(__SSE2__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=16;
#endif
#endif
#if defined(HAVE_VECSUPPORT)
template<typename T> struct VTYPE{};
template<> struct VTYPE<double>
......@@ -2104,14 +2115,14 @@ template<> struct VTYPE<float>
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
size_t axsize, size_t elemsize)
{
auto othersize = prod(shape)/axsize;
auto othersize = util::prod(shape)/axsize;
auto tmpsize = axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1);
return arr<char>(tmpsize*elemsize);
}
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
const shape_t &axes, size_t elemsize)
{
size_t fullsize=prod(shape);
size_t fullsize=util::prod(shape);