Commit cef9199e authored by Martin Reinecke's avatar Martin Reinecke
Browse files

Merge branch 'develop' into 'master'

Develop

See merge request !3
parents 1307970b 39de2274
......@@ -10,16 +10,18 @@
* \author Martin Reinecke
*/
#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <memory>
#include <vector>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#if defined(_WIN32)
#include <malloc.h>
#endif
#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
......@@ -29,11 +31,32 @@
#define restrict
#endif
namespace pocketfft_private {
namespace pocketfft {
using shape_t = std::vector<size_t>;
using stride_t = std::vector<ptrdiff_t>;
namespace detail {
using namespace std;
template<typename T> struct arr
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=64;
#elif (defined(__AVX__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=32;
#elif (defined(__SSE2__))
#define HAVE_VECSUPPORT
constexpr int VBYTELEN=16;
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif
template<typename T> class arr
{
private:
T *p;
......@@ -41,26 +64,31 @@ template<typename T> struct arr
static T *ralloc(size_t num)
{
#if 0
return (T *)aligned_alloc(64,num*sizeof(T));
if (num==0) return nullptr;
#ifdef POCKETFFT_NO_VECTORS
void *res = malloc(num*sizeof(T));
if (!res) throw bad_alloc();
#else
#if __cplusplus >= 201703L
void *res = aligned_alloc(64,num*sizeof(T));
if (!res) throw bad_alloc();
#elif defined(_WIN32)
void *res = _aligned_malloc(num*sizeof(T), 64);
if (!res) throw bad_alloc();
#else
void *res(nullptr);
if (num>0)
if (posix_memalign(&res, 64, num*sizeof(T))!=0)
throw bad_alloc();
return reinterpret_cast<T *>(res);
if (posix_memalign(&res, 64, num*sizeof(T))!=0)
throw bad_alloc();
#endif
#endif
return reinterpret_cast<T *>(res);
}
static void dealloc(T *ptr)
{ free(ptr); }
public:
arr() : p(0), sz(0) {}
arr(size_t n) : p(ralloc(n)), sz(n)
{
if ((!p) && (n!=0))
throw bad_alloc();
}
arr(size_t n) : p(ralloc(n)), sz(n) {}
arr(arr &&other)
: p(other.p), sz(other.sz)
{ other.p=nullptr; other.sz=0; }
......@@ -165,7 +193,7 @@ template<typename T> class sincos_2pibyn
if (n==0) return;
res[0]=1.; res[1]=0.;
if (n==1) return;
size_t l1=(size_t)sqrt(n);
size_t l1 = size_t(sqrt(n));
arr<double> tmp(2*l1);
for (size_t i=1; i<l1; ++i)
{
......@@ -192,7 +220,7 @@ template<typename T> class sincos_2pibyn
}
}
NOINLINE void calc_first_quadrant(size_t n, T * restrict res)
void calc_first_quadrant(size_t n, T * restrict res)
{
T * restrict p = res+n;
calc_first_octant(n<<1, p);
......@@ -200,56 +228,37 @@ template<typename T> class sincos_2pibyn
size_t i=0, idx1=0, idx2=2*ndone-2;
for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
{
res[idx1] = p[2*i];
res[idx1+1] = p[2*i+1];
res[idx2] = p[2*i+3];
res[idx2+1] = p[2*i+2];
res[idx1] = p[2*i ]; res[idx1+1] = p[2*i+1];
res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
}
if (i!=ndone)
{
res[idx1 ] = p[2*i];
res[idx1+1] = p[2*i+1];
}
{ res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
}
NOINLINE void calc_first_half(size_t n, T * restrict res)
void calc_first_half(size_t n, T * restrict res)
{
int ndone=(n+1)>>1;
T * p = res+n-1;
calc_first_octant(n<<2, p);
int i4=0, in=n, i=0;
for (; i4<=in-i4; ++i, i4+=4) // octant 0
{
res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
}
{ res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
for (; i4-in <= 0; ++i, i4+=4) // octant 1
{
int xm = in-i4;
res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
}
{ int xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
{
int xm = i4-in;
res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
}
{ int xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
for (; i<ndone; ++i, i4+=4) // octant 3
{
int xm = 2*in-i4;
res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
}
{ int xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
}
NOINLINE void fill_first_quadrant(size_t n, T * restrict res)
void fill_first_quadrant(size_t n, T * restrict res)
{
const double hsqt2 = 0.707106781186547524400844362104849;
size_t quart = n>>2;
if ((n&7)==0)
res[quart] = res[quart+1] = hsqt2;
for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
{
res[j ] = res[i+1];
res[j+1] = res[i ];
}
{ res[j] = res[i+1]; res[j+1] = res[i]; }
}
NOINLINE void fill_first_half(size_t n, T * restrict res)
......@@ -257,29 +266,20 @@ template<typename T> class sincos_2pibyn
size_t half = n>>1;
if ((n&3)==0)
for (size_t i=0; i<half; i+=2)
{
res[i+half] = -res[i+1];
res[i+half+1] = res[i ];
}
{ res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
else
for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
{
res[j ] = -res[i ];
res[j+1] = res[i+1];
}
{ res[j] = -res[i]; res[j+1] = res[i+1]; }
}
NOINLINE void fill_second_half(size_t n, T * restrict res)
void fill_second_half(size_t n, T * restrict res)
{
if ((n&1)==0)
for (size_t i=0; i<n; ++i)
res[i+n] = -res[i];
else
for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
{
res[j ] = res[i ];
res[j+1] = -res[i+1];
}
{ res[j] = res[i]; res[j+1] = -res[i+1]; }
}
NOINLINE void sincos_2pibyn_half(size_t n, T * restrict res)
......@@ -313,67 +313,76 @@ template<typename T> class sincos_2pibyn
{ return reinterpret_cast<const cmplx<T> *>(data.data()); }
};
NOINLINE size_t largest_prime_factor (size_t n)
struct util // hack to avoid duplicate symbols
{
size_t res=1;
while ((n&1)==0)
{ res=2; n>>=1; }
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
static NOINLINE size_t largest_prime_factor (size_t n)
{
res=x;
n/=x;
limit=size_t(sqrt(n+0.01));
}
if (n>1) res=n;
size_t res=1;
while ((n&1)==0)
{ res=2; n>>=1; }
return res;
}
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
{
res=x;
n/=x;
limit=size_t(sqrt(n+0.01));
}
if (n>1) res=n;
NOINLINE double cost_guess (size_t n)
{
const double lfp=1.1; // penalty for non-hardcoded larger factors
size_t ni=n;
double result=0.;
while ((n&1)==0)
{ result+=2; n>>=1; }
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
return res;
}
static NOINLINE double cost_guess (size_t n)
{
result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
n/=x;
limit=size_t(sqrt(n+0.01));
const double lfp=1.1; // penalty for non-hardcoded larger factors
size_t ni=n;
double result=0.;
while ((n&1)==0)
{ result+=2; n>>=1; }
size_t limit=size_t(sqrt(n+0.01));
for (size_t x=3; x<=limit; x+=2)
while ((n/x)*x==n)
{
result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
n/=x;
limit=size_t(sqrt(n+0.01));
}
if (n>1) result+=(n<=5) ? n : lfp*n;
return result*ni;
}
if (n>1) result+=(n<=5) ? n : lfp*n;
return result*ni;
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
static NOINLINE size_t good_size(size_t n)
{
if (n<=12) return n;
size_t bestfac=2*n;
for (size_t f2=1; f2<bestfac; f2*=2)
for (size_t f23=f2; f23<bestfac; f23*=3)
for (size_t f235=f23; f235<bestfac; f235*=5)
for (size_t f2357=f235; f2357<bestfac; f2357*=7)
for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
if (f235711>=n) bestfac=f235711;
return bestfac;
}
/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
NOINLINE size_t good_size(size_t n)
{
if (n<=12) return n;
size_t bestfac=2*n;
for (size_t f2=1; f2<bestfac; f2*=2)
for (size_t f23=f2; f23<bestfac; f23*=3)
for (size_t f235=f23; f235<bestfac; f235*=5)
for (size_t f2357=f235; f2357<bestfac; f2357*=7)
for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
if (f235711>=n) bestfac=f235711;
return bestfac;
}
static size_t prod(const shape_t &shape)
{
size_t res=1;
for (auto sz: shape)
res*=sz;
return res;
}
};
#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
constexpr size_t NFCT=25;
//
// complex FFTPACK transforms
//
......@@ -388,17 +397,14 @@ template<typename T0> class cfftp
cmplx<T0> *tw, *tws;
};
size_t length, nfct;
size_t length;
arr<cmplx<T0>> mem;
fctdata fct[NFCT];
vector<fctdata> fact;
void add_factor(size_t factor)
{
if (nfct>=NFCT) throw runtime_error("too many prime factors");
fct[nfct++].fct = factor;
}
{ fact.push_back({factor, nullptr, nullptr}); }
template<bool bwd, typename T> NOINLINE void pass2 (size_t ido, size_t l1,
template<bool bwd, typename T> void pass2 (size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa)
{
constexpr size_t cdim=2;
......@@ -442,7 +448,7 @@ template<bool bwd, typename T> NOINLINE void pass2 (size_t ido, size_t l1,
CH(i,k,u1) = da.template special_mul<bwd>(WA(u1-1,i)); \
CH(i,k,u2) = db.template special_mul<bwd>(WA(u2-1,i)); \
}
template<bool bwd, typename T> NOINLINE void pass3 (size_t ido, size_t l1,
template<bool bwd, typename T> void pass3 (size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa)
{
constexpr size_t cdim=3;
......@@ -473,7 +479,7 @@ template<bool bwd, typename T> NOINLINE void pass3 (size_t ido, size_t l1,
#undef PARTSTEP3a
#undef PREP3
template<bool bwd, typename T> NOINLINE void pass4 (size_t ido, size_t l1,
template<bool bwd, typename T> void pass4 (size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa)
{
constexpr size_t cdim=4;
......@@ -544,7 +550,7 @@ template<bool bwd, typename T> NOINLINE void pass4 (size_t ido, size_t l1,
CH(i,k,u1) = da.template special_mul<bwd>(WA(u1-1,i)); \
CH(i,k,u2) = db.template special_mul<bwd>(WA(u2-1,i)); \
}
template<bool bwd, typename T> NOINLINE void pass5 (size_t ido, size_t l1,
template<bool bwd, typename T> void pass5 (size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa)
{
constexpr size_t cdim=5;
......@@ -608,7 +614,7 @@ template<bool bwd, typename T> NOINLINE void pass5 (size_t ido, size_t l1,
CH(i,k,u2) = db.template special_mul<bwd>(WA(u2-1,i)); \
}
template<bool bwd, typename T> NOINLINE void pass7(size_t ido, size_t l1,
template<bool bwd, typename T> void pass7(size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa)
{
constexpr size_t cdim=7;
......@@ -679,7 +685,7 @@ template<bool bwd, typename T> NOINLINE void pass7(size_t ido, size_t l1,
CH(i,k,u2) = db.template special_mul<bwd>(WA(u2-1,i)); \
}
template<bool bwd, typename T> NOINLINE void pass11 (size_t ido, size_t l1,
template<bool bwd, typename T> void pass11 (size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa)
{
const size_t cdim=11;
......@@ -736,7 +742,7 @@ template<bool bwd, typename T> NOINLINE void pass11 (size_t ido, size_t l1,
#define CX2(a,b) cc[(a)+idl1*(b)]
#define CH2(a,b) ch[(a)+idl1*(b)]
template<bool bwd, typename T> NOINLINE void passg (size_t ido, size_t ip,
template<bool bwd, typename T> void passg (size_t ido, size_t ip,
size_t l1, T * restrict cc, T * restrict ch, const cmplx<T0> * restrict wa,
const cmplx<T0> * restrict csarr)
{
......@@ -837,33 +843,33 @@ template<bool bwd, typename T> NOINLINE void passg (size_t ido, size_t ip,
#undef CX2
#undef CX
template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
template<bool bwd, typename T> void pass_all(T c[], T0 fct)
{
if (length==1) { c[0]*=fact; return; }
if (length==1) { c[0]*=fct; return; }
size_t l1=1;
arr<T> ch(length);
T *p1=c, *p2=ch.data();
for(size_t k1=0; k1<nfct; k1++)
for(size_t k1=0; k1<fact.size(); k1++)
{
size_t ip=fct[k1].fct;
size_t ip=fact[k1].fct;
size_t l2=ip*l1;
size_t ido = length/l2;
if (ip==4)
pass4<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass4<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==2)
pass2<bwd>(ido, l1, p1, p2, fct[k1].tw);
pass2<bwd>(ido, l1, p1, p2, fact[k1].tw);
else if(ip==3)
pass3<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass3<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==5)
pass5<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass5<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==7)
pass7<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass7<bwd> (ido, l1, p1, p2, fact[k1].tw);
else if(ip==11)
pass11<bwd> (ido, l1, p1, p2, fct[k1].tw);
pass11<bwd> (ido, l1, p1, p2, fact[k1].tw);
else
{
passg<bwd>(ido, ip, l1, p1, p2, fct[k1].tw, fct[k1].tws);
passg<bwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
swap(p1,p2);
}
swap(p1,p2);
......@@ -871,16 +877,16 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
}
if (p1!=c)
{
if (fact!=1.)
if (fct!=1.)
for (size_t i=0; i<length; ++i)
c[i] = ch[i]*fact;
c[i] = ch[i]*fct;
else
memcpy (c,p1,length*sizeof(T));
}
else
if (fact!=1.)
if (fct!=1.)
for (size_t i=0; i<length; ++i)
c[i] *= fact;
c[i] *= fct;
}
#undef WA
......@@ -897,7 +903,6 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
private:
NOINLINE void factorize()
{
nfct=0;
size_t len=length;
while ((len&3)==0)
{ add_factor(4); len>>=2; }
......@@ -906,9 +911,9 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
len>>=1;
// factor 2 should be at the front of the factor list
add_factor(2);
swap(fct[0].fct, fct[nfct-1].fct);
swap(fact[0].fct, fact.back().fct);
}
size_t maxl=(size_t)(sqrt((double)len))+1;
size_t maxl = size_t(sqrt(double(len)))+1;
for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2)
if ((len%divisor)==0)
{
......@@ -917,17 +922,17 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
add_factor(divisor);
len/=divisor;
}
maxl=size_t(sqrt((double)len))+1;
maxl=size_t(sqrt(double(len)))+1;
}
if (len>1) add_factor(len);
}
NOINLINE size_t twsize() const
size_t twsize() const
{
size_t twsize=0, l1=1;
for (size_t k=0; k<nfct; ++k)
for (size_t k=0; k<fact.size(); ++k)
{
size_t ip=fct[k].fct, ido= length/(l1*ip);
size_t ip=fact[k].fct, ido= length/(l1*ip);
twsize+=(ip-1)*(ido-1);
if (ip>11)
twsize+=ip;
......@@ -936,26 +941,26 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
return twsize;
}
NOINLINE void comp_twiddle()
void comp_twiddle()
{
sincos_2pibyn<T0> twid(length, false);
auto twiddle = twid.cdata();
size_t l1=1;
size_t memofs=0;
for (size_t k=0; k<nfct; ++k)
for (size_t k=0; k<fact.size(); ++k)
{
size_t ip=fct[k].fct, ido=length/(l1*ip);
fct[k].tw=mem.data()+memofs;
size_t ip=fact[k].fct, ido=length/(l1*ip);
fact[k].tw=mem.data()+memofs;
memofs+=(ip-1)*(ido-1);
for (size_t j=1; j<ip; ++j)
for (size_t i=1; i<ido; ++i)
fct[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
if (ip>11)
{
fct[k].tws=mem.data()+memofs;
fact[k].tws=mem.data()+memofs;
memofs+=ip;
for (size_t j=0; j<ip; ++j)
fct[k].tws[j] = twiddle[j*l1*ido];
fact[k].tws[j] = twiddle[j*l1*ido];
}
l1*=ip;
}
......@@ -963,7 +968,7 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
public:
NOINLINE cfftp(size_t length_)
: length(length_), nfct(0)
: length(length_)
{
if (length==0) throw runtime_error("zero length FFT requested");
if (length==1) return;
......@@ -986,15 +991,12 @@ template<typename T0> class rfftp
T0 *tw, *tws;
};
size_t length, nfct;
size_t length;
arr<T0> mem;
fctdata fct[NFCT];
vector<fctdata> fact;
void add_factor(size_t factor)
{
if (nfct>=NFCT) throw runtime_error("too many prime factors");
fct[nfct++].fct = factor;
}
{ fact.push_back({factor, nullptr, nullptr}); }
#define WA(x,i) wa[(i)+(x)*(ido-1)]
#define PM(a,b,c,d) { a=c+d; b=c-d; }
......@@ -1004,7 +1006,7 @@ template<typename T0> class rfftp
#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
template<typename T> NOINLINE void radf2 (size_t ido, size_t l1,
template<typename T> void radf2 (size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const T0 * restrict wa)
{
constexpr size_t cdim=2;
......@@ -1029,7 +1031,7 @@ template<typename T> NOINLINE void radf2 (size_t ido, size_t l1,
}
}
template<typename T> NOINLINE void radf3(size_t ido, size_t l1,
template<typename T> void radf3(size_t ido, size_t l1,
const T * restrict cc, T * restrict ch, const T0 * restrict wa)
{
constexpr size_t cdim=3;
......@@ -1063,7 +1065,7 @@ template<typename T> NOINLINE void radf3(size_t ido, size_t l1,
}
}
template<typename T> NOINLINE void radf4(size_t ido, size_t l1,
template<typename T> void radf4(size_t ido, size_t l1,