/* * This file is part of pocketfft. * Licensed under a 3-clause BSD style license - see LICENSE.md */ /* * Main implementation file. * * Copyright (C) 2004-2019 Max-Planck-Society * \author Martin Reinecke */ #ifndef POCKETFFT_HDRONLY_H #define POCKETFFT_HDRONLY_H #include #include #include #include #include #include #include #if defined(_WIN32) #include #endif #ifdef __GNUC__ #define NOINLINE __attribute__((noinline)) #define restrict __restrict__ #else #define NOINLINE #define restrict #endif namespace pocketfft { using shape_t = std::vector; using stride_t = std::vector; namespace detail { using namespace std; #ifndef POCKETFFT_NO_VECTORS #if (defined(__AVX512F__)) #define HAVE_VECSUPPORT constexpr int VBYTELEN=64; #elif (defined(__AVX__)) #define HAVE_VECSUPPORT constexpr int VBYTELEN=32; #elif (defined(__SSE2__)) #define HAVE_VECSUPPORT constexpr int VBYTELEN=16; #else #define POCKETFFT_NO_VECTORS #endif #endif template class arr { private: T *p; size_t sz; static T *ralloc(size_t num) { if (num==0) return nullptr; #ifdef POCKETFFT_NO_VECTORS void *res = malloc(num*sizeof(T)); if (!res) throw bad_alloc(); #else #if __cplusplus >= 201703L void *res = aligned_alloc(64,num*sizeof(T)); if (!res) throw bad_alloc(); #elif defined(_WIN32) void *res = _aligned_malloc(num*sizeof(T), 64); if (!res) throw bad_alloc(); #else void *res(nullptr); if (posix_memalign(&res, 64, num*sizeof(T))!=0) throw bad_alloc(); #endif #endif return reinterpret_cast(res); } static void dealloc(T *ptr) { free(ptr); } public: arr() : p(0), sz(0) {} arr(size_t n) : p(ralloc(n)), sz(n) {} arr(arr &&other) : p(other.p), sz(other.sz) { other.p=nullptr; other.sz=0; } ~arr() { dealloc(p); } void resize(size_t n) { if (n==sz) return; dealloc(p); p = ralloc(n); sz = n; } T &operator[](size_t idx) { return p[idx]; } const T &operator[](size_t idx) const { return p[idx]; } T *data() { return p; } const T *data() const { return p; } size_t size() const { return sz; } }; template struct cmplx { T r, i; cmplx() {} cmplx(T r_, T i_) : r(r_), i(i_) {} void Set(T r_, T i_) { r=r_; i=i_; } void Set(T r_) { r=r_; i=T(0); } cmplx &operator+= (const cmplx &other) { r+=other.r; i+=other.i; return *this; } templatecmplx &operator*= (T2 other) { r*=other; i*=other; return *this; } cmplx operator+ (const cmplx &other) const { return cmplx(r+other.r, i+other.i); } cmplx operator- (const cmplx &other) const { return cmplx(r-other.r, i-other.i); } template auto operator* (const T2 &other) const -> cmplx { return {r*other, i*other}; } template auto operator* (const cmplx &other) const -> cmplx { return {r*other.r-i*other.i, r*other.i + i*other.r}; } template auto special_mul (const cmplx &other) const -> cmplx { return bwd ? cmplx(r*other.r-i*other.i, r*other.i + i*other.r) : cmplx(r*other.r+i*other.i, i*other.r - r*other.i); } }; template void PMC(cmplx &a, cmplx &b, const cmplx &c, const cmplx &d) { a = c+d; b = c-d; } template cmplx conj(const cmplx &a) { return {a.r, -a.i}; } template void ROT90(cmplx &a) { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } template void ROTM90(cmplx &a) { auto tmp_=-a.r; a.r=a.i; a.i=tmp_; } // // twiddle factor section // template class sincos_2pibyn { private: arr data; // adapted from https://stackoverflow.com/questions/42792939/ // CAUTION: this function only works for arguments in the range // [-0.25; 0.25]! void my_sincosm1pi (double a, double *restrict res) { double s = a * a; /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ double r = -1.0369917389758117e-4; r = fma (r, s, 1.9294935641298806e-3); r = fma (r, s, -2.5806887942825395e-2); r = fma (r, s, 2.3533063028328211e-1); r = fma (r, s, -1.3352627688538006e+0); r = fma (r, s, 4.0587121264167623e+0); r = fma (r, s, -4.9348022005446790e+0); double c = r*s; /* Approximate sin(pi*x) for x in [-0.25,0.25] */ r = 4.6151442520157035e-4; r = fma (r, s, -7.3700183130883555e-3); r = fma (r, s, 8.2145868949323936e-2); r = fma (r, s, -5.9926452893214921e-1); r = fma (r, s, 2.5501640398732688e+0); r = fma (r, s, -5.1677127800499516e+0); s = s * a; r = r * s; s = fma (a, 3.1415926535897931e+0, r); res[0] = c; res[1] = s; } NOINLINE void calc_first_octant(size_t den, T * restrict res) { size_t n = (den+4)>>3; if (n==0) return; res[0]=1.; res[1]=0.; if (n==1) return; size_t l1 = size_t(sqrt(n)); arr tmp(2*l1); for (size_t i=1; in) end = n-start; for (size_t i=1; i>2; size_t i=0, idx1=0, idx2=2*ndone-2; for (; i+1>1; T * p = res+n-1; calc_first_octant(n<<2, p); int i4=0, in=n, i=0; for (; i4<=in-i4; ++i, i4+=4) // octant 0 { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; } for (; i4-in <= 0; ++i, i4+=4) // octant 1 { int xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; } for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 { int xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; } for (; i>2; if ((n&7)==0) res[quart] = res[quart+1] = hsqt2; for (size_t i=2, j=2*quart-2; i>1; if ((n&3)==0) for (size_t i=0; i *cdata() const { return reinterpret_cast *>(data.data()); } }; struct util // hack to avoid duplicate symbols { static NOINLINE size_t largest_prime_factor (size_t n) { size_t res=1; while ((n&1)==0) { res=2; n>>=1; } size_t limit=size_t(sqrt(n+0.01)); for (size_t x=3; x<=limit; x+=2) while ((n/x)*x==n) { res=x; n/=x; limit=size_t(sqrt(n+0.01)); } if (n>1) res=n; return res; } static NOINLINE double cost_guess (size_t n) { const double lfp=1.1; // penalty for non-hardcoded larger factors size_t ni=n; double result=0.; while ((n&1)==0) { result+=2; n>>=1; } size_t limit=size_t(sqrt(n+0.01)); for (size_t x=3; x<=limit; x+=2) while ((n/x)*x==n) { result+= (x<=5) ? x : lfp*x; // penalize larger prime factors n/=x; limit=size_t(sqrt(n+0.01)); } if (n>1) result+=(n<=5) ? n : lfp*n; return result*ni; } /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ static NOINLINE size_t good_size(size_t n) { if (n<=12) return n; size_t bestfac=2*n; for (size_t f2=1; f2=n) bestfac=f235711; return bestfac; } static size_t prod(const shape_t &shape) { size_t res=1; for (auto sz: shape) res*=sz; return res; } static NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace) { auto ndim = shape.size(); if (ndim<1) throw runtime_error("ndim must be >= 1"); if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim)) throw runtime_error("stride dimension mismatch"); for (auto shp: shape) if (shp<1) throw runtime_error("zero extent detected"); if (inplace) for (size_t i=0; i=ndim) throw runtime_error("bad axis number"); if (++tmp[ax]>1) throw runtime_error("axis specified repeatedly"); } } static NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, size_t axis) { sanity_check(shape, stride_in, stride_out, inplace); if (axis>=shape.size()) throw runtime_error("bad axis number"); } }; #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] #define WA(x,i) wa[(i)-1+(x)*(ido-1)] // // complex FFTPACK transforms // template class cfftp { private: struct fctdata { size_t fct; cmplx *tw, *tws; }; size_t length; arr> mem; vector fact; void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } template void pass2 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=2; if (ido==1) for (size_t k=0; k(WA(0,i)); } } } #define PREP3(idx) \ T t0 = CC(idx,0,k), t1, t2; \ PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define PARTSTEP3a(u1,u2,twr,twi) \ { \ T ca,cb; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define PARTSTEP3b(u1,u2,twr,twi) \ { \ T ca,cb,da,db; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(da,db,ca,cb); \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass3 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=3; constexpr T0 tw1r=-0.5, tw1i= (bwd ? 1.: -1.) * 0.86602540378443864676; if (ido==1) for (size_t k=0; k void pass4 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=4; if (ido==1) for (size_t k=0; k wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i); PMC(CH(i,k,0),c3,t2,t3); PMC(c2,c4,t1,t4); CH(i,k,1) = c2.template special_mul(wa0); CH(i,k,2) = c3.template special_mul(wa1); CH(i,k,3) = c4.template special_mul(wa2); } } } #define PREP5(idx) \ T t0 = CC(idx,0,k), t1, t2, t3, t4; \ PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; #define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb,da,db; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PMC(da,db,ca,cb); \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass5 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=5; constexpr T0 tw1r= 0.3090169943749474241, tw1i= (bwd ? 1.: -1.) * 0.95105651629515357212, tw2r= -0.8090169943749474241, tw2i= (bwd ? 1.: -1.) * 0.58778525229247312917; if (ido==1) for (size_t k=0; k(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass7(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=7; constexpr T0 tw1r= 0.623489801858733530525, tw1i= (bwd ? 1. : -1.) * 0.7818314824680298087084, tw2r= -0.222520933956314404289, tw2i= (bwd ? 1. : -1.) * 0.9749279121818236070181, tw3r= -0.9009688679024191262361, tw3i= (bwd ? 1. : -1.) * 0.4338837391175581204758; if (ido==1) for (size_t k=0; k(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass11 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { const size_t cdim=11; const T0 tw1r = 0.8412535328311811688618, tw1i = (bwd ? 1. : -1.) * 0.5406408174555975821076, tw2r = 0.4154150130018864255293, tw2i = (bwd ? 1. : -1.) * 0.9096319953545183714117, tw3r = -0.1423148382732851404438, tw3i = (bwd ? 1. : -1.) * 0.9898214418809327323761, tw4r = -0.6548607339452850640569, tw4i = (bwd ? 1. : -1.) * 0.755749574354258283774, tw5r = -0.9594929736144973898904, tw5i = (bwd ? 1. : -1.) * 0.2817325568414296977114; if (ido==1) for (size_t k=0; k void passg (size_t ido, size_t ip, size_t l1, T * restrict cc, T * restrict ch, const cmplx * restrict wa, const cmplx * restrict csarr) { const size_t cdim=ip; size_t ipph = (ip+1)/2; size_t idl1 = ido*l1; arr> wal(ip); wal[0] = cmplx(1., 0.); for (size_t i=1; i(csarr[i].r,bwd ? csarr[i].i : -csarr[i].i); for (size_t k=0; kip) iwal-=ip; cmplx xwal=wal[iwal]; iwal+=l; if (iwal>ip) iwal-=ip; cmplx xwal2=wal[iwal]; for (size_t ik=0; ikip) iwal-=ip; cmplx xwal=wal[iwal]; for (size_t ik=0; ik(wa[idij]); idij=(jc-1)*(ido-1)+i-1; CX(i,k,jc) = x2.template special_mul(wa[idij]); } } } } #undef CH2 #undef CX2 #undef CX template void pass_all(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t l1=1; arr ch(length); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1 (ido, l1, p1, p2, fact[k1].tw); else if(ip==2) pass2(ido, l1, p1, p2, fact[k1].tw); else if(ip==3) pass3 (ido, l1, p1, p2, fact[k1].tw); else if(ip==5) pass5 (ido, l1, p1, p2, fact[k1].tw); else if(ip==7) pass7 (ido, l1, p1, p2, fact[k1].tw); else if(ip==11) pass11 (ido, l1, p1, p2, fact[k1].tw); else { passg(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws); swap(p1,p2); } swap(p1,p2); l1=l2; } if (p1!=c) { if (fct!=1.) for (size_t i=0; i void forward(T c[], T0 fct) { pass_all(c, fct); } template void backward(T c[], T0 fct) { pass_all(c, fct); } private: NOINLINE void factorize() { size_t len=length; while ((len&3)==0) { add_factor(4); len>>=2; } if ((len&1)==0) { len>>=1; // factor 2 should be at the front of the factor list add_factor(2); swap(fact[0].fct, fact.back().fct); } size_t maxl = size_t(sqrt(double(len)))+1; for (size_t divisor=3; (len>1)&&(divisor1) add_factor(len); } size_t twsize() const { size_t twsize=0, l1=1; for (size_t k=0; k11) twsize+=ip; l1*=ip; } return twsize; } void comp_twiddle() { sincos_2pibyn twid(length, false); auto twiddle = twid.cdata(); size_t l1=1; size_t memofs=0; for (size_t k=0; k11) { fact[k].tws=mem.data()+memofs; memofs+=ip; for (size_t j=0; j class rfftp { private: struct fctdata { size_t fct; T0 *tw, *tws; }; size_t length; arr mem; vector fact; void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } #define WA(x,i) wa[(i)+(x)*(ido-1)] #define PM(a,b,c,d) { a=c+d; b=c-d; } /* (a+ib) = conj(c+id) * (e+if) */ #define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] template void radf2 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=2; for (size_t k=0; k void radf3(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=0.86602540378443864676; for (size_t k=0; k void radf4(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; constexpr T0 hsqt2=0.70710678118654752440; for (size_t k=0; k void radf5(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212, tr12=-0.8090169943749474241, ti12=0.58778525229247312917; for (size_t k=0; k void radfg(size_t ido, size_t ip, size_t l1, T * restrict cc, T * restrict ch, const T0 * restrict wa, const T0 * restrict csarr) { const size_t cdim=ip; size_t ipph=(ip+1)/2; size_t idl1 = ido*l1; if (ido>1) { for (size_t j=1, jc=ip-1; j=ip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; T0 ar=csarr[2*iang], ai=csarr[2*iang+1]; for (size_t ik=0; ik void radb2(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=2; for (size_t k=0; k void radb3(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=0.86602540378443864676; for (size_t k=0; k void radb4(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; constexpr T0 sqrt2=1.41421356237309504880; for (size_t k=0; k void radb5(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212, tr12=-0.8090169943749474241, ti12=0.58778525229247312917; for (size_t k=0; k void radbg(size_t ido, size_t ip, size_t l1, T * restrict cc, T * restrict ch, const T0 * restrict wa, const T0 * restrict csarr) { const size_t cdim=ip; size_t ipph=(ip+1)/ 2; size_t idl1 = ido*l1; for (size_t k=0; kip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; T0 war=csarr[2*iang], wai=csarr[2*iang+1]; for (size_t ik=0; ik void copy_and_norm(T *c, T *p1, size_t n, T0 fct) { if (p1!=c) { if (fct!=1.) for (size_t i=0; i void forward(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=n, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1 void backward(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=1, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); for(size_t k=0; k>=2; } if ((len%2)==0) { len>>=1; // factor 2 should be at the front of the factor list add_factor(2); swap(fact[0].fct, fact.back().fct); } size_t maxl = size_t(sqrt(double(len)))+1; for (size_t divisor=3; (len>1)&&(divisor1) add_factor(len); } size_t twsize() const { size_t twsz=0, l1=1; for (size_t k=0; k5) twsz+=2*ip; l1*=ip; } return twsz; } void comp_twiddle() { sincos_2pibyn twid(length, true); size_t l1=1; T0 *ptr=mem.data(); for (size_t k=0; k5) // special factors required by *g functions { fact[k].tws=ptr; ptr+=2*ip; fact[k].tws[0] = 1.; fact[k].tws[1] = 0.; for (size_t i=1; i<=(ip>>1); ++i) { fact[k].tws[2*i ] = twid[2*i*(length/ip)]; fact[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; fact[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; fact[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; } } l1*=ip; } } public: NOINLINE rfftp(size_t length_) : length(length_) { if (length==0) throw runtime_error("zero-sized FFT"); if (length==1) return; factorize(); mem.resize(twsize()); comp_twiddle(); } }; // // complex Bluestein transforms // template class fftblue { private: size_t n, n2; cfftp plan; arr> mem; cmplx *bk, *bkf; template void fft(cmplx c[], T0 fct) { arr> akf(n2); /* initialize a_k and FFT it */ for (size_t m=0; m(bk[m]); for (size_t m=n; m(bkf[m]); /* inverse FFT */ plan.backward (akf.data(),1.); /* multiply by b_k */ for (size_t m=0; m(bk[m])*fct; } public: NOINLINE fftblue(size_t length) : n(length), n2(util::good_size(n*2-1)), plan(n2), mem(n+n2), bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ sincos_2pibyn tmp_(2*n, false); auto tmp = tmp_.cdata(); bk[0].Set(1, 0); size_t coeff=0; for (size_t m=1; m=2*n) coeff-=2*n; bk[m] = tmp[coeff]; } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ T0 xn2 = 1./n2; bkf[0] = bk[0]*xn2; for (size_t m=1; m void backward(cmplx c[], T0 fct) { fft(c,fct); } template void forward(cmplx c[], T0 fct) { fft(c,fct); } template void backward_r(T c[], T0 fct) { arr> tmp(n); tmp[0].Set(c[0],c[0]*0); memcpy (reinterpret_cast(tmp.data()+1), reinterpret_cast(c+1), (n-1)*sizeof(T)); if ((n&1)==0) tmp[n/2].i=c[0]*0.; for (size_t m=1; 2*m(tmp.data(),fct); for (size_t m=0; m void forward_r(T c[], T0 fct) { arr> tmp(n); for (size_t m=0; m(tmp.data(),fct); c[0] = tmp[0].r; memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); } }; // // flexible (FFTPACK/Bluestein) complex 1D transform // template class pocketfft_c { private: unique_ptr> packplan; unique_ptr> blueplan; size_t len; public: NOINLINE pocketfft_c(size_t length) : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length))) { packplan=unique_ptr>(new cfftp(length)); return; } double comp1 = util::cost_guess(length); double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2>(new fftblue(length)); else packplan=unique_ptr>(new cfftp(length)); } template NOINLINE void backward(cmplx c[], T0 fct) { packplan ? packplan->backward(c,fct) : blueplan->backward(c,fct); } template NOINLINE void forward(cmplx c[], T0 fct) { packplan ? packplan->forward(c,fct) : blueplan->forward(c,fct); } size_t length() const { return len; } }; // // flexible (FFTPACK/Bluestein) real-valued 1D transform // template class pocketfft_r { private: unique_ptr> packplan; unique_ptr> blueplan; size_t len; public: NOINLINE pocketfft_r(size_t length) : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length))) { packplan=unique_ptr>(new rfftp(length)); return; } double comp1 = 0.5*util::cost_guess(length); double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2>(new fftblue(length)); else packplan=unique_ptr>(new rfftp(length)); } template NOINLINE void backward(T c[], T0 fct) { packplan ? packplan->backward(c,fct) : blueplan->backward_r(c,fct); } template NOINLINE void forward(T c[], T0 fct) { packplan ? packplan->forward(c,fct) : blueplan->forward_r(c,fct); } size_t length() const { return len; } }; // // multi-D infrastructure // template class ndarr { private: char *d; const char *cd; shape_t shp; stride_t str; public: ndarr(const void *data_, const shape_t &shape_, const stride_t &stride_) : d(nullptr), cd(reinterpret_cast(data_)), shp(shape_), str(stride_) {} ndarr(void *data_, const shape_t &shape_, const stride_t &stride_) : d(reinterpret_cast(data_)), cd(reinterpret_cast(data_)), shp(shape_), str(stride_) {} size_t ndim() const { return shp.size(); } size_t size() const { return util::prod(shp); } const shape_t &shape() const { return shp; } size_t shape(size_t i) const { return shp[i]; } const stride_t &stride() const { return str; } const ptrdiff_t &stride(size_t i) const { return str[i]; } T &operator[](ptrdiff_t ofs) { return *reinterpret_cast(d+ofs); } const T &operator[](ptrdiff_t ofs) const { return *reinterpret_cast(cd+ofs); } }; template class multi_iter { public: shape_t pos; const ndarr &iarr; ndarr &oarr; private: ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o; size_t idim, rem; void advance_i() { for (int i=pos.size()-1; i>=0; --i) { if (i==int(idim)) continue; ++pos[i]; p_ii += iarr.stride(i); p_oi += oarr.stride(i); if (pos[i] < iarr.shape(i)) return; pos[i] = 0; p_ii -= iarr.shape(i)*iarr.stride(i); p_oi -= oarr.shape(i)*oarr.stride(i); } } public: multi_iter(const ndarr &iarr_, ndarr &oarr_, size_t idim_) : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0), str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)), idim(idim_), rem(iarr.size()/iarr.shape(idim)) {} void advance(size_t n) { if (rem struct VTYPE{}; template<> struct VTYPE { using type = double __attribute__ ((vector_size (VBYTELEN))); static constexpr int vlen=VBYTELEN/sizeof(double); }; template<> struct VTYPE { using type = float __attribute__ ((vector_size (VBYTELEN))); static constexpr int vlen=VBYTELEN/sizeof(float); }; #else template struct VTYPE{}; template<> struct VTYPE { static constexpr int vlen=1; }; template<> struct VTYPE { static constexpr int vlen=1; }; #endif template arr alloc_tmp(const shape_t &shape, size_t axsize, size_t elemsize) { auto othersize = util::prod(shape)/axsize; auto tmpsize = axsize*((othersize>=VTYPE::vlen) ? VTYPE::vlen : 1); return arr(tmpsize*elemsize); } template arr alloc_tmp(const shape_t &shape, const shape_t &axes, size_t elemsize) { size_t fullsize=util::prod(shape); size_t tmpsize=0; for (size_t i=0; i=VTYPE::vlen) ? VTYPE::vlen : 1); if (sz>tmpsize) tmpsize=sz; } return arr(tmpsize*elemsize); } template NOINLINE void general_c( const ndarr> &in, ndarr> &out, const shape_t &axes, bool forward, T fct) { auto storage = alloc_tmp(in.shape(), axes, sizeof(cmplx)); unique_ptr> plan; for (size_t iax=0; iax::vlen; multi_iter, cmplx> it(iax==0? in : out, out, axes[iax]); size_t len=it.length_in(); if ((!plan) || (len!=plan->length())) plan.reset(new pocketfft_c(len)); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; iforward (tdatav, fct) : plan->backward(tdatav, fct); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast *>(storage.data()); if (it.inplace() && it.contiguous_out()) // fully in-place forward ? plan->forward ((&it.out(0)), fct) : plan->backward((&it.out(0)), fct); else if (it.contiguous_out()) // compute FFT in output location { for (size_t i=0; iforward ((&it.out(0)), fct) : plan->backward((&it.out(0)), fct); } else { for (size_t i=0; iforward (tdata, fct) : plan->backward(tdata, fct); for (size_t i=0; i NOINLINE void general_hartley( const ndarr &in, ndarr &out, const shape_t &axes, T fct) { auto storage = alloc_tmp(in.shape(), axes, sizeof(T)); unique_ptr> plan; for (size_t iax=0; iax::vlen; multi_iter it(iax==0 ? in : out, out, axes[iax]); size_t len=it.length_in(); if ((!plan) || (len!=plan->length())) plan.reset(new pocketfft_r(len)); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdatav, fct); for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdata, fct); // Hartley order it.out(0) = tdata[0]; size_t i=1, i1=1, i2=len-1; for (i=1; i NOINLINE void general_r2c( const ndarr &in, ndarr> &out, size_t axis, T fct) { auto storage = alloc_tmp(in.shape(), in.shape(axis), sizeof(T)); pocketfft_r plan(in.shape(axis)); constexpr int vlen = VTYPE::vlen; multi_iter> it(in, out, axis); size_t len=in.shape(axis); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; i NOINLINE void general_c2r( const ndarr> &in, ndarr &out, size_t axis, T fct) { auto storage = alloc_tmp(out.shape(), out.shape(axis), sizeof(T)); pocketfft_r plan(out.shape(axis)); constexpr int vlen = VTYPE::vlen; multi_iter, T> it(in, out, axis); size_t len=out.shape(axis); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); tdata[0]=it.in(0).r; { size_t i=1, ii=1; for (; i NOINLINE void general_r( const ndarr &in, ndarr &out, size_t axis, bool forward, T fct) { auto storage = alloc_tmp(in.shape(), in.shape(axis), sizeof(T)); constexpr int vlen = VTYPE::vlen; multi_iter it(in, out, axis); size_t len=it.length_in(); pocketfft_r plan(len); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); if (it.inplace() && it.contiguous_out()) // fully in-place forward ? plan.forward (&it.out(0), fct) : plan.backward(&it.out(0), fct); else if (it.contiguous_out()) // compute FFT in output location { for (size_t i=0; i void c2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const std::complex *data_in, std::complex *data_out, T fct) { using namespace detail; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); ndarr> ain(data_in, shape, stride_in), aout(data_out, shape, stride_out); general_c(ain, aout, axes, forward, fct); } template void r2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, size_t axis, const T *data_in, std::complex *data_out, T fct) { using namespace detail; util::sanity_check(shape, stride_in, stride_out, false, axis); ndarr ain(data_in, shape, stride_in); ndarr> aout(data_out, shape, stride_out); general_r2c(ain, aout, axis, fct); } template void r2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, std::complex *data_out, T fct) { using namespace detail; util::sanity_check(shape, stride_in, stride_out, false, axes); r2c(shape, stride_in, stride_out, axes.back(), data_in, data_out, fct); if (axes.size()==1) return; shape_t shape_out(shape); shape_out[axes.back()] = shape[axes.back()]/2 + 1; auto newaxes = shape_t{axes.begin(), --axes.end()}; c2c(shape_out, stride_out, stride_out, newaxes, true, data_out, data_out, T(1)); } template void c2r(const shape_t &shape, size_t new_size, const stride_t &stride_in, const stride_t &stride_out, size_t axis, const std::complex *data_in, T *data_out, T fct) { using namespace detail; util::sanity_check(shape, stride_in, stride_out, false, axis); shape_t shape_out(shape); shape_out[axis] = new_size; ndarr> ain(data_in, shape, stride_in); ndarr aout(data_out, shape_out, stride_out); general_c2r(ain, aout, axis, fct); } template void c2r(const shape_t &shape, size_t new_size, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const std::complex *data_in, T *data_out, T fct) { using namespace detail; if (axes.size()==1) { c2r(shape, new_size, stride_in, stride_out, axes[0], data_in, data_out, fct); return; } util::sanity_check(shape, stride_in, stride_out, false, axes); auto nval = util::prod(shape); stride_t stride_inter(shape.size()); stride_inter.back() = sizeof(cmplx); for (int i=shape.size()-2; i>=0; --i) stride_inter[i] = stride_inter[i+1]*shape[i+1]; arr> tmp(nval); auto newaxes = shape_t({axes.begin(), --axes.end()}); c2c(shape, stride_in, stride_inter, newaxes, false, data_in, tmp.data(), T(1)); c2r(shape, new_size, stride_inter, stride_out, axes.back(), tmp.data(), data_out, fct); } template void r2r_fftpack(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, size_t axis, bool forward, const T *data_in, T *data_out, T fct) { using namespace detail; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axis); ndarr ain(data_in, shape, stride_in), aout(data_out, shape, stride_out); general_r(ain, aout, axis, forward, fct); } template void r2r_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct) { using namespace detail; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); ndarr ain(data_in, shape, stride_in), aout(data_out, shape, stride_out); general_hartley(ain, aout, axes, fct); } } // namespace pocketfft #endif // POCKETFFT_HDRONLY_H