/* This file is part of pocketfft. Copyright (C) 2010-2019 Max-Planck-Society Author: Martin Reinecke All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef POCKETFFT_HDRONLY_H #define POCKETFFT_HDRONLY_H #ifndef __cplusplus #error This file is C++ and requires a C++ compiler #endif #if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L) #error This file requires at least C++11 support #endif #include #include #include #include #include #include #include #if defined(_WIN32) #include #endif #ifdef POCKETFFT_OPENMP #include #endif #ifdef __GNUC__ #define NOINLINE __attribute__((noinline)) #define restrict __restrict__ #else #define NOINLINE #define restrict #endif namespace pocketfft { using shape_t = std::vector; using stride_t = std::vector; constexpr bool FORWARD = true, BACKWARD = false; namespace detail { using namespace std; #ifndef POCKETFFT_NO_VECTORS #if (defined(__AVX512F__)) #define HAVE_VECSUPPORT constexpr int VBYTELEN=64; #elif (defined(__AVX__)) #define HAVE_VECSUPPORT constexpr int VBYTELEN=32; #elif (defined(__SSE2__)) #define HAVE_VECSUPPORT constexpr int VBYTELEN=16; #else #define POCKETFFT_NO_VECTORS #endif #endif template class arr { private: T *p; size_t sz; static T *ralloc(size_t num) { if (num==0) return nullptr; #ifdef POCKETFFT_NO_VECTORS void *res = malloc(num*sizeof(T)); if (!res) throw bad_alloc(); #else #if __cplusplus >= 201703L void *res = aligned_alloc(64,num*sizeof(T)); if (!res) throw bad_alloc(); #elif defined(_WIN32) void *res = _aligned_malloc(num*sizeof(T), 64); if (!res) throw bad_alloc(); #else void *res(nullptr); if (posix_memalign(&res, 64, num*sizeof(T))!=0) throw bad_alloc(); #endif #endif return reinterpret_cast(res); } static void dealloc(T *ptr) { free(ptr); } public: arr() : p(0), sz(0) {} arr(size_t n) : p(ralloc(n)), sz(n) {} arr(arr &&other) : p(other.p), sz(other.sz) { other.p=nullptr; other.sz=0; } ~arr() { dealloc(p); } void resize(size_t n) { if (n==sz) return; dealloc(p); p = ralloc(n); sz = n; } T &operator[](size_t idx) { return p[idx]; } const T &operator[](size_t idx) const { return p[idx]; } T *data() { return p; } const T *data() const { return p; } size_t size() const { return sz; } }; template struct cmplx { T r, i; cmplx() {} cmplx(T r_, T i_) : r(r_), i(i_) {} void Set(T r_, T i_) { r=r_; i=i_; } void Set(T r_) { r=r_; i=T(0); } cmplx &operator+= (const cmplx &other) { r+=other.r; i+=other.i; return *this; } templatecmplx &operator*= (T2 other) { r*=other; i*=other; return *this; } cmplx operator+ (const cmplx &other) const { return cmplx(r+other.r, i+other.i); } cmplx operator- (const cmplx &other) const { return cmplx(r-other.r, i-other.i); } template auto operator* (const T2 &other) const -> cmplx { return {r*other, i*other}; } template auto operator* (const cmplx &other) const -> cmplx { return {r*other.r-i*other.i, r*other.i + i*other.r}; } template auto special_mul (const cmplx &other) const -> cmplx { return bwd ? cmplx(r*other.r-i*other.i, r*other.i + i*other.r) : cmplx(r*other.r+i*other.i, i*other.r - r*other.i); } }; template void PMC(cmplx &a, cmplx &b, const cmplx &c, const cmplx &d) { a = c+d; b = c-d; } template cmplx conj(const cmplx &a) { return {a.r, -a.i}; } template void ROT90(cmplx &a) { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } template void ROTM90(cmplx &a) { auto tmp_=-a.r; a.r=a.i; a.i=tmp_; } // // twiddle factor section // template class sincos_2pibyn { private: template struct TypeSelector {}; template struct TypeSelector { using type = Ta; }; template struct TypeSelector { using type = Tb; }; using Thigh = typename TypeSelectorsizeof(double))>::type; arr data; // adapted from https://stackoverflow.com/questions/42792939/ // CAUTION: this function only works for arguments in the range // [-0.25; 0.25]! void my_sincosm1pi (Thigh a, Thigh *restrict res) { if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double { Thigh pi = Thigh(3.141592653589793238462643383279502884197L); res[1] = sin(pi*a); auto s = res[1]; res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1); return; } double s = a * a; /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ double r = -1.0369917389758117e-4; r = fma (r, s, 1.9294935641298806e-3); r = fma (r, s, -2.5806887942825395e-2); r = fma (r, s, 2.3533063028328211e-1); r = fma (r, s, -1.3352627688538006e+0); r = fma (r, s, 4.0587121264167623e+0); r = fma (r, s, -4.9348022005446790e+0); double c = r*s; /* Approximate sin(pi*x) for x in [-0.25,0.25] */ r = 4.6151442520157035e-4; r = fma (r, s, -7.3700183130883555e-3); r = fma (r, s, 8.2145868949323936e-2); r = fma (r, s, -5.9926452893214921e-1); r = fma (r, s, 2.5501640398732688e+0); r = fma (r, s, -5.1677127800499516e+0); s = s * a; r = r * s; s = fma (a, 3.1415926535897931e+0, r); res[0] = c; res[1] = s; } NOINLINE void calc_first_octant(size_t den, T * restrict res) { size_t n = (den+4)>>3; if (n==0) return; res[0]=1.; res[1]=0.; if (n==1) return; size_t l1 = size_t(sqrt(n)); arr tmp(2*l1); for (size_t i=1; in) end = n-start; for (size_t i=1; i>2; size_t i=0, idx1=0, idx2=2*ndone-2; for (; i+1>1; T * p = res+n-1; calc_first_octant(n<<2, p); int i4=0, in=n, i=0; for (; i4<=in-i4; ++i, i4+=4) // octant 0 { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; } for (; i4-in <= 0; ++i, i4+=4) // octant 1 { int xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; } for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 { int xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; } for (; i>2; if ((n&7)==0) res[quart] = res[quart+1] = hsqt2; for (size_t i=2, j=2*quart-2; i>1; if ((n&3)==0) for (size_t i=0; i *cdata() const { return reinterpret_cast *>(data.data()); } }; struct util // hack to avoid duplicate symbols { static NOINLINE size_t largest_prime_factor (size_t n) { size_t res=1; while ((n&1)==0) { res=2; n>>=1; } size_t limit=size_t(sqrt(n+0.01)); for (size_t x=3; x<=limit; x+=2) while ((n/x)*x==n) { res=x; n/=x; limit=size_t(sqrt(n+0.01)); } if (n>1) res=n; return res; } static NOINLINE double cost_guess (size_t n) { constexpr double lfp=1.1; // penalty for non-hardcoded larger factors size_t ni=n; double result=0.; while ((n&1)==0) { result+=2; n>>=1; } size_t limit=size_t(sqrt(n+0.01)); for (size_t x=3; x<=limit; x+=2) while ((n/x)*x==n) { result+= (x<=5) ? x : lfp*x; // penalize larger prime factors n/=x; limit=size_t(sqrt(n+0.01)); } if (n>1) result+=(n<=5) ? n : lfp*n; return result*ni; } /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ static NOINLINE size_t good_size(size_t n) { if (n<=12) return n; size_t bestfac=2*n; for (size_t f2=1; f2=n) bestfac=f235711; return bestfac; } static size_t prod(const shape_t &shape) { size_t res=1; for (auto sz: shape) res*=sz; return res; } static NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace) { auto ndim = shape.size(); if (ndim<1) throw runtime_error("ndim must be >= 1"); if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim)) throw runtime_error("stride dimension mismatch"); if (inplace && (stride_in!=stride_out)) throw runtime_error("stride mismatch"); } static NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, const shape_t &axes) { sanity_check(shape, stride_in, stride_out, inplace); auto ndim = shape.size(); shape_t tmp(ndim,0); for (auto ax : axes) { if (ax>=ndim) throw runtime_error("bad axis number"); if (++tmp[ax]>1) throw runtime_error("axis specified repeatedly"); } } static NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, size_t axis) { sanity_check(shape, stride_in, stride_out, inplace); if (axis>=shape.size()) throw runtime_error("bad axis number"); } #ifdef POCKETFFT_OPENMP static size_t nthreads() { return omp_get_num_threads(); } static size_t thread_num() { return omp_get_thread_num(); } static size_t thread_count (size_t nthreads, const shape_t &shape, size_t axis) { if (nthreads==1) return 1; if (prod(shape)/shape[axis] < 20) return 1; return (nthreads==0) ? omp_get_max_threads() : nthreads; } #else static size_t nthreads() { return 1; } static size_t thread_num() { return 0; } #endif }; #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] #define WA(x,i) wa[(i)-1+(x)*(ido-1)] // // complex FFTPACK transforms // template class cfftp { private: struct fctdata { size_t fct; cmplx *tw, *tws; }; size_t length; arr> mem; vector fact; void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } template void pass2 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=2; if (ido==1) for (size_t k=0; k(WA(0,i)); } } } #define PREP3(idx) \ T t0 = CC(idx,0,k), t1, t2; \ PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define PARTSTEP3a(u1,u2,twr,twi) \ { \ T ca,cb; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define PARTSTEP3b(u1,u2,twr,twi) \ { \ T ca,cb,da,db; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(da,db,ca,cb); \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass3 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=3; constexpr T0 tw1r=-0.5, tw1i= (bwd ? 1: -1) * T0(0.8660254037844386467637231707529362L); if (ido==1) for (size_t k=0; k void pass4 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=4; if (ido==1) for (size_t k=0; k wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i); PMC(CH(i,k,0),c3,t2,t3); PMC(c2,c4,t1,t4); CH(i,k,1) = c2.template special_mul(wa0); CH(i,k,2) = c3.template special_mul(wa1); CH(i,k,3) = c4.template special_mul(wa2); } } } #define PREP5(idx) \ T t0 = CC(idx,0,k), t1, t2, t3, t4; \ PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; #define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb,da,db; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PMC(da,db,ca,cb); \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass5 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=5; constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), tw1i= (bwd ? 1: -1) * T0(0.9510565162951535721164393333793821L), tw2r= T0(-0.8090169943749474241022934171828191L), tw2i= (bwd ? 1: -1) * T0(0.5877852522924731291687059546390728L); if (ido==1) for (size_t k=0; k(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass7(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=7; constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), tw1i= (bwd ? 1 : -1) * T0(0.7818314824680298087084445266740578L), tw2r= T0(-0.2225209339563144042889025644967948L), tw2i= (bwd ? 1 : -1) * T0(0.9749279121818236070181316829939312L), tw3r= T0(-0.9009688679024191262361023195074451L), tw3i= (bwd ? 1 : -1) * T0(0.433883739117558120475768332848359L); if (ido==1) for (size_t k=0; k(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass11 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const cmplx * restrict wa) { constexpr size_t cdim=11; constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), tw1i= (bwd ? 1 : -1) * T0(0.5406408174555975821076359543186917L), tw2r= T0(0.4154150130018864255292741492296232L), tw2i= (bwd ? 1 : -1) * T0(0.9096319953545183714117153830790285L), tw3r= T0(-0.1423148382732851404437926686163697L), tw3i= (bwd ? 1 : -1) * T0(0.9898214418809327323760920377767188L), tw4r= T0(-0.6548607339452850640569250724662936L), tw4i= (bwd ? 1 : -1) * T0(0.7557495743542582837740358439723444L), tw5r= T0(-0.9594929736144973898903680570663277L), tw5i= (bwd ? 1 : -1) * T0(0.2817325568414296977114179153466169L); if (ido==1) for (size_t k=0; k void passg (size_t ido, size_t ip, size_t l1, T * restrict cc, T * restrict ch, const cmplx * restrict wa, const cmplx * restrict csarr) { const size_t cdim=ip; size_t ipph = (ip+1)/2; size_t idl1 = ido*l1; arr> wal(ip); wal[0] = cmplx(1., 0.); for (size_t i=1; i(csarr[i].r,bwd ? csarr[i].i : -csarr[i].i); for (size_t k=0; kip) iwal-=ip; cmplx xwal=wal[iwal]; iwal+=l; if (iwal>ip) iwal-=ip; cmplx xwal2=wal[iwal]; for (size_t ik=0; ikip) iwal-=ip; cmplx xwal=wal[iwal]; for (size_t ik=0; ik(wa[idij]); idij=(jc-1)*(ido-1)+i-1; CX(i,k,jc) = x2.template special_mul(wa[idij]); } } } } #undef CH2 #undef CX2 #undef CX template void pass_all(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t l1=1; arr ch(length); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1 (ido, l1, p1, p2, fact[k1].tw); else if(ip==2) pass2(ido, l1, p1, p2, fact[k1].tw); else if(ip==3) pass3 (ido, l1, p1, p2, fact[k1].tw); else if(ip==5) pass5 (ido, l1, p1, p2, fact[k1].tw); else if(ip==7) pass7 (ido, l1, p1, p2, fact[k1].tw); else if(ip==11) pass11 (ido, l1, p1, p2, fact[k1].tw); else { passg(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws); swap(p1,p2); } swap(p1,p2); l1=l2; } if (p1!=c) { if (fct!=1.) for (size_t i=0; i void forward(T c[], T0 fct) { pass_all(c, fct); } template void backward(T c[], T0 fct) { pass_all(c, fct); } private: NOINLINE void factorize() { size_t len=length; while ((len&3)==0) { add_factor(4); len>>=2; } if ((len&1)==0) { len>>=1; // factor 2 should be at the front of the factor list add_factor(2); swap(fact[0].fct, fact.back().fct); } size_t maxl = size_t(sqrt(double(len)))+1; for (size_t divisor=3; (len>1)&&(divisor1) add_factor(len); } size_t twsize() const { size_t twsize=0, l1=1; for (size_t k=0; k11) twsize+=ip; l1*=ip; } return twsize; } void comp_twiddle() { sincos_2pibyn twid(length, false); auto twiddle = twid.cdata(); size_t l1=1; size_t memofs=0; for (size_t k=0; k11) { fact[k].tws=mem.data()+memofs; memofs+=ip; for (size_t j=0; j class rfftp { private: struct fctdata { size_t fct; T0 *tw, *tws; }; size_t length; arr mem; vector fact; void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } #define WA(x,i) wa[(i)+(x)*(ido-1)] #define PM(a,b,c,d) { a=c+d; b=c-d; } /* (a+ib) = conj(c+id) * (e+if) */ #define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] template void radf2 (size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=2; for (size_t k=0; k void radf3(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); for (size_t k=0; k void radf4(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); for (size_t k=0; k void radf5(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), ti11= T0(0.9510565162951535721164393333793821L), tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); for (size_t k=0; k void radfg(size_t ido, size_t ip, size_t l1, T * restrict cc, T * restrict ch, const T0 * restrict wa, const T0 * restrict csarr) { const size_t cdim=ip; size_t ipph=(ip+1)/2; size_t idl1 = ido*l1; if (ido>1) { for (size_t j=1, jc=ip-1; j=ip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; T0 ar=csarr[2*iang], ai=csarr[2*iang+1]; for (size_t ik=0; ik void radb2(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=2; for (size_t k=0; k void radb3(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); for (size_t k=0; k void radb4(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); for (size_t k=0; k void radb5(size_t ido, size_t l1, const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), ti11= T0(0.9510565162951535721164393333793821L), tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); for (size_t k=0; k void radbg(size_t ido, size_t ip, size_t l1, T * restrict cc, T * restrict ch, const T0 * restrict wa, const T0 * restrict csarr) { const size_t cdim=ip; size_t ipph=(ip+1)/ 2; size_t idl1 = ido*l1; for (size_t k=0; kip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; T0 war=csarr[2*iang], wai=csarr[2*iang+1]; for (size_t ik=0; ik void copy_and_norm(T *c, T *p1, size_t n, T0 fct) { if (p1!=c) { if (fct!=1.) for (size_t i=0; i void forward(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=n, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1 void backward(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=1, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); for(size_t k=0; k>=2; } if ((len%2)==0) { len>>=1; // factor 2 should be at the front of the factor list add_factor(2); swap(fact[0].fct, fact.back().fct); } size_t maxl = size_t(sqrt(double(len)))+1; for (size_t divisor=3; (len>1)&&(divisor1) add_factor(len); } size_t twsize() const { size_t twsz=0, l1=1; for (size_t k=0; k5) twsz+=2*ip; l1*=ip; } return twsz; } void comp_twiddle() { sincos_2pibyn twid(length, true); size_t l1=1; T0 *ptr=mem.data(); for (size_t k=0; k5) // special factors required by *g functions { fact[k].tws=ptr; ptr+=2*ip; fact[k].tws[0] = 1.; fact[k].tws[1] = 0.; for (size_t i=1; i<=(ip>>1); ++i) { fact[k].tws[2*i ] = twid[2*i*(length/ip)]; fact[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; fact[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; fact[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; } } l1*=ip; } } public: NOINLINE rfftp(size_t length_) : length(length_) { if (length==0) throw runtime_error("zero-sized FFT"); if (length==1) return; factorize(); mem.resize(twsize()); comp_twiddle(); } }; // // complex Bluestein transforms // template class fftblue { private: size_t n, n2; cfftp plan; arr> mem; cmplx *bk, *bkf; template void fft(cmplx c[], T0 fct) { arr> akf(n2); /* initialize a_k and FFT it */ for (size_t m=0; m(bk[m]); for (size_t m=n; m(bkf[m]); /* inverse FFT */ plan.backward (akf.data(),1.); /* multiply by b_k */ for (size_t m=0; m(bk[m])*fct; } public: NOINLINE fftblue(size_t length) : n(length), n2(util::good_size(n*2-1)), plan(n2), mem(n+n2), bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ sincos_2pibyn tmp_(2*n, false); auto tmp = tmp_.cdata(); bk[0].Set(1, 0); size_t coeff=0; for (size_t m=1; m=2*n) coeff-=2*n; bk[m] = tmp[coeff]; } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ T0 xn2 = T0(1)/n2; bkf[0] = bk[0]*xn2; for (size_t m=1; m void backward(cmplx c[], T0 fct) { fft(c,fct); } template void forward(cmplx c[], T0 fct) { fft(c,fct); } template void backward_r(T c[], T0 fct) { arr> tmp(n); tmp[0].Set(c[0],c[0]*0); memcpy (reinterpret_cast(tmp.data()+1), reinterpret_cast(c+1), (n-1)*sizeof(T)); if ((n&1)==0) tmp[n/2].i=c[0]*0.; for (size_t m=1; 2*m(tmp.data(),fct); for (size_t m=0; m void forward_r(T c[], T0 fct) { arr> tmp(n); for (size_t m=0; m(tmp.data(),fct); c[0] = tmp[0].r; memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); } }; // // flexible (FFTPACK/Bluestein) complex 1D transform // template class pocketfft_c { private: unique_ptr> packplan; unique_ptr> blueplan; size_t len; public: NOINLINE pocketfft_c(size_t length) : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length))) { packplan=unique_ptr>(new cfftp(length)); return; } double comp1 = util::cost_guess(length); double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2>(new fftblue(length)); else packplan=unique_ptr>(new cfftp(length)); } template NOINLINE void backward(cmplx c[], T0 fct) { packplan ? packplan->backward(c,fct) : blueplan->backward(c,fct); } template NOINLINE void forward(cmplx c[], T0 fct) { packplan ? packplan->forward(c,fct) : blueplan->forward(c,fct); } size_t length() const { return len; } }; // // flexible (FFTPACK/Bluestein) real-valued 1D transform // template class pocketfft_r { private: unique_ptr> packplan; unique_ptr> blueplan; size_t len; public: NOINLINE pocketfft_r(size_t length) : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length))) { packplan=unique_ptr>(new rfftp(length)); return; } double comp1 = 0.5*util::cost_guess(length); double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2>(new fftblue(length)); else packplan=unique_ptr>(new rfftp(length)); } template NOINLINE void backward(T c[], T0 fct) { packplan ? packplan->backward(c,fct) : blueplan->backward_r(c,fct); } template NOINLINE void forward(T c[], T0 fct) { packplan ? packplan->forward(c,fct) : blueplan->forward_r(c,fct); } size_t length() const { return len; } }; // // multi-D infrastructure // template class ndarr { private: char *d; const char *cd; shape_t shp; stride_t str; public: ndarr(const void *data_, const shape_t &shape_, const stride_t &stride_) : d(nullptr), cd(reinterpret_cast(data_)), shp(shape_), str(stride_) {} ndarr(void *data_, const shape_t &shape_, const stride_t &stride_) : d(reinterpret_cast(data_)), cd(reinterpret_cast(data_)), shp(shape_), str(stride_) {} size_t ndim() const { return shp.size(); } size_t size() const { return util::prod(shp); } const shape_t &shape() const { return shp; } size_t shape(size_t i) const { return shp[i]; } const stride_t &stride() const { return str; } const ptrdiff_t &stride(size_t i) const { return str[i]; } T &operator[](ptrdiff_t ofs) { return *reinterpret_cast(d+ofs); } const T &operator[](ptrdiff_t ofs) const { return *reinterpret_cast(cd+ofs); } }; template class multi_iter { public: shape_t pos; const ndarr &iarr; ndarr &oarr; private: ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o; size_t idim, rem; void advance_i() { for (int i=pos.size()-1; i>=0; --i) { if (i==int(idim)) continue; p_ii += iarr.stride(i); p_oi += oarr.stride(i); if (++pos[i] < iarr.shape(i)) return; pos[i] = 0; p_ii -= iarr.shape(i)*iarr.stride(i); p_oi -= oarr.shape(i)*oarr.stride(i); } } public: multi_iter(const ndarr &iarr_, ndarr &oarr_, size_t idim_) : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0), str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)), idim(idim_), rem(iarr.size()/iarr.shape(idim)) { auto nshares = util::nthreads(); if (nshares==1) return; if (nshares==0) throw runtime_error("can't run with zero threads"); auto myshare = util::thread_num(); if (myshare>=nshares) throw runtime_error("impossible share requested"); size_t nbase = rem/nshares; size_t additional = rem%nshares; size_t lo = myshare*nbase + ((myshare struct VTYPE {}; template<> struct VTYPE { using type = long double __attribute__ ((vector_size (sizeof(long double)))); static constexpr int vlen=1; }; template<> struct VTYPE { using type = double __attribute__ ((vector_size (VBYTELEN))); static constexpr int vlen=VBYTELEN/sizeof(double); }; template<> struct VTYPE { using type = float __attribute__ ((vector_size (VBYTELEN))); static constexpr int vlen=VBYTELEN/sizeof(float); }; #else template struct VTYPE { static constexpr int vlen=1; }; #endif template arr alloc_tmp(const shape_t &shape, size_t axsize, size_t elemsize) { auto othersize = util::prod(shape)/axsize; auto tmpsize = axsize*((othersize>=VTYPE::vlen) ? VTYPE::vlen : 1); return arr(tmpsize*elemsize); } template arr alloc_tmp(const shape_t &shape, const shape_t &axes, size_t elemsize) { size_t fullsize=util::prod(shape); size_t tmpsize=0; for (size_t i=0; i=VTYPE::vlen) ? VTYPE::vlen : 1); if (sz>tmpsize) tmpsize=sz; } return arr(tmpsize*elemsize); } template NOINLINE void general_c( const ndarr> &in, ndarr> &out, const shape_t &axes, bool forward, T fct, size_t nthreads=1) { unique_ptr> plan; for (size_t iax=0; iax::vlen; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) plan.reset(new pocketfft_c(len)); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(cmplx)); multi_iter, cmplx> it(iax==0? in : out, out, axes[iax]); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; iforward (tdatav, fct) : plan->backward(tdatav, fct); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast *>(storage.data()); if (it.inplace() && it.contiguous_out()) // fully in-place forward ? plan->forward ((&it.out(0)), fct) : plan->backward((&it.out(0)), fct); else if (it.contiguous_out()) // compute FFT in output location { for (size_t i=0; iforward ((&it.out(0)), fct) : plan->backward((&it.out(0)), fct); } else { for (size_t i=0; iforward (tdata, fct) : plan->backward(tdata, fct); for (size_t i=0; i NOINLINE void general_hartley( const ndarr &in, ndarr &out, const shape_t &axes, T fct, size_t nthreads=1) { unique_ptr> plan; for (size_t iax=0; iax::vlen; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) plan.reset(new pocketfft_r(len)); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); multi_iter it(iax==0 ? in : out, out, axes[iax]); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdatav, fct); for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdata, fct); // Hartley order it.out(0) = tdata[0]; size_t i=1, i1=1, i2=len-1; for (i=1; i NOINLINE void general_r2c( const ndarr &in, ndarr> &out, size_t axis, T fct, size_t nthreads=1) { pocketfft_r plan(in.shape(axis)); constexpr int vlen = VTYPE::vlen; size_t len=in.shape(axis); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis)) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); multi_iter> it(in, out, axis); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; i NOINLINE void general_c2r( const ndarr> &in, ndarr &out, size_t axis, T fct, size_t nthreads=1) { pocketfft_r plan(out.shape(axis)); constexpr int vlen = VTYPE::vlen; size_t len=out.shape(axis); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis)) #endif { auto storage = alloc_tmp(out.shape(), len, sizeof(T)); multi_iter, T> it(in, out, axis); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); tdata[0]=it.in(0).r; { size_t i=1, ii=1; for (; i NOINLINE void general_r( const ndarr &in, ndarr &out, size_t axis, bool forward, T fct, size_t nthreads=1) { constexpr int vlen = VTYPE::vlen; size_t len=in.shape(axis); pocketfft_r plan(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis)) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); multi_iter it(in, out, axis); #if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); if (it.inplace() && it.contiguous_out()) // fully in-place forward ? plan.forward (&it.out(0), fct) : plan.backward(&it.out(0), fct); else if (it.contiguous_out()) // compute FFT in output location { for (size_t i=0; i void c2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const std::complex *data_in, std::complex *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); ndarr> ain(data_in, shape, stride_in), aout(data_out, shape, stride_out); general_c(ain, aout, axes, forward, fct, nthreads); } template void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, size_t axis, const T *data_in, std::complex *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape_in)==0) return; util::sanity_check(shape_in, stride_in, stride_out, false, axis); ndarr ain(data_in, shape_in, stride_in); shape_t shape_out(shape_in); shape_out[axis] = shape_in[axis]/2 + 1; ndarr> aout(data_out, shape_out, stride_out); general_r2c(ain, aout, axis, fct, nthreads); } template void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, std::complex *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape_in)==0) return; util::sanity_check(shape_in, stride_in, stride_out, false, axes); r2c(shape_in, stride_in, stride_out, axes.back(), data_in, data_out, fct, nthreads); if (axes.size()==1) return; shape_t shape_out(shape_in); shape_out[axes.back()] = shape_in[axes.back()]/2 + 1; auto newaxes = shape_t{axes.begin(), --axes.end()}; c2c(shape_out, stride_out, stride_out, newaxes, true, data_out, data_out, T(1), nthreads); } template void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, size_t axis, const std::complex *data_in, T *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape_out)==0) return; util::sanity_check(shape_out, stride_in, stride_out, false, axis); shape_t shape_in(shape_out); shape_in[axis] = shape_out[axis]/2 + 1; ndarr> ain(data_in, shape_in, stride_in); ndarr aout(data_out, shape_out, stride_out); general_c2r(ain, aout, axis, fct, nthreads); } template void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const std::complex *data_in, T *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape_out)==0) return; if (axes.size()==1) return c2r(shape_out, stride_in, stride_out, axes[0], data_in, data_out, fct, nthreads); util::sanity_check(shape_out, stride_in, stride_out, false, axes); auto shape_in = shape_out; shape_in[axes.back()] = shape_out[axes.back()]/2 + 1; auto nval = util::prod(shape_in); stride_t stride_inter(shape_in.size()); stride_inter.back() = sizeof(cmplx); for (int i=shape_in.size()-2; i>=0; --i) stride_inter[i] = stride_inter[i+1]*shape_in[i+1]; arr> tmp(nval); auto newaxes = shape_t({axes.begin(), --axes.end()}); c2c(shape_in, stride_in, stride_inter, newaxes, false, data_in, tmp.data(), T(1), nthreads); c2r(shape_out, stride_inter, stride_out, axes.back(), tmp.data(), data_out, fct, nthreads); } template void r2r_fftpack(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, size_t axis, bool forward, const T *data_in, T *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axis); ndarr ain(data_in, shape, stride_in), aout(data_out, shape, stride_out); general_r(ain, aout, axis, forward, fct, nthreads); } template void r2r_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, size_t nthreads=1) { using namespace detail; if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); ndarr ain(data_in, shape, stride_in), aout(data_out, shape, stride_out); general_hartley(ain, aout, axes, fct, nthreads); } } // namespace pocketfft #endif // POCKETFFT_HDRONLY_H