/* This file is part of pocketfft. Copyright (C) 2010-2019 Max-Planck-Society Copyright (C) 2019 Peter Bell For the odd-sized DCT-IV transforms: Copyright (C) 2003, 2007-14 Matteo Frigo Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology Authors: Martin Reinecke, Peter Bell All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef POCKETFFT_HDRONLY_H #define POCKETFFT_HDRONLY_H #ifndef __cplusplus #error This file is C++ and requires a C++ compiler. #endif #if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L) #error This file requires at least C++11 support. #endif #ifndef POCKETFFT_CACHE_SIZE #define POCKETFFT_CACHE_SIZE 16 #endif #include #include #include #include #include #include #include #if POCKETFFT_CACHE_SIZE!=0 #include #include #endif #ifdef POCKETFFT_OPENMP #include #endif #if defined(__GNUC__) #define POCKETFFT_NOINLINE __attribute__((noinline)) #define POCKETFFT_RESTRICT __restrict__ #elif defined(_MSC_VER) #define POCKETFFT_NOINLINE __declspec(noinline) #define POCKETFFT_RESTRICT __restrict #else #define POCKETFFT_NOINLINE #define POCKETFFT_RESTRICT #endif namespace pocketfft { namespace detail { using namespace std; using shape_t = vector; using stride_t = vector; constexpr bool FORWARD = true, BACKWARD = false; // only enable vector support for gcc>=5.0 and clang>=5.0 #ifndef POCKETFFT_NO_VECTORS #define POCKETFFT_NO_VECTORS #if defined(__INTEL_COMPILER) // do nothing. This is necessary because this compiler also sets __GNUC__. #elif defined(__clang__) #if __clang__>=5 #undef POCKETFFT_NO_VECTORS #endif #elif defined(__GNUC__) #if __GNUC__>=5 #undef POCKETFFT_NO_VECTORS #endif #endif #endif template struct VLEN { static constexpr size_t val=1; }; #ifndef POCKETFFT_NO_VECTORS #if (defined(__AVX512F__)) template<> struct VLEN { static constexpr size_t val=16; }; template<> struct VLEN { static constexpr size_t val=8; }; #elif (defined(__AVX__)) template<> struct VLEN { static constexpr size_t val=8; }; template<> struct VLEN { static constexpr size_t val=4; }; #elif (defined(__SSE2__)) template<> struct VLEN { static constexpr size_t val=4; }; template<> struct VLEN { static constexpr size_t val=2; }; #elif (defined(__VSX__)) template<> struct VLEN { static constexpr size_t val=4; }; template<> struct VLEN { static constexpr size_t val=2; }; #else #define POCKETFFT_NO_VECTORS #endif #endif template class arr { private: T *p; size_t sz; #if defined(POCKETFFT_NO_VECTORS) static T *ralloc(size_t num) { if (num==0) return nullptr; void *res = malloc(num*sizeof(T)); if (!res) throw bad_alloc(); return reinterpret_cast(res); } static void dealloc(T *ptr) { free(ptr); } #elif __cplusplus >= 201703L static T *ralloc(size_t num) { if (num==0) return nullptr; void *res = aligned_alloc(64,num*sizeof(T)); if (!res) throw bad_alloc(); return reinterpret_cast(res); } static void dealloc(T *ptr) { free(ptr); } #else // portable emulation static T *ralloc(size_t num) { if (num==0) return nullptr; void *ptr = malloc(num*sizeof(T)+64); if (!ptr) throw bad_alloc(); T *res = reinterpret_cast ((reinterpret_cast(ptr) & ~(size_t(63))) + 64); (reinterpret_cast(res))[-1] = ptr; return res; } static void dealloc(T *ptr) { if (ptr) free((reinterpret_cast(ptr))[-1]); } #endif public: arr() : p(0), sz(0) {} arr(size_t n) : p(ralloc(n)), sz(n) {} arr(arr &&other) : p(other.p), sz(other.sz) { other.p=nullptr; other.sz=0; } ~arr() { dealloc(p); } void resize(size_t n) { if (n==sz) return; dealloc(p); p = ralloc(n); sz = n; } T &operator[](size_t idx) { return p[idx]; } const T &operator[](size_t idx) const { return p[idx]; } T *data() { return p; } const T *data() const { return p; } size_t size() const { return sz; } }; template struct cmplx { T r, i; cmplx() {} cmplx(T r_, T i_) : r(r_), i(i_) {} void Set(T r_, T i_) { r=r_; i=i_; } void Set(T r_) { r=r_; i=T(0); } cmplx &operator+= (const cmplx &other) { r+=other.r; i+=other.i; return *this; } templatecmplx &operator*= (T2 other) { r*=other; i*=other; return *this; } templatecmplx &operator*= (const cmplx &other) { T tmp = r*other.r - i*other.i; i = r*other.i + i*other.r; r = tmp; return *this; } cmplx operator+ (const cmplx &other) const { return cmplx(r+other.r, i+other.i); } cmplx operator- (const cmplx &other) const { return cmplx(r-other.r, i-other.i); } template auto operator* (const T2 &other) const -> cmplx { return {r*other, i*other}; } template auto operator* (const cmplx &other) const -> cmplx { return {r*other.r-i*other.i, r*other.i + i*other.r}; } template auto special_mul (const cmplx &other) const -> cmplx { using Tres = cmplx; return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i) : Tres(r*other.r-i*other.i, r*other.i+i*other.r); } }; template void PMC(cmplx &a, cmplx &b, const cmplx &c, const cmplx &d) { a = c+d; b = c-d; } template cmplx conj(const cmplx &a) { return {a.r, -a.i}; } template void ROT90(cmplx &a) { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } template void ROTX90(cmplx &a) { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; } // // twiddle factor section // template class sincos_2pibyn { private: using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; arr data; void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res) { if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double { constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L); auto s = sin(pi*a_); res[1] = s; res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1); return; } // adapted from https://stackoverflow.com/questions/42792939/ // CAUTION: this function only works for arguments in the range // [-0.25; 0.25]! double a = double(a_); double s = a * a; /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */ double r = -1.0369917389758117e-4; r = fma (r, s, 1.9294935641298806e-3); r = fma (r, s, -2.5806887942825395e-2); r = fma (r, s, 2.3533063028328211e-1); r = fma (r, s, -1.3352627688538006e+0); r = fma (r, s, 4.0587121264167623e+0); r = fma (r, s, -4.9348022005446790e+0); double c = r*s; /* Approximate sin(pi*x) for x in [-0.25,0.25] */ r = 4.6151442520157035e-4; r = fma (r, s, -7.3700183130883555e-3); r = fma (r, s, 8.2145868949323936e-2); r = fma (r, s, -5.9926452893214921e-1); r = fma (r, s, 2.5501640398732688e+0); r = fma (r, s, -5.1677127800499516e+0); s = s * a; r = r * s; s = fma (a, 3.1415926535897931e+0, r); res[0] = c; res[1] = s; } POCKETFFT_NOINLINE void calc_first_octant(size_t den, T * POCKETFFT_RESTRICT res) { size_t n = (den+4)>>3; if (n==0) return; res[0]=1.; res[1]=0.; if (n==1) return; size_t l1 = size_t(sqrt(n)); arr tmp(2*l1); for (size_t i=1; in) end = n-start; for (size_t i=1; i>2; size_t i=0, idx1=0, idx2=2*ndone-2; for (; i+1>1; T * p = res+n-1; calc_first_octant(n<<2, p); int i4=0, in=int(n), i=0; for (; i4<=in-i4; ++i, i4+=4) // octant 0 { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; } for (; i4-in <= 0; ++i, i4+=4) // octant 1 { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; } for (; i4<=3*in-i4; ++i, i4+=4) // octant 2 { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; } for (; i>2; if ((n&7)==0) res[quart] = res[quart+1] = hsqt2; for (size_t i=2, j=2*quart-2; i>1; if ((n&3)==0) for (size_t i=0; i *cdata() const { return reinterpret_cast *>(data.data()); } }; struct util // hack to avoid duplicate symbols { static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n) { size_t res=1; while ((n&1)==0) { res=2; n>>=1; } for (size_t x=3; x*x<=n; x+=2) while ((n%x)==0) { res=x; n/=x; } if (n>1) res=n; return res; } static POCKETFFT_NOINLINE double cost_guess (size_t n) { constexpr double lfp=1.1; // penalty for non-hardcoded larger factors size_t ni=n; double result=0.; while ((n&1)==0) { result+=2; n>>=1; } for (size_t x=3; x*x<=n; x+=2) while ((n%x)==0) { result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors n/=x; } if (n>1) result+=(n<=5) ? double(n) : lfp*double(n); return result*double(ni); } /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ static POCKETFFT_NOINLINE size_t good_size(size_t n) { if (n<=12) return n; size_t bestfac=2*n; for (size_t f2=1; f2=n) bestfac=f235711; return bestfac; } static size_t prod(const shape_t &shape) { size_t res=1; for (auto sz: shape) res*=sz; return res; } static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace) { auto ndim = shape.size(); if (ndim<1) throw runtime_error("ndim must be >= 1"); if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim)) throw runtime_error("stride dimension mismatch"); if (inplace && (stride_in!=stride_out)) throw runtime_error("stride mismatch"); } static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, const shape_t &axes) { sanity_check(shape, stride_in, stride_out, inplace); auto ndim = shape.size(); shape_t tmp(ndim,0); for (auto ax : axes) { if (ax>=ndim) throw invalid_argument("bad axis number"); if (++tmp[ax]>1) throw invalid_argument("axis specified repeatedly"); } } static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, size_t axis) { sanity_check(shape, stride_in, stride_out, inplace); if (axis>=shape.size()) throw invalid_argument("bad axis number"); } #ifdef POCKETFFT_OPENMP static size_t nthreads() { return size_t(omp_get_num_threads()); } static size_t thread_num() { return size_t(omp_get_thread_num()); } static size_t thread_count (size_t nthreads, const shape_t &shape, size_t axis) { if (nthreads==1) return 1; if (prod(shape) < 20*shape[axis]) return 1; return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads; } #else static constexpr size_t nthreads() { return 1; } static constexpr size_t thread_num() { return 0; } #endif }; // // complex FFTPACK transforms // template class cfftp { private: struct fctdata { size_t fct; cmplx *tw, *tws; }; size_t length; arr> mem; vector fact; void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } template void pass2 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=2; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k(WA(0,i)); } } } #define POCKETFFT_PREP3(idx) \ T t0 = CC(idx,0,k), t1, t2; \ PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ { \ T ca,cb; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ { \ T ca,cb,da,db; \ ca=t0+t1*twr; \ cb=t2*twi; ROT90(cb); \ PMC(da,db,ca,cb); \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass3 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=3; constexpr T0 tw1r=-0.5, tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L); auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k void pass4 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=4; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k(t4); PMC(CH(0,k,0),CH(0,k,2),t2,t3); PMC(CH(0,k,1),CH(0,k,3),t1,t4); } else for (size_t k=0; k(t4); PMC(CH(0,k,0),CH(0,k,2),t2,t3); PMC(CH(0,k,1),CH(0,k,3),t1,t4); } for (size_t i=1; i(t4); PMC(CH(i,k,0),c3,t2,t3); PMC(c2,c4,t1,t4); CH(i,k,1) = c2.template special_mul(WA(0,i)); CH(i,k,2) = c3.template special_mul(WA(1,i)); CH(i,k,3) = c4.template special_mul(WA(2,i)); } } } #define POCKETFFT_PREP5(idx) \ T t0 = CC(idx,0,k), t1, t2, t3, t4; \ PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ { \ T ca,cb,da,db; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PMC(da,db,ca,cb); \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass5 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=5; constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L), tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L), tw2r= T0(-0.8090169943749474241022934171828191L), tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L); auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass7(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=7; constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L), tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L), tw2r= T0(-0.2225209339563144042889025644967948L), tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L), tw3r= T0(-0.9009688679024191262361023195074451L), tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L); auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k void ROTX45(T &a) { constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); if (fwd) { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); } else { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } } template void ROTX135(T &a) { constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); if (fwd) { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); } else { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } } template inline void PMINPLACE(T &a, T &b) { T t = a; a.r+=b.r; a.i+=b.i; b.r=t.r-b.r; b.i=t.i-b.i; } template void pass8 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=8; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k(a6); ROTX90(a7); PMINPLACE(a0,a2); PMINPLACE(a1,a3); PMINPLACE(a4,a6); PMINPLACE(a5,a7); ROTX45(a5); ROTX90(a3); ROTX135(a7); PMC(CH(0,k,0),CH(0,k,4),a0,a1); PMC(CH(0,k,1),CH(0,k,5),a4,a5); PMC(CH(0,k,2),CH(0,k,6),a2,a3); PMC(CH(0,k,3),CH(0,k,7),a6,a7); } else for (size_t k=0; k(a6); ROTX90(a7); PMINPLACE(a0,a2); PMINPLACE(a1,a3); PMINPLACE(a4,a6); PMINPLACE(a5,a7); ROTX45(a5); ROTX90(a3); ROTX135(a7); PMC(CH(0,k,0),CH(0,k,4),a0,a1); PMC(CH(0,k,1),CH(0,k,5),a4,a5); PMC(CH(0,k,2),CH(0,k,6),a2,a3); PMC(CH(0,k,3),CH(0,k,7),a6,a7); for (size_t i=1; i(a6); ROTX90(a7); PMINPLACE(a0,a2); PMINPLACE(a1,a3); PMINPLACE(a4,a6); PMINPLACE(a5,a7); ROTX45(a5); ROTX90(a3); ROTX135(a7); PMINPLACE(a0,a1); PMINPLACE(a2,a3); PMINPLACE(a4,a5); PMINPLACE(a6,a7); CH(i,k,0) = a0; CH(i,k,1) = a4.template special_mul(WA(0,i)); CH(i,k,2) = a2.template special_mul(WA(1,i)); CH(i,k,3) = a6.template special_mul(WA(2,i)); CH(i,k,4) = a1.template special_mul(WA(3,i)); CH(i,k,5) = a5.template special_mul(WA(4,i)); CH(i,k,6) = a3.template special_mul(WA(5,i)); CH(i,k,7) = a7.template special_mul(WA(6,i)); } } } #define POCKETFFT_PREP11(idx) \ T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ { \ T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ cb; \ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ PMC(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ { \ T da,db; \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ CH(i,k,u1) = da.template special_mul(WA(u1-1,i)); \ CH(i,k,u2) = db.template special_mul(WA(u2-1,i)); \ } template void pass11 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=11; constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L), tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L), tw2r= T0(0.4154150130018864255292741492296232L), tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L), tw3r= T0(-0.1423148382732851404437926686163697L), tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L), tw4r= T0(-0.6548607339452850640569250724662936L), tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L), tw5r= T0(-0.9594929736144973898903680570663277L), tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L); auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto WA = [wa, ido](size_t x, size_t i) { return wa[i-1+x*(ido-1)]; }; if (ido==1) for (size_t k=0; k void passg (size_t ido, size_t ip, size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const cmplx * POCKETFFT_RESTRICT wa, const cmplx * POCKETFFT_RESTRICT csarr) { const size_t cdim=ip; size_t ipph = (ip+1)/2; size_t idl1 = ido*l1; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T& { return cc[a+ido*(b+l1*c)]; }; auto CX2 = [cc, idl1](size_t a, size_t b) -> T& { return cc[a+idl1*b]; }; auto CH2 = [ch, idl1](size_t a, size_t b) -> const T& { return ch[a+idl1*b]; }; arr> wal(ip); wal[0] = cmplx(1., 0.); for (size_t i=1; i(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i); for (size_t k=0; kip) iwal-=ip; cmplx xwal=wal[iwal]; iwal+=l; if (iwal>ip) iwal-=ip; cmplx xwal2=wal[iwal]; for (size_t ik=0; ikip) iwal-=ip; cmplx xwal=wal[iwal]; for (size_t ik=0; ik(wa[idij]); idij=(jc-1)*(ido-1)+i-1; CX(i,k,jc) = x2.template special_mul(wa[idij]); } } } } template void pass_all(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t l1=1; arr ch(length); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1 (ido, l1, p1, p2, fact[k1].tw); else if(ip==8) pass8(ido, l1, p1, p2, fact[k1].tw); else if(ip==2) pass2(ido, l1, p1, p2, fact[k1].tw); else if(ip==3) pass3 (ido, l1, p1, p2, fact[k1].tw); else if(ip==5) pass5 (ido, l1, p1, p2, fact[k1].tw); else if(ip==7) pass7 (ido, l1, p1, p2, fact[k1].tw); else if(ip==11) pass11 (ido, l1, p1, p2, fact[k1].tw); else { passg(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws); swap(p1,p2); } swap(p1,p2); l1=l2; } if (p1!=c) { if (fct!=1.) for (size_t i=0; i void forward(T c[], T0 fct) { pass_all(c, fct); } template void backward(T c[], T0 fct) { pass_all(c, fct); } private: POCKETFFT_NOINLINE void factorize() { size_t len=length; while ((len&7)==0) { add_factor(8); len>>=3; } while ((len&3)==0) { add_factor(4); len>>=2; } if ((len&1)==0) { len>>=1; // factor 2 should be at the front of the factor list add_factor(2); swap(fact[0].fct, fact.back().fct); } for (size_t divisor=3; divisor*divisor<=len; divisor+=2) while ((len%divisor)==0) { add_factor(divisor); len/=divisor; } if (len>1) add_factor(len); } size_t twsize() const { size_t twsize=0, l1=1; for (size_t k=0; k11) twsize+=ip; l1*=ip; } return twsize; } void comp_twiddle() { sincos_2pibyn twid(length, false); auto twiddle = twid.cdata(); size_t l1=1; size_t memofs=0; for (size_t k=0; k11) { fact[k].tws=mem.data()+memofs; memofs+=ip; for (size_t j=0; j class rfftp { private: struct fctdata { size_t fct; T0 *tw, *tws; }; size_t length; arr mem; vector fact; void add_factor(size_t factor) { fact.push_back({factor, nullptr, nullptr}); } template inline void PM(T &a, T &b, T c, T d) { a=c+d; b=c-d; } /* (a+ib) = conj(c+id) * (e+if) */ template inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) { a=c*e+d*f; b=c*f-d*e; } template void radf2 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=2; auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+cdim*c)]; }; for (size_t k=0; k void radf3(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+cdim*c)]; }; for (size_t k=0; k void radf4(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=4; constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+cdim*c)]; }; for (size_t k=0; k void radf5(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), ti11= T0(0.9510565162951535721164393333793821L), tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,ido,cdim](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+cdim*c)]; }; for (size_t k=0; k void radfg(size_t ido, size_t ip, size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) { const size_t cdim=ip; size_t ipph=(ip+1)/2; size_t idl1 = ido*l1; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T& { return cc[a+ido*(b+cdim*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T& { return ch[a+ido*(b+l1*c)]; }; auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T& { return cc[a+ido*(b+l1*c)]; }; auto C2 = [cc,idl1] (size_t a, size_t b) -> T& { return cc[a+idl1*b]; }; auto CH2 = [ch,idl1] (size_t a, size_t b) -> T& { return ch[a+idl1*b]; }; if (ido>1) { for (size_t j=1, jc=ip-1; j=ip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; T0 ar=csarr[2*iang], ai=csarr[2*iang+1]; for (size_t ik=0; ik void radb2(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=2; auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k void radb3(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k void radb4(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=4; constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k void radb5(size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa) { constexpr size_t cdim=5; constexpr T0 tr11= T0(0.3090169943749474241022934171828191L), ti11= T0(0.9510565162951535721164393333793821L), tr12= T0(-0.8090169943749474241022934171828191L), ti12= T0(0.5877852522924731291687059546390728L); auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k void radbg(size_t ido, size_t ip, size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) { const size_t cdim=ip; size_t ipph=(ip+1)/ 2; size_t idl1 = ido*l1; auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+cdim*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+l1*c)]; }; auto C2 = [cc,idl1](size_t a, size_t b) -> T& { return cc[a+idl1*b]; }; auto CH2 = [ch,idl1](size_t a, size_t b) -> T& { return ch[a+idl1*b]; }; for (size_t k=0; kip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; T0 war=csarr[2*iang], wai=csarr[2*iang+1]; for (size_t ik=0; ik void copy_and_norm(T *c, T *p1, size_t n, T0 fct) { if (p1!=c) { if (fct!=1.) for (size_t i=0; i void forward(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=n, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1 void backward(T c[], T0 fct) { if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=1, nf=fact.size(); arr ch(n); T *p1=c, *p2=ch.data(); for(size_t k=0; k>=2; } if ((len%2)==0) { len>>=1; // factor 2 should be at the front of the factor list add_factor(2); swap(fact[0].fct, fact.back().fct); } for (size_t divisor=3; divisor*divisor<=len; divisor+=2) while ((len%divisor)==0) { add_factor(divisor); len/=divisor; } if (len>1) add_factor(len); } size_t twsize() const { size_t twsz=0, l1=1; for (size_t k=0; k5) twsz+=2*ip; l1*=ip; } return twsz; } void comp_twiddle() { sincos_2pibyn twid(length, true); size_t l1=1; T0 *ptr=mem.data(); for (size_t k=0; k5) // special factors required by *g functions { fact[k].tws=ptr; ptr+=2*ip; fact[k].tws[0] = 1.; fact[k].tws[1] = 0.; for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2) { fact[k].tws[i ] = twid[i*(length/ip)]; fact[k].tws[i+1] = twid[i*(length/ip)+1]; fact[k].tws[ic] = twid[i*(length/ip)]; fact[k].tws[ic+1] = -twid[i*(length/ip)+1]; } } l1*=ip; } } public: POCKETFFT_NOINLINE rfftp(size_t length_) : length(length_) { if (length==0) throw runtime_error("zero-length FFT requested"); if (length==1) return; factorize(); mem.resize(twsize()); comp_twiddle(); } }; // // complex Bluestein transforms // template class fftblue { private: size_t n, n2; cfftp plan; arr> mem; cmplx *bk, *bkf; template void fft(cmplx c[], T0 fct) { arr> akf(n2); /* initialize a_k and FFT it */ for (size_t m=0; m(bk[m]); auto zero = akf[0]*T0(0); for (size_t m=n; m(bkf[m]); /* inverse FFT */ plan.backward (akf.data(),1.); /* multiply by b_k */ for (size_t m=0; m(bk[m])*fct; } public: POCKETFFT_NOINLINE fftblue(size_t length) : n(length), n2(util::good_size(n*2-1)), plan(n2), mem(n+n2), bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ sincos_2pibyn tmp_(2*n, false); auto tmp = tmp_.cdata(); bk[0].Set(1, 0); size_t coeff=0; for (size_t m=1; m=2*n) coeff-=2*n; bk[m] = tmp[coeff]; } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ T0 xn2 = T0(1)/T0(n2); bkf[0] = bk[0]*xn2; for (size_t m=1; m void backward(cmplx c[], T0 fct) { fft(c,fct); } template void forward(cmplx c[], T0 fct) { fft(c,fct); } template void backward_r(T c[], T0 fct) { arr> tmp(n); tmp[0].Set(c[0],c[0]*0); memcpy (reinterpret_cast(tmp.data()+1), reinterpret_cast(c+1), (n-1)*sizeof(T)); if ((n&1)==0) tmp[n/2].i=T0(0)*c[0]; for (size_t m=1; 2*m(tmp.data(),fct); for (size_t m=0; m void forward_r(T c[], T0 fct) { arr> tmp(n); auto zero = T0(0)*c[0]; for (size_t m=0; m(tmp.data(),fct); c[0] = tmp[0].r; memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); } }; // // flexible (FFTPACK/Bluestein) complex 1D transform // template class pocketfft_c { private: unique_ptr> packplan; unique_ptr> blueplan; size_t len; public: POCKETFFT_NOINLINE pocketfft_c(size_t length) : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length); if (tmp*tmp <= length) { packplan=unique_ptr>(new cfftp(length)); return; } double comp1 = util::cost_guess(length); double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2>(new fftblue(length)); else packplan=unique_ptr>(new cfftp(length)); } template POCKETFFT_NOINLINE void backward(cmplx c[], T0 fct) const { packplan ? packplan->backward(c,fct) : blueplan->backward(c,fct); } template POCKETFFT_NOINLINE void forward(cmplx c[], T0 fct) const { packplan ? packplan->forward(c,fct) : blueplan->forward(c,fct); } size_t length() const { return len; } }; // // flexible (FFTPACK/Bluestein) real-valued 1D transform // template class pocketfft_r { private: unique_ptr> packplan; unique_ptr> blueplan; size_t len; public: POCKETFFT_NOINLINE pocketfft_r(size_t length) : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length); if (tmp*tmp <= length) { packplan=unique_ptr>(new rfftp(length)); return; } double comp1 = 0.5*util::cost_guess(length); double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2>(new fftblue(length)); else packplan=unique_ptr>(new rfftp(length)); } template POCKETFFT_NOINLINE void backward(T c[], T0 fct) const { packplan ? packplan->backward(c,fct) : blueplan->backward_r(c,fct); } template POCKETFFT_NOINLINE void forward(T c[], T0 fct) const { packplan ? packplan->forward(c,fct) : blueplan->forward_r(c,fct); } size_t length() const { return len; } }; // // sine/cosine transforms // template class T_dct1 { private: pocketfft_r fftplan; public: POCKETFFT_NOINLINE T_dct1(size_t length) : fftplan(2*(length-1)) {} template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int /*type*/, bool /*cosine*/) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=fftplan.length(), n=N/2+1; if (ortho) { c[0]*=sqrt2; c[n-1]*=sqrt2; } arr tmp(N); tmp[0] = c[0]; for (size_t i=1; i class T_dst1 { private: pocketfft_r fftplan; public: POCKETFFT_NOINLINE T_dst1(size_t length) : fftplan(2*(length+1)) {} template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/, int /*type*/, bool /*cosine*/) const { size_t N=fftplan.length(), n=N/2-1; arr tmp(N); tmp[0] = tmp[n+1] = c[0]*0; for (size_t i=0; i class T_dcst23 { private: pocketfft_r fftplan; vector twiddle; template POCKETFFT_NOINLINE void exec2c(T c[], T0 fct, bool ortho) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); size_t NS2 = (N+1)/2; for (size_t i=2; i POCKETFFT_NOINLINE void exec3c(T c[], T0 fct, bool ortho) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); if (ortho) c[0]*=sqrt2; size_t NS2 = (N+1)/2; for (size_t k=1, kc=N-1; k POCKETFFT_NOINLINE void exec2s(T c[], T0 fct, bool ortho) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); for (size_t k=1; k POCKETFFT_NOINLINE void exec3s(T c[], T0 fct, bool ortho) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); if (ortho) c[0]*=sqrt2; size_t NS2 = N/2; for (size_t k=0, kc=N-1; k POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int type, bool cosine) const { if (type==2) cosine ? exec2c(c, fct, ortho) : exec2s(c, fct, ortho); else cosine ? exec3c(c, fct, ortho) : exec3s(c, fct, ortho); } size_t length() const { return fftplan.length(); } }; template class T_dcst4 { // even length algorithm from // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ private: size_t N; unique_ptr> fft; unique_ptr> rfft; arr> C2; template POCKETFFT_NOINLINE void exec4c(T c[], T0 fct) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); if (N&1) { // The following code is derived from the FFTW3 function apply_re11() // and is released under the 3-clause BSD license with friendly // permission of Matteo Frigo. auto SGN_SET = [](T x, size_t i) {return (i%2) ? -x : x;}; arr y(N); size_t n2 = N/2; { size_t i=0, m=n2; for (; mforward(y.data(), fct); { size_t i=0; for (; i+i+1> y(N/2); for(size_t i=0; iforward(y.data(), fct); for(size_t i=0; i POCKETFFT_NOINLINE void exec4s(T c[], T0 fct) const { size_t N=length(); size_t NS2 = N/2; for (size_t k=0, kc=N-1; k(N/2)), rfft((N&1)? new pocketfft_r(N) : nullptr), C2((N&1) ? 0 : N/2) { constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); if ((N&1)==0) for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/, int /*type*/, bool cosine) const { cosine ? exec4c(c, fct) : exec4s(c, fct); } size_t length() const { return N; } }; // // multi-D infrastructure // template shared_ptr get_plan(size_t length) { #if POCKETFFT_CACHE_SIZE==0 return make_shared(length); #else constexpr size_t nmax=POCKETFFT_CACHE_SIZE; static array, nmax> cache; static array last_access{{0}}; static size_t access_counter = 0; static mutex mut; auto find_in_cache = [&]() -> shared_ptr { for (size_t i=0; ilength()==length)) { // no need to update if this is already the most recent entry if (last_access[i]!=access_counter) { last_access[i] = ++access_counter; // Guard against overflow if (access_counter == 0) last_access.fill(0); } return cache[i]; } return nullptr; }; { lock_guard lock(mut); auto p = find_in_cache(); if (p) return p; } auto plan = make_shared(length); { lock_guard lock(mut); auto p = find_in_cache(); if (p) return p; size_t lru = 0; for (size_t i=1; i class cndarr: public arr_info { protected: const char *d; public: cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_) : arr_info(shape_, stride_), d(reinterpret_cast(data_)) {} const T &operator[](ptrdiff_t ofs) const { return *reinterpret_cast(d+ofs); } }; template class ndarr: public cndarr { public: ndarr(void *data_, const shape_t &shape_, const stride_t &stride_) : cndarr::cndarr(const_cast(data_), shape_, stride_) {} T &operator[](ptrdiff_t ofs) { return *reinterpret_cast(const_cast(cndarr::d+ofs)); } }; template class multi_iter { private: shape_t pos; const arr_info &iarr, &oarr; ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o; size_t idim, rem; void advance_i() { for (int i_=int(pos.size())-1; i_>=0; --i_) { auto i = size_t(i_); if (i==idim) continue; p_ii += iarr.stride(i); p_oi += oarr.stride(i); if (++pos[i] < iarr.shape(i)) return; pos[i] = 0; p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i); p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i); } } public: multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_) : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0), str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)), idim(idim_), rem(iarr.size()/iarr.shape(idim)) { auto nshares = util::nthreads(); if (nshares==1) return; if (nshares==0) throw runtime_error("can't run with zero threads"); auto myshare = util::thread_num(); if (myshare>=nshares) throw runtime_error("impossible share requested"); size_t nbase = rem/nshares; size_t additional = rem%nshares; size_t lo = myshare*nbase + ((myshare=0; --i_) { auto i = size_t(i_); p += arr.stride(i); if (++pos[i] < arr.shape(i)) return; pos[i] = 0; p -= ptrdiff_t(arr.shape(i))*arr.stride(i); } } ptrdiff_t ofs() const { return p; } size_t remaining() const { return rem; } }; class rev_iter { private: shape_t pos; const arr_info &arr; vector rev_axis; vector rev_jump; size_t last_axis, last_size; shape_t shp; ptrdiff_t p, rp; size_t rem; public: rev_iter(const arr_info &arr_, const shape_t &axes) : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0), rev_jump(arr_.ndim(), 1), p(0), rp(0) { for (auto ax: axes) rev_axis[ax]=1; last_axis = axes.back(); last_size = arr.shape(last_axis)/2 + 1; shp = arr.shape(); shp[last_axis] = last_size; rem=1; for (auto i: shp) rem *= i; } void advance() { --rem; for (int i_=int(pos.size())-1; i_>=0; --i_) { auto i = size_t(i_); p += arr.stride(i); if (!rev_axis[i]) rp += arr.stride(i); else { rp -= arr.stride(i); if (rev_jump[i]) { rp += ptrdiff_t(arr.shape(i))*arr.stride(i); rev_jump[i] = 0; } } if (++pos[i] < shp[i]) return; pos[i] = 0; p -= ptrdiff_t(shp[i])*arr.stride(i); if (rev_axis[i]) { rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i); rev_jump[i] = 1; } else rp -= ptrdiff_t(shp[i])*arr.stride(i); } } ptrdiff_t ofs() const { return p; } ptrdiff_t rev_ofs() const { return rp; } size_t remaining() const { return rem; } }; #ifndef POCKETFFT_NO_VECTORS template struct VTYPE {}; template<> struct VTYPE { using type = float __attribute__ ((vector_size (VLEN::val*sizeof(float)))); }; template<> struct VTYPE { using type = double __attribute__ ((vector_size (VLEN::val*sizeof(double)))); }; template<> struct VTYPE { using type = long double __attribute__ ((vector_size (VLEN::val*sizeof(long double)))); }; #endif template arr alloc_tmp(const shape_t &shape, size_t axsize, size_t elemsize) { auto othersize = util::prod(shape)/axsize; auto tmpsize = axsize*((othersize>=VLEN::val) ? VLEN::val : 1); return arr(tmpsize*elemsize); } template arr alloc_tmp(const shape_t &shape, const shape_t &axes, size_t elemsize) { size_t fullsize=util::prod(shape); size_t tmpsize=0; for (size_t i=0; i=VLEN::val) ? VLEN::val : 1); if (sz>tmpsize) tmpsize=sz; } return arr(tmpsize*elemsize); } #ifdef POCKETFFT_OPENMP #define POCKETFFT_NTHREADS nthreads #else #define POCKETFFT_NTHREADS #endif template POCKETFFT_NOINLINE void general_c( const cndarr> &in, ndarr> &out, const shape_t &axes, bool forward, T fct, size_t POCKETFFT_NTHREADS) { shared_ptr> plan; for (size_t iax=0; iax::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) plan = get_plan>(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(cmplx)); const auto &tin(iax==0? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); for (size_t i=0; iforward (tdatav, fct) : plan->backward(tdatav, fct); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast *>(storage.data()); if ((&tin[0]==&out[0]) && (it.stride_out()==sizeof(cmplx))) // fully in-place forward ? plan->forward (&out[it.oofs(0)], fct) : plan->backward(&out[it.oofs(0)], fct); else if (it.stride_out()==sizeof(cmplx)) // compute FFT in output location { for (size_t i=0; iforward (&out[it.oofs(0)], fct) : plan->backward(&out[it.oofs(0)], fct); } else { for (size_t i=0; iforward (tdata, fct) : plan->backward(tdata, fct); for (size_t i=0; i POCKETFFT_NOINLINE void general_hartley( const cndarr &in, ndarr &out, const shape_t &axes, T fct, size_t POCKETFFT_NTHREADS) { shared_ptr> plan; for (size_t iax=0; iax::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) plan = get_plan>(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); const auto &tin(iax==0 ? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdatav, fct); for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdata, fct); // Hartley order out[it.oofs(0)] = tdata[0]; size_t i=1, i1=1, i2=len-1; for (i=1; i POCKETFFT_NOINLINE void general_dcst( const cndarr &in, ndarr &out, const shape_t &axes, T fct, bool ortho, int type, bool cosine, size_t POCKETFFT_NTHREADS) { shared_ptr plan; for (size_t iax=0; iax::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) plan = get_plan(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); const auto &tin(iax==0 ? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; iexec(tdatav, fct, ortho, type, cosine); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); if ((&tin[0]==&out[0]) && (it.stride_out()==sizeof(T))) // fully in-place plan->exec(&out[it.oofs(0)], fct, ortho, type, cosine); else if (it.stride_out()==sizeof(T)) // compute FFT in output location { for (size_t i=0; iexec(&out[it.oofs(0)], fct, ortho, type, cosine); } else { for (size_t i=0; iexec(tdata, fct, ortho, type, cosine); for (size_t i=0; i POCKETFFT_NOINLINE void general_r2c( const cndarr &in, ndarr> &out, size_t axis, bool forward, T fct, size_t POCKETFFT_NTHREADS) { auto plan = get_plan>(in.shape(axis)); constexpr auto vlen = VLEN::val; size_t len=in.shape(axis); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis)) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); multi_iter it(in, out, axis); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdatav, fct); for (size_t j=0; j0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); for (size_t i=0; iforward(tdata, fct); out[it.oofs(0)].Set(tdata[0]); size_t i=1, ii=1; if (forward) for (; i POCKETFFT_NOINLINE void general_c2r( const cndarr> &in, ndarr &out, size_t axis, bool forward, T fct, size_t POCKETFFT_NTHREADS) { auto plan = get_plan>(out.shape(axis)); constexpr auto vlen = VLEN::val; size_t len=out.shape(axis); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axis)) #endif { auto storage = alloc_tmp(out.shape(), len, sizeof(T)); multi_iter it(in, out, axis); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t j=0; jbackward(tdatav, fct); for (size_t i=0; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); tdata[0]=in[it.iofs(0)].r; { size_t i=1, ii=1; if (forward) for (; ibackward(tdata, fct); for (size_t i=0; i POCKETFFT_NOINLINE void general_r( const cndarr &in, ndarr &out, const shape_t &axes, bool r2c, bool forward, T fct, size_t POCKETFFT_NTHREADS) { shared_ptr> plan; for (size_t iax=0; iax::val; size_t len=in.shape(axes[iax]); if ((!plan) || (len!=plan->length())) plan = get_plan>(len); #ifdef POCKETFFT_OPENMP #pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) #endif { auto storage = alloc_tmp(in.shape(), len, sizeof(T)); const auto &tin(iax==0 ? in : out); multi_iter it(tin, out, axes[iax]); #ifndef POCKETFFT_NO_VECTORS if (vlen>1) while (it.remaining()>=vlen) { using vtype = typename VTYPE::type; it.advance(vlen); auto tdatav = reinterpret_cast(storage.data()); for (size_t i=0; iforward (tdatav, fct) : plan->backward(tdatav, fct); if (r2c && (!forward)) for (size_t i=2; i0) { it.advance(1); auto tdata = reinterpret_cast(storage.data()); if ((&tin[0]==&out[0]) && (it.stride_out()==sizeof(T))) // fully in-place { if ((!r2c) && forward) for (size_t i=2; iforward (&out[it.oofs(0)], fct) : plan->backward(&out[it.oofs(0)], fct); if (r2c && (!forward)) for (size_t i=2; iforward (&out[it.oofs(0)], fct) : plan->backward(&out[it.oofs(0)], fct); if (r2c && (!forward)) for (size_t i=2; iforward (tdata, fct) : plan->backward(tdata, fct); if (r2c && (!forward)) for (size_t i=2; i void c2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const complex *data_in, complex *data_out, T fct, size_t nthreads=1) { if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr> ain(data_in, shape, stride_in); ndarr> aout(data_out, shape, stride_out); general_c(ain, aout, axes, forward, fct, nthreads); } template void dct(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1) { if ((type<1) || (type>4)) throw invalid_argument("invalid DCT type"); if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); if (type==1) general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else if (type==4) general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); } template void dst(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1) { if ((type<1) || (type>4)) throw invalid_argument("invalid DST type"); if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); if (type==1) general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else if (type==4) general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); } template void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, size_t axis, bool forward, const T *data_in, complex *data_out, T fct, size_t nthreads=1) { if (util::prod(shape_in)==0) return; util::sanity_check(shape_in, stride_in, stride_out, false, axis); cndarr ain(data_in, shape_in, stride_in); shape_t shape_out(shape_in); shape_out[axis] = shape_in[axis]/2 + 1; ndarr> aout(data_out, shape_out, stride_out); general_r2c(ain, aout, axis, forward, fct, nthreads); } template void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const T *data_in, complex *data_out, T fct, size_t nthreads=1) { if (util::prod(shape_in)==0) return; util::sanity_check(shape_in, stride_in, stride_out, false, axes); r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out, fct, nthreads); if (axes.size()==1) return; shape_t shape_out(shape_in); shape_out[axes.back()] = shape_in[axes.back()]/2 + 1; auto newaxes = shape_t{axes.begin(), --axes.end()}; c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out, T(1), nthreads); } template void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, size_t axis, bool forward, const complex *data_in, T *data_out, T fct, size_t nthreads=1) { if (util::prod(shape_out)==0) return; util::sanity_check(shape_out, stride_in, stride_out, false, axis); shape_t shape_in(shape_out); shape_in[axis] = shape_out[axis]/2 + 1; cndarr> ain(data_in, shape_in, stride_in); ndarr aout(data_out, shape_out, stride_out); general_c2r(ain, aout, axis, forward, fct, nthreads); } template void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const complex *data_in, T *data_out, T fct, size_t nthreads=1) { if (util::prod(shape_out)==0) return; if (axes.size()==1) return c2r(shape_out, stride_in, stride_out, axes[0], forward, data_in, data_out, fct, nthreads); util::sanity_check(shape_out, stride_in, stride_out, false, axes); auto shape_in = shape_out; shape_in[axes.back()] = shape_out[axes.back()]/2 + 1; auto nval = util::prod(shape_in); stride_t stride_inter(shape_in.size()); stride_inter.back() = sizeof(cmplx); for (int i=int(shape_in.size())-2; i>=0; --i) stride_inter[size_t(i)] = stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]); arr> tmp(nval); auto newaxes = shape_t({axes.begin(), --axes.end()}); c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(), T(1), nthreads); c2r(shape_out, stride_inter, stride_out, axes.back(), forward, tmp.data(), data_out, fct, nthreads); } template void r2r_fftpack(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct, size_t nthreads=1) { if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); general_r(ain, aout, axes, real2hermitian, forward, fct, nthreads); } template void r2r_separable_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, size_t nthreads=1) { if (util::prod(shape)==0) return; util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); general_hartley(ain, aout, axes, fct, nthreads); } } // namespace detail using detail::FORWARD; using detail::BACKWARD; using detail::shape_t; using detail::stride_t; using detail::c2c; using detail::c2r; using detail::r2c; using detail::r2r_fftpack; using detail::r2r_separable_hartley; using detail::dct; using detail::dst; } // namespace pocketfft #undef POCKETFFT_NOINLINE #undef POCKETFFT_RESTRICT #endif // POCKETFFT_HDRONLY_H