From 2fd77f2063f76b4aa88adb2582861b8aeb482021 Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Wed, 8 May 2019 21:17:29 +0200 Subject: [PATCH] synchronnize --- pocketfft_hdronly.h | 371 +++++++++++++++++++++++--------------------- pypocketfft.cc | 12 +- 2 files changed, 199 insertions(+), 184 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 1b4af74..03817a0 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -20,6 +20,9 @@ #include <stdexcept> #include <memory> #include <vector> +#if defined(_WIN32) +#include <malloc.h> +#endif #ifdef __GNUC__ #define NOINLINE __attribute__((noinline)) @@ -38,6 +41,21 @@ using stride_t = vector<ptrdiff_t>; namespace detail { +#ifndef POCKETFFT_NO_VECTORS +#if (defined(__AVX512F__)) +#define HAVE_VECSUPPORT +constexpr int VBYTELEN=64; +#elif (defined(__AVX__)) +#define HAVE_VECSUPPORT +constexpr int VBYTELEN=32; +#elif (defined(__SSE2__)) +#define HAVE_VECSUPPORT +constexpr int VBYTELEN=16; +#else +#define POCKETFFT_NO_VECTORS +#endif +#endif + template<typename T> struct arr { private: @@ -51,9 +69,12 @@ template<typename T> struct arr void *res = malloc(num*sizeof(T)); if (!res) throw bad_alloc(); #else -#if 0 +#if __cplusplus >= 201703L void *res = aligned_alloc(64,num*sizeof(T)); if (!res) throw bad_alloc(); +#elif defined(_WIN32) + void *res = _aligned_malloc(num*sizeof(T), 64); + if (!res) throw bad_alloc(); #else void *res(nullptr); if (posix_memalign(&res, 64, num*sizeof(T))!=0) @@ -172,7 +193,7 @@ template<typename T> class sincos_2pibyn if (n==0) return; res[0]=1.; res[1]=0.; if (n==1) return; - size_t l1=(size_t)sqrt(n); + size_t l1 = size_t(sqrt(n)); arr<double> tmp(2*l1); for (size_t i=1; i<l1; ++i) { @@ -320,60 +341,71 @@ template<typename T> class sincos_2pibyn { return reinterpret_cast<const cmplx<T> *>(data.data()); } }; -NOINLINE size_t largest_prime_factor (size_t n) +struct util // hack to avoid duplicate symbols { - size_t res=1; - while ((n&1)==0) - { res=2; n>>=1; } - - size_t limit=size_t(sqrt(n+0.01)); - for (size_t x=3; x<=limit; x+=2) - while ((n/x)*x==n) + static NOINLINE size_t largest_prime_factor (size_t n) { - res=x; - n/=x; - limit=size_t(sqrt(n+0.01)); - } - if (n>1) res=n; + size_t res=1; + while ((n&1)==0) + { res=2; n>>=1; } - return res; - } + size_t limit=size_t(sqrt(n+0.01)); + for (size_t x=3; x<=limit; x+=2) + while ((n/x)*x==n) + { + res=x; + n/=x; + limit=size_t(sqrt(n+0.01)); + } + if (n>1) res=n; -NOINLINE double cost_guess (size_t n) - { - const double lfp=1.1; // penalty for non-hardcoded larger factors - size_t ni=n; - double result=0.; - while ((n&1)==0) - { result+=2; n>>=1; } - - size_t limit=size_t(sqrt(n+0.01)); - for (size_t x=3; x<=limit; x+=2) - while ((n/x)*x==n) + return res; + } + + static NOINLINE double cost_guess (size_t n) { - result+= (x<=5) ? x : lfp*x; // penalize larger prime factors - n/=x; - limit=size_t(sqrt(n+0.01)); + const double lfp=1.1; // penalty for non-hardcoded larger factors + size_t ni=n; + double result=0.; + while ((n&1)==0) + { result+=2; n>>=1; } + + size_t limit=size_t(sqrt(n+0.01)); + for (size_t x=3; x<=limit; x+=2) + while ((n/x)*x==n) + { + result+= (x<=5) ? x : lfp*x; // penalize larger prime factors + n/=x; + limit=size_t(sqrt(n+0.01)); + } + if (n>1) result+=(n<=5) ? n : lfp*n; + + return result*ni; } - if (n>1) result+=(n<=5) ? n : lfp*n; - return result*ni; - } + /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ + static NOINLINE size_t good_size(size_t n) + { + if (n<=12) return n; + + size_t bestfac=2*n; + for (size_t f2=1; f2<bestfac; f2*=2) + for (size_t f23=f2; f23<bestfac; f23*=3) + for (size_t f235=f23; f235<bestfac; f235*=5) + for (size_t f2357=f235; f2357<bestfac; f2357*=7) + for (size_t f235711=f2357; f235711<bestfac; f235711*=11) + if (f235711>=n) bestfac=f235711; + return bestfac; + } -/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ -NOINLINE size_t good_size(size_t n) - { - if (n<=12) return n; - - size_t bestfac=2*n; - for (size_t f2=1; f2<bestfac; f2*=2) - for (size_t f23=f2; f23<bestfac; f23*=3) - for (size_t f235=f23; f235<bestfac; f235*=5) - for (size_t f2357=f235; f2357<bestfac; f2357*=7) - for (size_t f235711=f2357; f235711<bestfac; f235711*=11) - if (f235711>=n) bestfac=f235711; - return bestfac; - } + static size_t prod(const shape_t &shape) + { + size_t res=1; + for (auto sz: shape) + res*=sz; + return res; + } + }; #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] @@ -397,12 +429,12 @@ template<typename T0> class cfftp size_t length, nfct; arr<cmplx<T0>> mem; - fctdata fct[NFCT]; + fctdata fact[NFCT]; void add_factor(size_t factor) { if (nfct>=NFCT) throw runtime_error("too many prime factors"); - fct[nfct++].fct = factor; + fact[nfct++].fct = factor; } template<bool bwd, typename T> void pass2 (size_t ido, size_t l1, @@ -844,33 +876,33 @@ template<bool bwd, typename T> void passg (size_t ido, size_t ip, #undef CX2 #undef CX -template<bool bwd, typename T> void pass_all(T c[], T0 fact) +template<bool bwd, typename T> void pass_all(T c[], T0 fct) { - if (length==1) { c[0]*=fact; return; } + if (length==1) { c[0]*=fct; return; } size_t l1=1; arr<T> ch(length); T *p1=c, *p2=ch.data(); for(size_t k1=0; k1<nfct; k1++) { - size_t ip=fct[k1].fct; + size_t ip=fact[k1].fct; size_t l2=ip*l1; size_t ido = length/l2; if (ip==4) - pass4<bwd> (ido, l1, p1, p2, fct[k1].tw); + pass4<bwd> (ido, l1, p1, p2, fact[k1].tw); else if(ip==2) - pass2<bwd>(ido, l1, p1, p2, fct[k1].tw); + pass2<bwd>(ido, l1, p1, p2, fact[k1].tw); else if(ip==3) - pass3<bwd> (ido, l1, p1, p2, fct[k1].tw); + pass3<bwd> (ido, l1, p1, p2, fact[k1].tw); else if(ip==5) - pass5<bwd> (ido, l1, p1, p2, fct[k1].tw); + pass5<bwd> (ido, l1, p1, p2, fact[k1].tw); else if(ip==7) - pass7<bwd> (ido, l1, p1, p2, fct[k1].tw); + pass7<bwd> (ido, l1, p1, p2, fact[k1].tw); else if(ip==11) - pass11<bwd> (ido, l1, p1, p2, fct[k1].tw); + pass11<bwd> (ido, l1, p1, p2, fact[k1].tw); else { - passg<bwd>(ido, ip, l1, p1, p2, fct[k1].tw, fct[k1].tws); + passg<bwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws); swap(p1,p2); } swap(p1,p2); @@ -878,16 +910,16 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact) } if (p1!=c) { - if (fact!=1.) + if (fct!=1.) for (size_t i=0; i<length; ++i) - c[i] = ch[i]*fact; + c[i] = ch[i]*fct; else memcpy (c,p1,length*sizeof(T)); } else - if (fact!=1.) + if (fct!=1.) for (size_t i=0; i<length; ++i) - c[i] *= fact; + c[i] *= fct; } #undef WA @@ -913,9 +945,9 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact) len>>=1; // factor 2 should be at the front of the factor list add_factor(2); - swap(fct[0].fct, fct[nfct-1].fct); + swap(fact[0].fct, fact[nfct-1].fct); } - size_t maxl=(size_t)(sqrt((double)len))+1; + size_t maxl = size_t(sqrt(double(len)))+1; for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2) if ((len%divisor)==0) { @@ -924,7 +956,7 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact) add_factor(divisor); len/=divisor; } - maxl=size_t(sqrt((double)len))+1; + maxl=size_t(sqrt(double(len)))+1; } if (len>1) add_factor(len); } @@ -934,7 +966,7 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact) size_t twsize=0, l1=1; for (size_t k=0; k<nfct; ++k) { - size_t ip=fct[k].fct, ido= length/(l1*ip); + size_t ip=fact[k].fct, ido= length/(l1*ip); twsize+=(ip-1)*(ido-1); if (ip>11) twsize+=ip; @@ -951,18 +983,18 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact) size_t memofs=0; for (size_t k=0; k<nfct; ++k) { - size_t ip=fct[k].fct, ido=length/(l1*ip); - fct[k].tw=mem.data()+memofs; + size_t ip=fact[k].fct, ido=length/(l1*ip); + fact[k].tw=mem.data()+memofs; memofs+=(ip-1)*(ido-1); for (size_t j=1; j<ip; ++j) for (size_t i=1; i<ido; ++i) - fct[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i]; + fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i]; if (ip>11) { - fct[k].tws=mem.data()+memofs; + fact[k].tws=mem.data()+memofs; memofs+=ip; for (size_t j=0; j<ip; ++j) - fct[k].tws[j] = twiddle[j*l1*ido]; + fact[k].tws[j] = twiddle[j*l1*ido]; } l1*=ip; } @@ -995,12 +1027,12 @@ template<typename T0> class rfftp size_t length, nfct; arr<T0> mem; - fctdata fct[NFCT]; + fctdata fact[NFCT]; void add_factor(size_t factor) { if (nfct>=NFCT) throw runtime_error("too many prime factors"); - fct[nfct++].fct = factor; + fact[nfct++].fct = factor; } #define WA(x,i) wa[(i)+(x)*(ido-1)] @@ -1632,9 +1664,9 @@ template<typename T> void copy_and_norm(T *c, T *p1, size_t n, T0 fct) public: -template<typename T> void forward(T c[], T0 fact) +template<typename T> void forward(T c[], T0 fct) { - if (length==1) { c[0]*=fact; return; } + if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=n, nf=nfct; arr<T> ch(n); @@ -1643,30 +1675,30 @@ template<typename T> void forward(T c[], T0 fact) for(size_t k1=0; k1<nf;++k1) { size_t k=nf-k1-1; - size_t ip=fct[k].fct; + size_t ip=fact[k].fct; size_t ido=n / l1; l1 /= ip; if(ip==4) - radf4(ido, l1, p1, p2, fct[k].tw); + radf4(ido, l1, p1, p2, fact[k].tw); else if(ip==2) - radf2(ido, l1, p1, p2, fct[k].tw); + radf2(ido, l1, p1, p2, fact[k].tw); else if(ip==3) - radf3(ido, l1, p1, p2, fct[k].tw); + radf3(ido, l1, p1, p2, fact[k].tw); else if(ip==5) - radf5(ido, l1, p1, p2, fct[k].tw); + radf5(ido, l1, p1, p2, fact[k].tw); else { - radfg(ido, ip, l1, p1, p2, fct[k].tw, fct[k].tws); + radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); swap (p1,p2); } swap (p1,p2); } - copy_and_norm(c,p1,n,fact); + copy_and_norm(c,p1,n,fct); } -template<typename T> void backward(T c[], T0 fact) +template<typename T> void backward(T c[], T0 fct) { - if (length==1) { c[0]*=fact; return; } + if (length==1) { c[0]*=fct; return; } size_t n=length; size_t l1=1, nf=nfct; arr<T> ch(n); @@ -1674,22 +1706,22 @@ template<typename T> void backward(T c[], T0 fact) for(size_t k=0; k<nf; k++) { - size_t ip = fct[k].fct, + size_t ip = fact[k].fct, ido= n/(ip*l1); if(ip==4) - radb4(ido, l1, p1, p2, fct[k].tw); + radb4(ido, l1, p1, p2, fact[k].tw); else if(ip==2) - radb2(ido, l1, p1, p2, fct[k].tw); + radb2(ido, l1, p1, p2, fact[k].tw); else if(ip==3) - radb3(ido, l1, p1, p2, fct[k].tw); + radb3(ido, l1, p1, p2, fact[k].tw); else if(ip==5) - radb5(ido, l1, p1, p2, fct[k].tw); + radb5(ido, l1, p1, p2, fact[k].tw); else - radbg(ido, ip, l1, p1, p2, fct[k].tw, fct[k].tws); + radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); swap (p1,p2); l1*=ip; } - copy_and_norm(c,p1,n,fact); + copy_and_norm(c,p1,n,fct); } private: @@ -1705,9 +1737,9 @@ void factorize() len>>=1; // factor 2 should be at the front of the factor list add_factor(2); - swap(fct[0].fct, fct[nfct-1].fct); + swap(fact[0].fct, fact[nfct-1].fct); } - size_t maxl=(size_t)(sqrt((double)len))+1; + size_t maxl = size_t(sqrt(double(len)))+1; for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2) if ((len%divisor)==0) { @@ -1716,7 +1748,7 @@ void factorize() add_factor(divisor); len/=divisor; } - maxl=(size_t)(sqrt((double)len))+1; + maxl=size_t(sqrt(double(len)))+1; } if (len>1) add_factor(len); } @@ -1726,7 +1758,7 @@ size_t twsize() const size_t twsz=0, l1=1; for (size_t k=0; k<nfct; ++k) { - size_t ip=fct[k].fct, ido=length/(l1*ip); + size_t ip=fact[k].fct, ido=length/(l1*ip); twsz+=(ip-1)*(ido-1); if (ip>5) twsz+=2*ip; l1*=ip; @@ -1741,28 +1773,28 @@ void comp_twiddle() T0 *ptr=mem.data(); for (size_t k=0; k<nfct; ++k) { - size_t ip=fct[k].fct, ido=length/(l1*ip); + size_t ip=fact[k].fct, ido=length/(l1*ip); if (k<nfct-1) // last factor doesn't need twiddles { - fct[k].tw=ptr; ptr+=(ip-1)*(ido-1); + fact[k].tw=ptr; ptr+=(ip-1)*(ido-1); for (size_t j=1; j<ip; ++j) for (size_t i=1; i<=(ido-1)/2; ++i) { - fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; - fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; + fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i]; + fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1]; } } if (ip>5) // special factors required by *g functions { - fct[k].tws=ptr; ptr+=2*ip; - fct[k].tws[0] = 1.; - fct[k].tws[1] = 0.; + fact[k].tws=ptr; ptr+=2*ip; + fact[k].tws[0] = 1.; + fact[k].tws[1] = 0.; for (size_t i=1; i<=(ip>>1); ++i) { - fct[k].tws[2*i ] = twid[2*i*(length/ip)]; - fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; - fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; - fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; + fact[k].tws[2*i ] = twid[2*i*(length/ip)]; + fact[k].tws[2*i+1] = twid[2*i*(length/ip)+1]; + fact[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)]; + fact[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1]; } } l1*=ip; @@ -1822,7 +1854,7 @@ template<typename T0> class fftblue public: NOINLINE fftblue(size_t length) - : n(length), n2(good_size(n*2-1)), plan(n2), mem(n+n2), + : n(length), n2(util::good_size(n*2-1)), plan(n2), mem(n+n2), bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ @@ -1858,7 +1890,8 @@ template<typename T0> class fftblue { arr<cmplx<T>> tmp(n); tmp[0].Set(c[0],c[0]*0); - memcpy ((void *)(tmp.data()+1),(void *)(c+1), (n-1)*sizeof(T)); + memcpy (reinterpret_cast<void *>(tmp.data()+1), + reinterpret_cast<void *>(c+1), (n-1)*sizeof(T)); if ((n&1)==0) tmp[n/2].i=c[0]*0.; for (size_t m=1; 2*m<n; ++m) tmp[n-m].Set(tmp[m].r, -tmp[m].i); @@ -1894,13 +1927,13 @@ template<typename T0> class pocketfft_c : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); - if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length))) { packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length)); return; } - double comp1 = cost_guess(length); - double comp2 = 2*cost_guess(good_size(2*length-1)); + double comp1 = util::cost_guess(length); + double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2<comp1) // use Bluestein blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length)); @@ -1908,16 +1941,16 @@ template<typename T0> class pocketfft_c packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length)); } - template<typename T> NOINLINE void backward(T c[], T0 fct) + template<typename T> NOINLINE void backward(cmplx<T> c[], T0 fct) { - packplan ? packplan->backward((cmplx<T> *)c,fct) - : blueplan->backward((cmplx<T> *)c,fct); + packplan ? packplan->backward(c,fct) + : blueplan->backward(c,fct); } - template<typename T> NOINLINE void forward(T c[], T0 fct) + template<typename T> NOINLINE void forward(cmplx<T> c[], T0 fct) { - packplan ? packplan->forward((cmplx<T> *)c,fct) - : blueplan->forward((cmplx<T> *)c,fct); + packplan ? packplan->forward(c,fct) + : blueplan->forward(c,fct); } size_t length() const { return len; } @@ -1939,13 +1972,13 @@ template<typename T0> class pocketfft_r : len(length) { if (length==0) throw runtime_error("zero-length FFT requested"); - if ((length<50) || (largest_prime_factor(length)<=sqrt(length))) + if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length))) { packplan=unique_ptr<rfftp<T0>>(new rfftp<T0>(length)); return; } - double comp1 = 0.5*cost_guess(length); - double comp2 = 2*cost_guess(good_size(2*length-1)); + double comp1 = 0.5*util::cost_guess(length); + double comp2 = 2*util::cost_guess(util::good_size(2*length-1)); comp2*=1.5; /* fudge factor that appears to give good overall performance */ if (comp2<comp1) // use Bluestein blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length)); @@ -1972,14 +2005,6 @@ template<typename T0> class pocketfft_r // multi-D infrastructure // -size_t prod(const shape_t &shape) - { - size_t res=1; - for (auto sz: shape) - res*=sz; - return res; - } - template<typename T> class ndarr { private: @@ -1996,7 +2021,7 @@ template<typename T> class ndarr : d(reinterpret_cast<char *>(data_)), cd(reinterpret_cast<const char *>(data_)), shp(shape_), str(stride_) {} size_t ndim() const { return shp.size(); } - size_t size() const { return prod(shp); } + size_t size() const { return util::prod(shp); } const shape_t &shape() const { return shp; } size_t shape(size_t i) const { return shp[i]; } const stride_t &stride() const { return str; } @@ -2067,20 +2092,6 @@ template<size_t N, typename Ti, typename To> class multi_iter bool contiguous_out() const { return str_o==sizeof(To); } }; - -#ifndef POCKETFFT_NO_VECTORS -#if (defined(__AVX512F__)) -#define HAVE_VECSUPPORT -constexpr int VBYTELEN=64; -#elif (defined(__AVX__)) -#define HAVE_VECSUPPORT -constexpr int VBYTELEN=32; -#elif (defined(__SSE2__)) -#define HAVE_VECSUPPORT -constexpr int VBYTELEN=16; -#endif -#endif - #if defined(HAVE_VECSUPPORT) template<typename T> struct VTYPE{}; template<> struct VTYPE<double> @@ -2104,14 +2115,14 @@ template<> struct VTYPE<float> template<typename T> arr<char> alloc_tmp(const shape_t &shape, size_t axsize, size_t elemsize) { - auto othersize = prod(shape)/axsize; + auto othersize = util::prod(shape)/axsize; auto tmpsize = axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1); return arr<char>(tmpsize*elemsize); } template<typename T> arr<char> alloc_tmp(const shape_t &shape, const shape_t &axes, size_t elemsize) { - size_t fullsize=prod(shape); + size_t fullsize=util::prod(shape); size_t tmpsize=0; for (size_t i=0; i<axes.size(); ++i) { @@ -2123,7 +2134,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape, return arr<char>(tmpsize*elemsize); } -template<typename T> NOINLINE void pocketfft_general_c( +template<typename T> NOINLINE void general_c( const ndarr<cmplx<T>> &in, ndarr<cmplx<T>> &out, const shape_t &axes, bool forward, T fct) { @@ -2143,15 +2154,15 @@ template<typename T> NOINLINE void pocketfft_general_c( { using vtype = typename VTYPE<T>::type; it.advance(vlen); - auto tdatav = (cmplx<vtype> *)storage.data(); + auto tdatav = reinterpret_cast<cmplx<vtype> *>(storage.data()); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) { tdatav[i].r[j] = it.in(j,i).r; tdatav[i].i[j] = it.in(j,i).i; } - forward ? plan->forward ((vtype *)tdatav, fct) - : plan->backward((vtype *)tdatav, fct); + forward ? plan->forward (tdatav, fct) + : plan->backward(tdatav, fct); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) it.out(j,i).Set(tdatav[i].r[j],tdatav[i].i[j]); @@ -2160,23 +2171,23 @@ template<typename T> NOINLINE void pocketfft_general_c( while (it.remaining()>0) { it.advance(1); - auto tdata = (cmplx<T> *)storage.data(); + auto tdata = reinterpret_cast<cmplx<T> *>(storage.data()); if (it.inplace() && it.contiguous_out()) // fully in-place - forward ? plan->forward ((T *)(&it.out(0)), fct) - : plan->backward((T *)(&it.out(0)), fct); + forward ? plan->forward ((&it.out(0)), fct) + : plan->backward((&it.out(0)), fct); else if (it.contiguous_out()) // compute FFT in output location { for (size_t i=0; i<len; ++i) it.out(i) = it.in(i); - forward ? plan->forward ((T *)(&it.out(0)), fct) - : plan->backward((T *)(&it.out(0)), fct); + forward ? plan->forward ((&it.out(0)), fct) + : plan->backward((&it.out(0)), fct); } else { for (size_t i=0; i<len; ++i) tdata[i] = it.in(i); - forward ? plan->forward ((T *)tdata, fct) - : plan->backward((T *)tdata, fct); + forward ? plan->forward (tdata, fct) + : plan->backward(tdata, fct); for (size_t i=0; i<len; ++i) it.out(i) = tdata[i]; } @@ -2185,7 +2196,7 @@ template<typename T> NOINLINE void pocketfft_general_c( } } -template<typename T> NOINLINE void pocketfft_general_hartley( +template<typename T> NOINLINE void general_hartley( const ndarr<T> &in, ndarr<T> &out, const shape_t &axes, T fct) { auto storage = alloc_tmp<T>(in.shape(), axes, sizeof(T)); @@ -2204,11 +2215,11 @@ template<typename T> NOINLINE void pocketfft_general_hartley( { using vtype = typename VTYPE<T>::type; it.advance(vlen); - auto tdatav = (vtype *)storage.data(); + auto tdatav = reinterpret_cast<vtype *>(storage.data()); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) tdatav[i][j] = it.in(j,i); - plan->forward((vtype *)tdatav, fct); + plan->forward(tdatav, fct); for (size_t j=0; j<vlen; ++j) it.out(j,0) = tdatav[0][j]; size_t i=1, i1=1, i2=len-1; @@ -2226,10 +2237,10 @@ template<typename T> NOINLINE void pocketfft_general_hartley( while (it.remaining()>0) { it.advance(1); - auto tdata = (T *)storage.data(); + auto tdata = reinterpret_cast<T *>(storage.data()); for (size_t i=0; i<len; ++i) tdata[i] = it.in(i); - plan->forward((T *)tdata, fct); + plan->forward(tdata, fct); // Hartley order it.out(0) = tdata[0]; size_t i=1, i1=1, i2=len-1; @@ -2245,7 +2256,7 @@ template<typename T> NOINLINE void pocketfft_general_hartley( } } -template<typename T> NOINLINE void pocketfft_general_r2c( +template<typename T> NOINLINE void general_r2c( const ndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, T fct) { auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T)); @@ -2260,11 +2271,11 @@ template<typename T> NOINLINE void pocketfft_general_r2c( { using vtype = typename VTYPE<T>::type; it.advance(vlen); - auto tdatav = (vtype *)storage.data(); + auto tdatav = reinterpret_cast<vtype *>(storage.data()); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) tdatav[i][j] = it.in(j,i); - plan.forward((vtype *)tdatav, fct); + plan.forward(tdatav, fct); for (size_t j=0; j<vlen; ++j) it.out(j,0).Set(tdatav[0][j]); size_t i=1, ii=1; @@ -2279,7 +2290,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c( while (it.remaining()>0) { it.advance(1); - auto tdata = (T *)storage.data(); + auto tdata = reinterpret_cast<T *>(storage.data()); for (size_t i=0; i<len; ++i) tdata[i] = it.in(i); plan.forward(tdata, fct); @@ -2291,7 +2302,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c( it.out(ii).Set(tdata[i]); } } -template<typename T> NOINLINE void pocketfft_general_c2r( +template<typename T> NOINLINE void general_c2r( const ndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, T fct) { auto storage = alloc_tmp<T>(out.shape(), out.shape(axis), sizeof(T)); @@ -2306,9 +2317,10 @@ template<typename T> NOINLINE void pocketfft_general_c2r( { using vtype = typename VTYPE<T>::type; it.advance(vlen); - auto tdatav = (vtype *)storage.data(); + auto tdatav = reinterpret_cast<vtype *>(storage.data()); for (size_t j=0; j<vlen; ++j) tdatav[0][j]=it.in(j,0).r; + { size_t i=1, ii=1; for (; i<len-1; i+=2, ++ii) for (size_t j=0; j<vlen; ++j) @@ -2319,6 +2331,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r( if (i<len) for (size_t j=0; j<vlen; ++j) tdatav[i][j] = it.in(j,ii).r; + } plan.backward(tdatav, fct); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) @@ -2328,8 +2341,9 @@ template<typename T> NOINLINE void pocketfft_general_c2r( while (it.remaining()>0) { it.advance(1); - auto tdata = (T *)storage.data(); + auto tdata = reinterpret_cast<T *>(storage.data()); tdata[0]=it.in(0).r; + { size_t i=1, ii=1; for (; i<len-1; i+=2, ++ii) { @@ -2338,13 +2352,14 @@ template<typename T> NOINLINE void pocketfft_general_c2r( } if (i<len) tdata[i] = it.in(ii).r; + } plan.backward(tdata, fct); for (size_t i=0; i<len; ++i) it.out(i) = tdata[i]; } } -template<typename T> NOINLINE void pocketfft_general_r( +template<typename T> NOINLINE void general_r( const ndarr<T> &in, ndarr<T> &out, size_t axis, bool forward, T fct) { auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T)); @@ -2359,7 +2374,7 @@ template<typename T> NOINLINE void pocketfft_general_r( { using vtype = typename VTYPE<T>::type; it.advance(vlen); - auto tdatav = (vtype *)storage.data(); + auto tdatav = reinterpret_cast<vtype *>(storage.data()); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) tdatav[i][j] = it.in(j,i); @@ -2373,7 +2388,7 @@ template<typename T> NOINLINE void pocketfft_general_r( while (it.remaining()>0) { it.advance(1); - auto tdata = (T *)storage.data(); + auto tdata = reinterpret_cast<T *>(storage.data()); if (it.inplace() && it.contiguous_out()) // fully in-place forward ? plan.forward (&it.out(0), fct) : plan.backward(&it.out(0), fct); @@ -2407,7 +2422,7 @@ template<typename T> void c2c(const shape_t &shape, using namespace detail; ndarr<cmplx<T>> ain(data_in, shape, stride_in); ndarr<cmplx<T>> aout(data_out, shape, stride_out); - pocketfft_general_c(ain, aout, axes, forward, fct); + general_c(ain, aout, axes, forward, fct); } template<typename T> void r2c(const shape_t &shape, @@ -2417,7 +2432,7 @@ template<typename T> void r2c(const shape_t &shape, using namespace detail; ndarr<T> ain(data_in, shape, stride_in); ndarr<cmplx<T>> aout(data_out, shape, stride_out); - pocketfft_general_r2c(ain, aout, axis, fct); + general_r2c(ain, aout, axis, fct); } template<typename T> void c2r(const shape_t &shape, size_t new_size, @@ -2429,7 +2444,7 @@ template<typename T> void c2r(const shape_t &shape, size_t new_size, shape_out[axis] = new_size; ndarr<cmplx<T>> ain(data_in, shape, stride_in); ndarr<T> aout(data_out, shape_out, stride_out); - pocketfft_general_c2r(ain, aout, axis, fct); + general_c2r(ain, aout, axis, fct); } template<typename T> void r2r_fftpack(const shape_t &shape, @@ -2439,7 +2454,7 @@ template<typename T> void r2r_fftpack(const shape_t &shape, using namespace detail; ndarr<T> ain(data_in, shape, stride_in); ndarr<T> aout(data_out, shape, stride_out); - pocketfft_general_r(ain, aout, axis, forward, fct); + general_r(ain, aout, axis, forward, fct); } template<typename T> void r2r_hartley(const shape_t &shape, @@ -2449,7 +2464,7 @@ template<typename T> void r2r_hartley(const shape_t &shape, using namespace detail; ndarr<T> ain(data_in, shape, stride_in); ndarr<T> aout(data_out, shape, stride_out); - pocketfft_general_hartley(ain, aout, axes, fct); + general_hartley(ain, aout, axes, fct); } } // namespace pocketfft diff --git a/pypocketfft.cc b/pypocketfft.cc index 2e2cd9a..96b2ad6 100644 --- a/pypocketfft.cc +++ b/pypocketfft.cc @@ -85,7 +85,7 @@ template<typename T> py::array xfftn_internal(const py::array &in, py::array res = inplace ? in : py::array_t<complex<T>>(dims); ndarr<cmplx<T>> ain(in.data(), dims, copy_strides(in)); ndarr<cmplx<T>> aout(res.mutable_data(), dims, copy_strides(res)); - pocketfft_general_c<T>(ain, aout, axes, fwd, fct); + general_c<T>(ain, aout, axes, fwd, fct); return res; } @@ -110,10 +110,10 @@ template<typename T> py::array rfftn_internal(const py::array &in, py::array res = py::array_t<complex<T>>(dims_out); ndarr<T> ain(in.data(), dims_in, copy_strides(in)); ndarr<cmplx<T>> aout(res.mutable_data(), dims_out, copy_strides(res)); - pocketfft_general_r2c<T>(ain, aout, axes.back(), fct); + general_r2c<T>(ain, aout, axes.back(), fct); if (axes.size()==1) return res; shape_t axes2(axes.begin(), --axes.end()); - pocketfft_general_c<T>(aout, aout, axes2, true, 1.); + general_c<T>(aout, aout, axes2, true, 1.); return res; } py::array rfftn(const py::array &in, py::object axes_, double fct) @@ -128,7 +128,7 @@ template<typename T> py::array xrfft_scipy(const py::array &in, py::array res = inplace ? in : py::array_t<T>(dims); ndarr<T> ain(in.data(), dims, copy_strides(in)); ndarr<T> aout(res.mutable_data(), dims, copy_strides(res)); - pocketfft_general_r<T>(ain, aout, axis, fwd, fct); + general_r<T>(ain, aout, axis, fwd, fct); return res; } py::array rfft_scipy(const py::array &in, size_t axis, double fct, bool inplace) @@ -160,7 +160,7 @@ template<typename T> py::array irfftn_internal(const py::array &in, py::array res = py::array_t<T>(dims_out); ndarr<cmplx<T>> ain(inter.data(), copy_shape(inter), copy_strides(inter)); ndarr<T> aout(res.mutable_data(), dims_out, copy_strides(res)); - pocketfft_general_c2r<T>(ain, aout, axis, fct); + general_c2r<T>(ain, aout, axis, fct); return res; } py::array irfftn(const py::array &in, py::object axes_, size_t lastsize, @@ -178,7 +178,7 @@ template<typename T> py::array hartley_internal(const py::array &in, py::array res = inplace ? in : py::array_t<T>(dims); ndarr<T> ain(in.data(), copy_shape(in), copy_strides(in)); ndarr<T> aout(res.mutable_data(), copy_shape(res), copy_strides(res)); - pocketfft_general_hartley<T>(ain, aout, makeaxes(in, axes_), fct); + general_hartley<T>(ain, aout, makeaxes(in, axes_), fct); return res; } py::array hartley(const py::array &in, py::object axes_, double fct, -- GitLab