From 7e4b35133beda71192ee865fe29a46cae1bc21eb Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Thu, 2 May 2019 11:44:03 +0200 Subject: [PATCH 1/5] smaller memory consumption during twiddle generation --- pocketfft.cc | 129 ++++++++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 63 deletions(-) diff --git a/pocketfft.cc b/pocketfft.cc index bed7179..9026678 100644 --- a/pocketfft.cc +++ b/pocketfft.cc @@ -72,14 +72,58 @@ template<typename T> struct arr size_t size() const { return sz; } }; +template<typename T> struct cmplx { + T r, i; + cmplx() {} + cmplx(T r_, T i_) : r(r_), i(i_) {} + void Set(T r_, T i_) { r=r_; i=i_; } + void Set(T r_) { r=r_; i=T(0); } + cmplx &operator+= (const cmplx &other) + { r+=other.r; i+=other.i; return *this; } + template<typename T2>cmplx &operator*= (T2 other) + { r*=other; i*=other; return *this; } + cmplx operator+ (const cmplx &other) const + { return cmplx(r+other.r, i+other.i); } + cmplx operator- (const cmplx &other) const + { return cmplx(r-other.r, i-other.i); } + template<typename T2> auto operator* (const T2 &other) const + -> cmplx<decltype(r*other)> + { + return {r*other, i*other}; + } + template<typename T2> auto operator* (const cmplx<T2> &other) const + -> cmplx<decltype(r+other.r)> + { + return {r*other.r-i*other.i, r*other.i + i*other.r}; + } + template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const + -> cmplx<decltype(r+other.r)> + { + if (bwd) + return {r*other.r-i*other.i, r*other.i + i*other.r}; + else + return {r*other.r+i*other.i, i*other.r - r*other.i}; + } +}; +template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b, + const cmplx<T> &c, const cmplx<T> &d) + { a = c+d; b = c-d; } +template<typename T> cmplx<T> conj(const cmplx<T> &a) + { return {a.r, -a.i}; } + +template<typename T> void ROT90(cmplx<T> &a) + { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } +template<typename T> void ROTM90(cmplx<T> &a) + { auto tmp_=-a.r; a.r=a.i; a.i=tmp_; } + // // twiddle factor section // -class sincos_2pibyn +template<typename T> class sincos_2pibyn { private: - arr<double> data; + arr<T> data; // adapted from https://stackoverflow.com/questions/42792939/ // CAUTION: this function only works for arguments in the range @@ -110,15 +154,20 @@ class sincos_2pibyn res[1] = s; } - NOINLINE void calc_first_octant(size_t den, double * restrict res) + NOINLINE void calc_first_octant(size_t den, T * restrict res) { size_t n = (den+4)>>3; if (n==0) return; res[0]=1.; res[1]=0.; if (n==1) return; size_t l1=(size_t)sqrt(n); + arr<double> tmp(2*l1); for (size_t i=1; i<l1; ++i) - my_sincosm1pi((2.*i)/den,&res[2*i]); + { + my_sincosm1pi((2.*i)/den,&tmp[2*i]); + res[2*i ] = tmp[2*i]+1.; + res[2*i+1] = tmp[2*i+1]; + } size_t start=l1; while(start<n) { @@ -130,19 +179,17 @@ class sincos_2pibyn if (start+end>n) end = n-start; for (size_t i=1; i<end; ++i) { - double csx[2]={res[2*i], res[2*i+1]}; + double csx[2]={tmp[2*i], tmp[2*i+1]}; res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.; res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]; } start += l1; } - for (size_t i=1; i<l1; ++i) - res[2*i] += 1.; } - NOINLINE void calc_first_quadrant(size_t n, double * restrict res) + NOINLINE void calc_first_quadrant(size_t n, T * restrict res) { - double * restrict p = res+n; + T * restrict p = res+n; calc_first_octant(n<<1, p); size_t ndone=(n+2)>>2; size_t i=0, idx1=0, idx2=2*ndone-2; @@ -160,10 +207,10 @@ class sincos_2pibyn } } - NOINLINE void calc_first_half(size_t n, double * restrict res) + NOINLINE void calc_first_half(size_t n, T * restrict res) { int ndone=(n+1)>>1; - double * p = res+n-1; + T * p = res+n-1; calc_first_octant(n<<2, p); int i4=0, in=n, i=0; for (; i4<=in-i4; ++i, i4+=4) // octant 0 @@ -187,7 +234,7 @@ class sincos_2pibyn } } - NOINLINE void fill_first_quadrant(size_t n, double * restrict res) + NOINLINE void fill_first_quadrant(size_t n, T * restrict res) { const double hsqt2 = 0.707106781186547524400844362104849; size_t quart = n>>2; @@ -200,7 +247,7 @@ class sincos_2pibyn } } - NOINLINE void fill_first_half(size_t n, double * restrict res) + NOINLINE void fill_first_half(size_t n, T * restrict res) { size_t half = n>>1; if ((n&3)==0) @@ -217,7 +264,7 @@ class sincos_2pibyn } } - NOINLINE void fill_second_half(size_t n, double * restrict res) + NOINLINE void fill_second_half(size_t n, T * restrict res) { if ((n&1)==0) for (size_t i=0; i<n; ++i) @@ -230,7 +277,7 @@ class sincos_2pibyn } } - NOINLINE void sincos_2pibyn_half(size_t n, double * restrict res) + NOINLINE void sincos_2pibyn_half(size_t n, T * restrict res) { if ((n&3)==0) { @@ -255,7 +302,7 @@ class sincos_2pibyn if (!half) fill_second_half(n, data.data()); } - double operator[](size_t idx) const { return data[idx]; } + T operator[](size_t idx) const { return data[idx]; } }; NOINLINE size_t largest_prime_factor (size_t n) @@ -313,50 +360,6 @@ NOINLINE size_t good_size(size_t n) return bestfac; } -template<typename T> struct cmplx { - T r, i; - cmplx() {} - cmplx(T r_, T i_) : r(r_), i(i_) {} - void Set(T r_, T i_) { r=r_; i=i_; } - void Set(T r_) { r=r_; i=T(0); } - cmplx &operator+= (const cmplx &other) - { r+=other.r; i+=other.i; return *this; } - template<typename T2>cmplx &operator*= (T2 other) - { r*=other; i*=other; return *this; } - cmplx operator+ (const cmplx &other) const - { return cmplx(r+other.r, i+other.i); } - cmplx operator- (const cmplx &other) const - { return cmplx(r-other.r, i-other.i); } - template<typename T2> auto operator* (const T2 &other) const - -> cmplx<decltype(r*other)> - { - return {r*other, i*other}; - } - template<typename T2> auto operator* (const cmplx<T2> &other) const - -> cmplx<decltype(r+other.r)> - { - return {r*other.r-i*other.i, r*other.i + i*other.r}; - } - template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const - -> cmplx<decltype(r+other.r)> - { - if (bwd) - return {r*other.r-i*other.i, r*other.i + i*other.r}; - else - return {r*other.r+i*other.i, i*other.r - r*other.i}; - } -}; -template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b, - const cmplx<T> &c, const cmplx<T> &d) - { a = c+d; b = c-d; } -template<typename T> cmplx<T> conj(const cmplx<T> &a) - { return {a.r, -a.i}; } - -template<typename T> void ROT90(cmplx<T> &a) - { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; } -template<typename T> void ROTM90(cmplx<T> &a) - { auto tmp_=-a.r; a.r=a.i; a.i=tmp_; } - #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] #define WA(x,i) wa[(i)-1+(x)*(ido-1)] @@ -910,7 +913,7 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact) NOINLINE void comp_twiddle() { - sincos_2pibyn twid(length, false); + sincos_2pibyn<T0> twid(length, false); size_t l1=1; size_t memofs=0; for (size_t k=0; k<nfct; ++k) @@ -1705,7 +1708,7 @@ size_t twsize() const NOINLINE void comp_twiddle() { - sincos_2pibyn twid(length, true); + sincos_2pibyn<T0> twid(length, true); size_t l1=1; T0 *ptr=mem.data(); for (size_t k=0; k<nfct; ++k) @@ -1807,7 +1810,7 @@ template<typename T0> class fftblue bk(mem.data()), bkf(mem.data()+2*n) { /* initialize b_k */ - sincos_2pibyn tmp(2*n, false); + sincos_2pibyn<T0> tmp(2*n, false); bk[0] = 1; bk[1] = 0; -- GitLab From 18adaa57d44610dc7cc7a76ed1f213c3fc33f086 Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Fri, 3 May 2019 09:05:21 +0200 Subject: [PATCH 2/5] more complex arrays --- pocketfft.cc | 117 ++++++++++++++++++++------------------------------- 1 file changed, 45 insertions(+), 72 deletions(-) diff --git a/pocketfft.cc b/pocketfft.cc index 9026678..d7ea57c 100644 --- a/pocketfft.cc +++ b/pocketfft.cc @@ -88,21 +88,15 @@ template<typename T> struct cmplx { { return cmplx(r-other.r, i-other.i); } template<typename T2> auto operator* (const T2 &other) const -> cmplx<decltype(r*other)> - { - return {r*other, i*other}; - } + { return {r*other, i*other}; } template<typename T2> auto operator* (const cmplx<T2> &other) const -> cmplx<decltype(r+other.r)> - { - return {r*other.r-i*other.i, r*other.i + i*other.r}; - } + { return {r*other.r-i*other.i, r*other.i + i*other.r}; } template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const -> cmplx<decltype(r+other.r)> { - if (bwd) - return {r*other.r-i*other.i, r*other.i + i*other.r}; - else - return {r*other.r+i*other.i, i*other.r - r*other.i}; + return bwd ? cmplx(r*other.r-i*other.i, r*other.i + i*other.r) + : cmplx(r*other.r+i*other.i, i*other.r - r*other.i); } }; template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b, @@ -303,6 +297,9 @@ template<typename T> class sincos_2pibyn } T operator[](size_t idx) const { return data[idx]; } + const T *rdata() const { return data; } + const cmplx<T> *cdata() const + { return reinterpret_cast<const cmplx<T> *>(data.data()); } }; NOINLINE size_t largest_prime_factor (size_t n) @@ -914,6 +911,7 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact) NOINLINE void comp_twiddle() { sincos_2pibyn<T0> twid(length, false); + auto twiddle = twid.cdata(); size_t l1=1; size_t memofs=0; for (size_t k=0; k<nfct; ++k) @@ -923,19 +921,13 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact) memofs+=(ip-1)*(ido-1); for (size_t j=1; j<ip; ++j) for (size_t i=1; i<ido; ++i) - { - fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i]; - fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1]; - } + fct[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i]; if (ip>11) { fct[k].tws=mem.data()+memofs; memofs+=ip; for (size_t j=0; j<ip; ++j) - { - fct[k].tws[j].r = twid[2*j*l1*ido]; - fct[k].tws[j].i = twid[2*j*l1*ido+1]; - } + fct[k].tws[j] = twiddle[j*l1*ido]; } l1*=ip; } @@ -1746,7 +1738,7 @@ NOINLINE void comp_twiddle() NOINLINE rfftp(size_t length_) : length(length_) { - if (length==0) throw runtime_error("zero_sized FFT"); + if (length==0) throw runtime_error("zero-sized FFT"); nfct=0; if (length==1) return; factorize(); @@ -1765,82 +1757,66 @@ template<typename T0> class fftblue private: size_t n, n2; cfftp<T0> plan; - arr<T0> mem; - T0 *bk, *bkf; + arr<cmplx<T0>> mem; + cmplx<T0> *bk, *bkf; template<bool bwd, typename T> NOINLINE - void fft(T c[], T0 fct) + void fft(cmplx<T> c[], T0 fct) { - constexpr T0 isign = bwd ? 1 : -1; - arr<T> akf(2*n2); + arr<cmplx<T>> akf(n2); /* initialize a_k and FFT it */ - for (size_t m=0; m<2*n; m+=2) - { - akf[m] = c[m]*bk[m] -isign*c[m+1]*bk[m+1]; - akf[m+1] = isign*c[m]*bk[m+1] + c[m+1]*bk[m]; - } - for (size_t m=2*n; m<2*n2; ++m) - akf[m]=0.*c[0]; + for (size_t m=0; m<n; ++m) + akf[m] = c[m].template special_mul<bwd>(bk[m]); + for (size_t m=n; m<n2; ++m) + akf[m]=akf[0]*T0(0); - plan.forward ((cmplx<T> *)akf.data(),1.); + plan.forward (akf.data(),1.); /* do the convolution */ - for (size_t m=0; m<2*n2; m+=2) - { - T im = -isign*akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; - akf[m ] = akf[m]*bkf[m] + isign*akf[m+1]*bkf[m+1]; - akf[m+1] = im; - } + for (size_t m=0; m<n2; ++m) + akf[m] = akf[m].template special_mul<!bwd>(bkf[m]); /* inverse FFT */ - plan.backward ((cmplx<T> *)akf.data(),1.); + plan.backward (akf.data(),1.); /* multiply by b_k */ - for (size_t m=0; m<2*n; m+=2) - { - c[m] = fct*(bk[m] *akf[m] - isign*bk[m+1]*akf[m+1]); - c[m+1] = fct*(isign*bk[m+1]*akf[m] + bk[m] *akf[m+1]); - } + for (size_t m=0; m<n; ++m) + c[m] = akf[m].template special_mul<bwd>(bk[m])*fct; } public: NOINLINE fftblue(size_t length) - : n(length), n2(good_size(n*2-1)), plan(n2), mem(2*(n+n2)), - bk(mem.data()), bkf(mem.data()+2*n) + : n(length), n2(good_size(n*2-1)), plan(n2), mem(n+n2), + bk(mem.data()), bkf(mem.data()+n) { /* initialize b_k */ - sincos_2pibyn<T0> tmp(2*n, false); - bk[0] = 1; - bk[1] = 0; + sincos_2pibyn<T0> tmp_(2*n, false); + auto tmp = tmp_.cdata(); + bk[0].Set(1, 0); size_t coeff=0; for (size_t m=1; m<n; ++m) { coeff+=2*m-1; if (coeff>=2*n) coeff-=2*n; - bk[2*m ] = tmp[2*coeff ]; - bk[2*m+1] = tmp[2*coeff+1]; + bk[m] = tmp[coeff]; } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ T0 xn2 = 1./n2; bkf[0] = bk[0]*xn2; - bkf[1] = bk[1]*xn2; - for (size_t m=2; m<2*n; m+=2) - { - bkf[m] = bkf[2*n2-m] = bk[m] *xn2; - bkf[m+1] = bkf[2*n2-m+1] = bk[m+1] *xn2; - } - for (size_t m=2*n;m<=(2*n2-2*n+1);++m) - bkf[m]=0.; - plan.forward((cmplx<T0> *)bkf,1.); + for (size_t m=1; m<n; ++m) + bkf[m] = bkf[n2-m] = bk[m]*xn2; + for (size_t m=n;m<=(n2-n);++m) + bkf[m].Set(0.,0.); + plan.forward(bkf,1.); } - template<typename T> NOINLINE void backward(T c[], T0 fct) + template<typename T> NOINLINE void backward(cmplx<T> c[], T0 fct) { fft<true>(c,fct); } - template<typename T> NOINLINE void forward(T c[], T0 fct) + template<typename T> NOINLINE void forward(cmplx<T> c[], T0 fct) { fft<false>(c,fct); } template<typename T> NOINLINE void backward_r(T c[], T0 fct) @@ -1855,22 +1831,19 @@ template<typename T0> class fftblue tmp[2*n-m]=tmp[m]; tmp[2*n-m+1]=-tmp[m+1]; } - fft<true>(tmp.data(),fct); + fft<true>((cmplx<T> *)tmp.data(),fct); for (size_t m=0; m<n; ++m) c[m] = tmp[2*m]; } template<typename T> NOINLINE void forward_r(T c[], T0 fct) { - arr<T> tmp(2*n); + arr<cmplx<T>> tmp(n); for (size_t m=0; m<n; ++m) - { - tmp[2*m] = c[m]; - tmp[2*m+1] = c[m]*0.; - } + tmp[m].Set(c[m], 0.*c[m]); fft<false>(tmp.data(),fct); - c[0] = tmp[0]; - memcpy (c+1, tmp.data()+2, (n-1)*sizeof(T)); + c[0] = tmp[0].r; + memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T)); } }; @@ -1907,13 +1880,13 @@ template<typename T0> class pocketfft_c template<typename T> NOINLINE void backward(T c[], T0 fct) { packplan ? packplan->backward((cmplx<T> *)c,fct) - : blueplan->backward(c,fct); + : blueplan->backward((cmplx<T> *)c,fct); } template<typename T> NOINLINE void forward(T c[], T0 fct) { packplan ? packplan->forward((cmplx<T> *)c,fct) - : blueplan->forward(c,fct); + : blueplan->forward((cmplx<T> *)c,fct); } size_t length() const { return len; } -- GitLab From 2cdcfbca47eb160b44961d160b93fb2193323a5f Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Fri, 3 May 2019 09:55:06 +0200 Subject: [PATCH 3/5] hide all vector-related attributes behind #ifdef --- pocketfft.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pocketfft.cc b/pocketfft.cc index d7ea57c..dab8544 100644 --- a/pocketfft.cc +++ b/pocketfft.cc @@ -2066,15 +2066,9 @@ template<> struct VTYPE<float> #else template<typename T> struct VTYPE{}; template<> struct VTYPE<double> - { - using type = double __attribute__ ((vector_size (8))); - static constexpr int vlen=1; - }; + { static constexpr int vlen=1; }; template<> struct VTYPE<float> - { - using type = float __attribute__ ((vector_size (4))); - static constexpr int vlen=1; - }; + { static constexpr int vlen=1; }; #endif template<typename T> arr<char> alloc_tmp(const shape_t &shape, @@ -2112,6 +2106,7 @@ template<typename T> NOINLINE void pocketfft_general_c( size_t len=it.length_in(); if ((!plan) || (len!=plan->length())) plan.reset(new pocketfft_c<T>(len)); +#if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { @@ -2130,6 +2125,7 @@ template<typename T> NOINLINE void pocketfft_general_c( for (size_t j=0; j<vlen; ++j) it.out(j,i).Set(tdatav[i].r[j],tdatav[i].i[j]); } +#endif while (it.remaining()>0) { it.advance(1); @@ -2171,6 +2167,7 @@ template<typename T> NOINLINE void pocketfft_general_hartley( size_t len=it.length_in(); if ((!plan) || (len!=plan->length())) plan.reset(new pocketfft_r<T>(len)); +#if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { @@ -2194,6 +2191,7 @@ template<typename T> NOINLINE void pocketfft_general_hartley( for (size_t j=0; j<vlen; ++j) it.out(j,i1) = tdatav[i][j]; } +#endif while (it.remaining()>0) { it.advance(1); @@ -2225,6 +2223,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c( constexpr int vlen = VTYPE<T>::vlen; multi_iter<vlen, T, cmplx<T>> it(in, out, axis); size_t len=in.shape(axis); +#if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { @@ -2245,6 +2244,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c( for (size_t j=0; j<vlen; ++j) it.out(j,ii).Set(tdatav[i][j]); } +#endif while (it.remaining()>0) { it.advance(1); @@ -2269,6 +2269,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r( constexpr int vlen = VTYPE<T>::vlen; multi_iter<vlen, cmplx<T>, T> it(in, out, axis); size_t len=out.shape(axis); +#if defined(HAVE_VECSUPPORT) if (vlen>1) while (it.remaining()>=vlen) { @@ -2292,6 +2293,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r( for (size_t j=0; j<vlen; ++j) it.out(j,i) = tdatav[i][j]; } +#endif while (it.remaining()>0) { it.advance(1); -- GitLab From 39524f2315536dc6560f958c2b237f8408f0508d Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Fri, 3 May 2019 13:21:55 +0200 Subject: [PATCH 4/5] restructuring; add interface for scipy emulation --- pocketfft.cc | 143 +++++++++++++++++++++++++++++++++++++++------------ test.py | 56 ++++++++++---------- 2 files changed, 135 insertions(+), 64 deletions(-) diff --git a/pocketfft.cc b/pocketfft.cc index dab8544..1c65916 100644 --- a/pocketfft.cc +++ b/pocketfft.cc @@ -976,8 +976,8 @@ template<typename T0> class rfftp #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] -template<typename T> NOINLINE void radf2 (size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T> NOINLINE void radf2 (size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=2; @@ -1001,8 +1001,8 @@ template<typename T> NOINLINE void radf2 (size_t ido, size_t l1, const T * restr } } -template<typename T> NOINLINE void radf3(size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T> NOINLINE void radf3(size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=0.86602540378443864676; @@ -1035,8 +1035,8 @@ template<typename T> NOINLINE void radf3(size_t ido, size_t l1, const T * restri } } -template<typename T> NOINLINE void radf4(size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T> NOINLINE void radf4(size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; constexpr T0 hsqt2=0.70710678118654752440; @@ -1076,8 +1076,8 @@ template<typename T> NOINLINE void radf4(size_t ido, size_t l1, const T * restri } } -template<typename T> NOINLINE void radf5(size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T> NOINLINE void radf5(size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212, @@ -1301,8 +1301,8 @@ template<typename T> NOINLINE void radb2(size_t ido, size_t l1, const T * restri } } -template<typename T> NOINLINE void radb3(size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T> NOINLINE void radb3(size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=3; constexpr T0 taur=-0.5, taui=0.86602540378443864676; @@ -1336,8 +1336,8 @@ template<typename T> NOINLINE void radb3(size_t ido, size_t l1, const T * restri } } -template<typename T> NOINLINE void radb4(size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T> NOINLINE void radb4(size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=4; constexpr T0 sqrt2=1.41421356237309504880; @@ -1382,8 +1382,8 @@ template<typename T> NOINLINE void radb4(size_t ido, size_t l1, const T * restri } } -template<typename T>NOINLINE void radb5(size_t ido, size_t l1, const T * restrict cc, - T * restrict ch, const T0 * restrict wa) +template<typename T>NOINLINE void radb5(size_t ido, size_t l1, + const T * restrict cc, T * restrict ch, const T0 * restrict wa) { constexpr size_t cdim=5; constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212, @@ -1821,19 +1821,15 @@ template<typename T0> class fftblue template<typename T> NOINLINE void backward_r(T c[], T0 fct) { - arr<T> tmp(2*n); - tmp[0]=c[0]; - tmp[1]=c[0]*0.; - memcpy (tmp.data()+2,c+1, (n-1)*sizeof(T)); - if ((n&1)==0) tmp[n+1]=c[0]*0.; - for (size_t m=2; m<n; m+=2) - { - tmp[2*n-m]=tmp[m]; - tmp[2*n-m+1]=-tmp[m+1]; - } - fft<true>((cmplx<T> *)tmp.data(),fct); + arr<cmplx<T>> tmp(n); + tmp[0].Set(c[0],c[0]*0); + memcpy ((void *)(tmp.data()+1),(void *)(c+1), (n-1)*sizeof(T)); + if ((n&1)==0) tmp[n/2].i=c[0]*0.; + for (size_t m=1; 2*m<n; ++m) + tmp[n-m].Set(tmp[m].r, -tmp[m].i); + fft<true>(tmp.data(),fct); for (size_t m=0; m<n; ++m) - c[m] = tmp[2*m]; + c[m] = tmp[m].r; } template<typename T> NOINLINE void forward_r(T c[], T0 fct) @@ -2056,12 +2052,12 @@ template<typename T> struct VTYPE{}; template<> struct VTYPE<double> { using type = double __attribute__ ((vector_size (VBYTELEN))); - static constexpr int vlen=VBYTELEN/8; + static constexpr int vlen=VBYTELEN/sizeof(double); }; template<> struct VTYPE<float> { using type = float __attribute__ ((vector_size (VBYTELEN))); - static constexpr int vlen=VBYTELEN/4; + static constexpr int vlen=VBYTELEN/sizeof(float); }; #else template<typename T> struct VTYPE{}; @@ -2087,7 +2083,8 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape, { auto axsize = shape[axes[i]]; auto othersize = fullsize/axsize; - tmpsize = max(tmpsize, axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1)); + tmpsize = max(tmpsize, + axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1)); } return arr<char>(tmpsize*elemsize); } @@ -2119,7 +2116,7 @@ template<typename T> NOINLINE void pocketfft_general_c( tdatav[i].r[j] = it.in(j,i).r; tdatav[i].i[j] = it.in(j,i).i; } - forward ? plan->forward((vtype *)tdatav, fct) + forward ? plan->forward ((vtype *)tdatav, fct) : plan->backward((vtype *)tdatav, fct); for (size_t i=0; i<len; ++i) for (size_t j=0; j<vlen; ++j) @@ -2131,20 +2128,20 @@ template<typename T> NOINLINE void pocketfft_general_c( it.advance(1); auto tdata = (cmplx<T> *)storage.data(); if (it.inplace() && it.contiguous_out()) // fully in-place - forward ? plan->forward((T *)(&it.in(0)), fct) - : plan->backward((T *)(&it.in(0)), fct); + forward ? plan->forward ((T *)(&it.out(0)), fct) + : plan->backward((T *)(&it.out(0)), fct); else if (it.contiguous_out()) // compute FFT in output location { for (size_t i=0; i<len; ++i) it.out(i) = it.in(i); - forward ? plan->forward((T *)(&it.out(0)), fct) + forward ? plan->forward ((T *)(&it.out(0)), fct) : plan->backward((T *)(&it.out(0)), fct); } else { for (size_t i=0; i<len; ++i) tdata[i] = it.in(i); - forward ? plan->forward((T *)tdata, fct) + forward ? plan->forward ((T *)tdata, fct) : plan->backward((T *)tdata, fct); for (size_t i=0; i<len; ++i) it.out(i) = tdata[i]; @@ -2313,6 +2310,58 @@ template<typename T> NOINLINE void pocketfft_general_c2r( } } +template<typename T> NOINLINE void pocketfft_general_r( + const ndarr<T> &in, ndarr<T> &out, int axis, bool forward, T fct) + { + auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T)); + + constexpr int vlen = VTYPE<T>::vlen; + multi_iter<vlen, T, T> it(in, out, axis); + size_t len=it.length_in(); + pocketfft_r<T> plan(len); +#if defined(HAVE_VECSUPPORT) + if (vlen>1) + while (it.remaining()>=vlen) + { + using vtype = typename VTYPE<T>::type; + it.advance(vlen); + auto tdatav = (vtype *)storage.data(); + for (size_t i=0; i<len; ++i) + for (size_t j=0; j<vlen; ++j) + tdatav[i][j] = it.in(j,i); + forward ? plan.forward (tdatav, fct) + : plan.backward(tdatav, fct); + for (size_t i=0; i<len; ++i) + for (size_t j=0; j<vlen; ++j) + it.out(j,i) = tdatav[i][j]; + } +#endif + while (it.remaining()>0) + { + it.advance(1); + auto tdata = (T *)storage.data(); + if (it.inplace() && it.contiguous_out()) // fully in-place + forward ? plan.forward (&it.out(0), fct) + : plan.backward(&it.out(0), fct); + else if (it.contiguous_out()) // compute FFT in output location + { + for (size_t i=0; i<len; ++i) + it.out(i) = it.in(i); + forward ? plan.forward (&it.out(0), fct) + : plan.backward(&it.out(0), fct); + } + else + { + for (size_t i=0; i<len; ++i) + tdata[i] = it.in(i); + forward ? plan.forward (tdata, fct) + : plan.backward(tdata, fct); + for (size_t i=0; i<len; ++i) + it.out(i) = tdata[i]; + } + } + } + // // Python interface // @@ -2410,6 +2459,28 @@ py::array rfftn(const py::array &in, py::object axes_, double fct) return tcheck(in, f64, f32) ? rfftn_internal<double>(in, axes_, fct) : rfftn_internal<float> (in, axes_, fct); } +template<typename T> py::array xrfft_scipy(const py::array &in, + int axis, double fct, bool inplace, bool fwd) + { + auto dims(copy_shape(in)); + py::array res = inplace ? in : py::array_t<T>(dims); + ndarr<T> ain(in.data(), dims, copy_strides(in)); + ndarr<T> aout(res.mutable_data(), dims, copy_strides(res)); + pocketfft_general_r<T>(ain, aout, axis, fwd, fct); + return res; + } +py::array rfft_scipy(const py::array &in, int axis, double fct, bool inplace) + { + return tcheck(in, f64, f32) ? + xrfft_scipy<double>(in, axis, fct, inplace, true) : + xrfft_scipy<float> (in, axis, fct, inplace, true); + } +py::array irfft_scipy(const py::array &in, int axis, double fct, bool inplace) + { + return tcheck(in, f64, f32) ? + xrfft_scipy<double>(in, axis, fct, inplace, false) : + xrfft_scipy<float> (in, axis, fct, inplace, false); + } template<typename T> py::array irfftn_internal(const py::array &in, py::object axes_, size_t lastsize, T fct) { @@ -2639,6 +2710,10 @@ PYBIND11_MODULE(pypocketfft, m) m.def("ifftn",&ifftn, ifftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1., "inplace"_a=false); m.def("rfftn",&rfftn, rfftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.); + m.def("rfft_scipy",&rfft_scipy, "a"_a, "axis"_a, "fct"_a=1., + "inplace"_a=false); + m.def("irfft_scipy",&irfft_scipy, "a"_a, "axis"_a, "fct"_a=1., + "inplace"_a=false); m.def("irfftn",&irfftn, irfftn_DS, "a"_a, "axes"_a=py::none(), "lastsize"_a=0, "fct"_a=1.); m.def("hartley",&hartley, hartley_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1., diff --git a/test.py b/test.py index 181f545..be6f5c7 100644 --- a/test.py +++ b/test.py @@ -32,96 +32,92 @@ def test1D(len): @pmp("shp", shapes) def test_fftn(shp): a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j - vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.fftn(a))) - assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<1e-15*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<1e-15) a=a.astype(np.complex64) - assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<5e-7*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<5e-7) @pmp("shp", shapes2D) @pmp("axes", ((0,),(1,),(0,1),(1,0))) def test_fftn2D(shp, axes): a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j - vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes))) - assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<1e-15*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<1e-15) a=a.astype(np.complex64) - assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<5e-7*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<5e-7) @pmp("shp", shapes) def test_rfftn(shp): a=np.random.rand(*shp)-0.5 - vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.fftn(a))) - assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<1e-15*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<1e-15) a=a.astype(np.float32) - assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<5e-7*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<5e-7) + +@pmp("shp", shapes) +def test_rfft_scipy(shp): + for i in range(len(shp)): + a=np.random.rand(*shp)-0.5 + assert(_l2error(pyfftw.interfaces.scipy_fftpack.rfft(a,axis=i), pypocketfft.rfft_scipy(a,axis=i))<1e-15) + assert(_l2error(pyfftw.interfaces.scipy_fftpack.irfft(a,axis=i), pypocketfft.irfft_scipy(a,axis=i,fct=1./a.shape[i]))<1e-15) @pmp("shp", shapes2D) @pmp("axes", ((0,),(1,),(0,1),(1,0))) def test_rfftn2D(shp, axes): a=np.random.rand(*shp)-0.5 - vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes))) - assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<1e-15*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<1e-15) a=a.astype(np.float32) - assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<5e-7*vmax) + assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<5e-7) @pmp("shp", shapes) def test_identity(shp): a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j - vmax = np.max(np.abs(a)) - assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1.5e-15*vmax) + assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1.5e-15) tmp=a.copy() assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./a.size, inplace=True) is tmp) - assert(_l2error(tmp, a)<1.5e-15*vmax) + assert(_l2error(tmp, a)<1.5e-15) a=a.astype(np.complex64) - assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<6e-7*vmax) + assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<6e-7) @pmp("shp", shapes) def test_identity_r(shp): a=np.random.rand(*shp)-0.5 b=a.astype(np.float32) - vmax = np.max(np.abs(a)) for ax in range(a.ndim): n = a.shape[ax] - assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(a,(ax,)),(ax,),lastsize=n,fct=1./n), a)<1e-15*vmax) - assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(b,(ax,)),(ax,),lastsize=n,fct=1./n), b)<5e-7*vmax) + assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(a,(ax,)),(ax,),lastsize=n,fct=1./n), a)<1e-15) + assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(b,(ax,)),(ax,),lastsize=n,fct=1./n), b)<5e-7) @pmp("shp", shapes) def test_identity_r2(shp): a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j a=pypocketfft.rfftn(pypocketfft.irfftn(a)) fct = 1./pypocketfft.irfftn(a).size - vmax = np.max(np.abs(a)) - assert(_l2error(pypocketfft.rfftn(pypocketfft.irfftn(a),fct=fct), a)<1e-15*vmax) + assert(_l2error(pypocketfft.rfftn(pypocketfft.irfftn(a),fct=fct), a)<1e-15) @pmp("shp", shapes2D+shapes3D) def test_hartley2(shp): a=np.random.rand(*shp)-0.5 v1 = pypocketfft.hartley2(a) v2 = pypocketfft.fftn(a.astype(np.complex128)) - vmax = np.max(np.abs(v1)) v2 = v2.real+v2.imag - assert(_l2error(v1, v2)<1e-15*vmax) + assert(_l2error(v1, v2)<1e-15) @pmp("shp", shapes) def test_hartley_identity(shp): a=np.random.rand(*shp)-0.5 v1 = pypocketfft.hartley(pypocketfft.hartley(a))/a.size - vmax = np.max(np.abs(a)) - assert(_l2error(a, v1)<1e-15*vmax) + assert(_l2error(a, v1)<1e-15) @pmp("shp", shapes) def test_hartley2_identity(shp): a=np.random.rand(*shp)-0.5 v1 = pypocketfft.hartley2(pypocketfft.hartley2(a))/a.size - vmax = np.max(np.abs(a)) - assert(_l2error(a, v1)<1e-15*vmax) + assert(_l2error(a, v1)<1e-15) v1 = a.copy() assert (pypocketfft.hartley2(pypocketfft.hartley2(v1, inplace=True), fct=1./a.size, inplace=True) is v1) - assert(_l2error(a, v1)<1e-15*vmax) + assert(_l2error(a, v1)<1e-15) @pmp("shp", shapes2D) @pmp("axes", ((0,),(1,),(0,1),(1,0))) def test_hartley2_2D(shp, axes): a=np.random.rand(*shp)-0.5 fct = 1./np.prod(np.take(shp,axes)) - vmax = np.max(np.abs(a)) - assert(_l2error(pypocketfft.hartley2(pypocketfft.hartley2(a, axes=axes), axes=axes, fct=fct),a)<1e-15*vmax) + assert(_l2error(pypocketfft.hartley2(pypocketfft.hartley2(a, axes=axes), axes=axes, fct=fct),a)<1e-15) -- GitLab From 5f09d105e716d5f77f0cf79c527f1052b9b2671c Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Sat, 4 May 2019 12:50:27 +0200 Subject: [PATCH 5/5] small type fixes --- pocketfft.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pocketfft.cc b/pocketfft.cc index 1c65916..08dd1be 100644 --- a/pocketfft.cc +++ b/pocketfft.cc @@ -2311,7 +2311,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r( } template<typename T> NOINLINE void pocketfft_general_r( - const ndarr<T> &in, ndarr<T> &out, int axis, bool forward, T fct) + const ndarr<T> &in, ndarr<T> &out, size_t axis, bool forward, T fct) { auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T)); @@ -2460,7 +2460,7 @@ py::array rfftn(const py::array &in, py::object axes_, double fct) : rfftn_internal<float> (in, axes_, fct); } template<typename T> py::array xrfft_scipy(const py::array &in, - int axis, double fct, bool inplace, bool fwd) + size_t axis, double fct, bool inplace, bool fwd) { auto dims(copy_shape(in)); py::array res = inplace ? in : py::array_t<T>(dims); @@ -2469,13 +2469,14 @@ template<typename T> py::array xrfft_scipy(const py::array &in, pocketfft_general_r<T>(ain, aout, axis, fwd, fct); return res; } -py::array rfft_scipy(const py::array &in, int axis, double fct, bool inplace) +py::array rfft_scipy(const py::array &in, size_t axis, double fct, bool inplace) { return tcheck(in, f64, f32) ? xrfft_scipy<double>(in, axis, fct, inplace, true) : xrfft_scipy<float> (in, axis, fct, inplace, true); } -py::array irfft_scipy(const py::array &in, int axis, double fct, bool inplace) +py::array irfft_scipy(const py::array &in, size_t axis, double fct, + bool inplace) { return tcheck(in, f64, f32) ? xrfft_scipy<double>(in, axis, fct, inplace, false) : -- GitLab