From d556ae211ce1da3ff7e2c460cf67e37490ced917 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 22 Jul 2019 09:37:19 +0200 Subject: [PATCH 1/4] cleanup DCT/DST, part 1 --- pocketfft_hdronly.h | 133 +++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 83 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index eba3398..b875db8 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2342,40 +2342,29 @@ template class T_dct2 { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); - if (N==1) - c[0]*=2*fct; - else if (N==2) + size_t NS2 = (N+1)/2; + for (size_t i=2; i class T_dct3 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); if (ortho) c[0]*=sqrt2; - if (N==1) - c[0]*=fct; - else if (N==2) + size_t NS2 = (N+1)/2; + for (size_t k=1, kc=N-1; k class T_dst2 { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); - if (N==1) - c[0]*=2*fct; - else - { - for (size_t k=1; k class T_dst3 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); if (ortho) c[0]*=sqrt2; - if (N==1) - c[0]*=fct; - else - { - size_t NS2 = N/2; - for (size_t k=0, kc=N-1; k class T_dst4 template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) { size_t N=length(); - //if (N==1) { c[0]*=fct; return; } size_t NS2 = N/2; for (size_t k=0, kc=N-1; k Date: Mon, 22 Jul 2019 10:22:43 +0200 Subject: [PATCH 2/4] consolidate trigonometric transforms to save cache space and precomputation time --- pocketfft_hdronly.h | 258 +++++++++++++++++++------------------------- 1 file changed, 112 insertions(+), 146 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index b875db8..ac2ac7f 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2293,6 +2293,7 @@ template class pocketfft_r // // sine/cosine transforms // + template class T_dct1 { private: @@ -2302,7 +2303,8 @@ template class T_dct1 POCKETFFT_NOINLINE T_dct1(size_t length) : fftplan(2*(length-1)) {} - template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho) const + template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=fftplan.length(), n=N/2+1; @@ -2323,22 +2325,42 @@ template class T_dct1 size_t length() const { return fftplan.length()/2+1; } }; -template class T_dct2 +template class T_dst1 { private: pocketfft_r fftplan; - vector twiddle; public: - POCKETFFT_NOINLINE T_dct2(size_t length) - : fftplan(length), twiddle(length) + POCKETFFT_NOINLINE T_dst1(size_t length) + : fftplan(2*(length+1)) {} + + template POCKETFFT_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/) const { - constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); - for (size_t i=0; i tmp(N); + tmp[0] = tmp[n+1] = c[0]*0; + for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho) const + size_t length() const { return fftplan.length()/2-1; } + }; + +template class T_dcst23 + { + private: + pocketfft_r fftplan; + vector twiddle; + + template POCKETFFT_NOINLINE void exec2c(T c[], T0 fct, + bool ortho) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); @@ -2368,25 +2390,8 @@ template class T_dct2 if (ortho) c[0]/=sqrt2; } - size_t length() const { return fftplan.length(); } - }; - -template class T_dct3 - { - private: - pocketfft_r fftplan; - vector twiddle; - - public: - POCKETFFT_NOINLINE T_dct3(size_t length) - : fftplan(length), twiddle(length) - { - constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); - for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho) const + template POCKETFFT_NOINLINE void exec3c(T c[], T0 fct, + bool ortho) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); @@ -2417,10 +2422,55 @@ template class T_dct3 } } + template POCKETFFT_NOINLINE void exec2s(T c[], T0 fct, + bool ortho) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + for (size_t k=1; k POCKETFFT_NOINLINE void exec3s(T c[], T0 fct, + bool ortho) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + if (ortho) c[0]*=sqrt2; + size_t NS2 = N/2; + for (size_t k=0, kc=N-1; k POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, + int type, bool cosine) const + { + if (type==2) + cosine ? exec2c(c, fct, ortho) : exec2s(c, fct, ortho); + else + cosine ? exec3c(c, fct, ortho) : exec3s(c, fct, ortho); + } + size_t length() const { return fftplan.length(); } }; -template class T_dct4 +template class T_dcst4 { // even length algorithm from // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ @@ -2430,23 +2480,7 @@ template class T_dct4 unique_ptr> rfft; arr> C2; - public: - POCKETFFT_NOINLINE T_dct4(size_t length) - : N(length), - fft((N&1) ? nullptr : new pocketfft_c(N/2)), - rfft((N&1)? new pocketfft_r(N) : nullptr), - C2((N&1) ? 0 : N/2) - { - constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); - if ((N&1)==0) - for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) const + template POCKETFFT_NOINLINE void exec4c(T c[], T0 fct) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); if (N&1) @@ -2512,106 +2546,38 @@ template class T_dct4 } } - size_t length() const { return N; } - }; - -template class T_dst1 - { - private: - pocketfft_r fftplan; - - public: - POCKETFFT_NOINLINE T_dst1(size_t length) - : fftplan(2*(length+1)) {} - - template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) const + template POCKETFFT_NOINLINE void exec4s(T c[], T0 fct) const { - size_t N=fftplan.length(), n=N/2-1; - arr tmp(N); - tmp[0] = tmp[n+1] = c[0]*0; - for (size_t i=0; i class T_dst2 - { - private: - T_dct2 dct; - - public: - POCKETFFT_NOINLINE T_dst2(size_t length) - : dct(length) {} - - template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho) const - { - constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); - for (size_t k=1; k class T_dst3 - { - private: - T_dct3 dct; - - public: - POCKETFFT_NOINLINE T_dst3(size_t length) - : dct(length) {} - - template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho) - { - constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); - size_t N=length(); - if (ortho) c[0]*=sqrt2; size_t NS2 = N/2; for (size_t k=0, kc=N-1; k class T_dst4 - { - private: - T_dct4 dct; - public: - POCKETFFT_NOINLINE T_dst4(size_t length) - : dct(length) {} - - template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/) + POCKETFFT_NOINLINE T_dcst4(size_t length) + : N(length), + fft((N&1) ? nullptr : new pocketfft_c(N/2)), + rfft((N&1)? new pocketfft_r(N) : nullptr), + C2((N&1) ? 0 : N/2) { - size_t N=length(); - size_t NS2 = N/2; - for (size_t k=0, kc=N-1; k POCKETFFT_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine) const + { cosine ? exec4c(c, fct) : exec4s(c, fct); } + + size_t length() const { return N; } }; @@ -3060,7 +3026,7 @@ template POCKETFFT_NOINLINE void general_hartley( template POCKETFFT_NOINLINE void general_dcst( const cndarr &in, ndarr &out, const shape_t &axes, - T fct, bool ortho, size_t POCKETFFT_NTHREADS) + T fct, bool ortho, int type, bool cosine, size_t POCKETFFT_NTHREADS) { shared_ptr plan; @@ -3088,7 +3054,7 @@ template POCKETFFT_NOINLINE void general_dcst( for (size_t i=0; iexec(tdatav, fct, ortho); + plan->exec(tdatav, fct, ortho, type, cosine); for (size_t i=0; i POCKETFFT_NOINLINE void general_dcst( it.advance(1); auto tdata = reinterpret_cast(storage.data()); if ((&tin[0]==&out[0]) && (it.stride_out()==sizeof(T))) // fully in-place - plan->exec(&out[it.oofs(0)], fct, ortho); + plan->exec(&out[it.oofs(0)], fct, ortho, type, cosine); else if (it.stride_out()==sizeof(T)) // compute FFT in output location { for (size_t i=0; iexec(&out[it.oofs(0)], fct, ortho); + plan->exec(&out[it.oofs(0)], fct, ortho, type, cosine); } else { for (size_t i=0; iexec(tdata, fct, ortho); + plan->exec(tdata, fct, ortho, type, cosine); for (size_t i=0; i void dct(const shape_t &shape, cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else if (type==2) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else if (type==3) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else throw runtime_error("unsupported DCT type"); } @@ -3395,13 +3361,13 @@ template void dst(const shape_t &shape, cndarr ain(data_in, shape, stride_in); ndarr aout(data_out, shape, stride_out); if (type==1) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else if (type==2) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else if (type==3) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else if (type==4) - general_dcst>(ain, aout, axes, fct, ortho, nthreads); + general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else throw runtime_error("unsupported DST type"); } -- GitLab From c9dc81da35a351b76657f89917e9ef02ebfe7608 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 22 Jul 2019 13:15:59 +0200 Subject: [PATCH 3/4] more polishing --- pocketfft_hdronly.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index ac2ac7f..96cdf05 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2492,10 +2492,9 @@ template class T_dcst4 auto SGN_SET = [](T x, size_t i) {return (i%2) ? -x : x;}; arr y(N); size_t n2 = N/2; - size_t i; { - size_t m; - for (i=0, m=n2; m class T_dcst4 y[i] = c[m]; } rfft->forward(y.data(), fct); - for (i=0; i+i+1 class T_dcst4 c[i] = sqrt2 * (SGN_SET(cx, (i+1)/2) + SGN_SET(sx, i/2)); c[N-(i+1)] = sqrt2 * (SGN_SET(cx, (i+2)/2) + SGN_SET(sx, (i+1)/2)); } + } c[n2] = sqrt2 * SGN_SET(y[0], (n2+1)/2); // FFTW-derived code ends here } else { + // even length algorithm from + // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ arr> y(N/2); for(size_t i=0; i void dct(const shape_t &shape, ndarr aout(data_out, shape, stride_out); if (type==1) general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); - else if (type==2) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); - else if (type==3) - general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else if (type==4) general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); else - throw runtime_error("unsupported DCT type"); + general_dcst>(ain, aout, axes, fct, ortho, type, true, nthreads); } template void dst(const shape_t &shape, @@ -3362,14 +3362,10 @@ template void dst(const shape_t &shape, ndarr aout(data_out, shape, stride_out); if (type==1) general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); - else if (type==2) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); - else if (type==3) - general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else if (type==4) general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); else - throw runtime_error("unsupported DST type"); + general_dcst>(ain, aout, axes, fct, ortho, type, false, nthreads); } template void r2c(const shape_t &shape_in, -- GitLab From 88f64fcf1d8ef54025a19c619f5ce8b3e8c58a02 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 22 Jul 2019 15:04:17 +0200 Subject: [PATCH 4/4] more compactification --- pocketfft_hdronly.h | 157 +++++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 95 deletions(-) diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 96cdf05..b74b1c6 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -209,13 +209,19 @@ template struct cmplx { r = tmp; return *this; } - cmplx operator+ (const cmplx &other) const - { return cmplx(r+other.r, i+other.i); } - cmplx operator- (const cmplx &other) const - { return cmplx(r-other.r, i-other.i); } + templatecmplx &operator+= (const cmplx &other) + { r+=other.r; i+=other.i; return *this; } + templatecmplx &operator-= (const cmplx &other) + { r-=other.r; i-=other.i; return *this; } template auto operator* (const T2 &other) const -> cmplx { return {r*other, i*other}; } + template auto operator+ (const cmplx &other) const + -> cmplx + { return {r+other.r, i+other.i}; } + template auto operator- (const cmplx &other) const + -> cmplx + { return {r-other.r, i-other.i}; } template auto operator* (const cmplx &other) const -> cmplx { return {r*other.r-i*other.i, r*other.i + i*other.r}; } @@ -227,8 +233,9 @@ template struct cmplx { : Tres(r*other.r-i*other.i, r*other.i+i*other.r); } }; -template void PMC(cmplx &a, cmplx &b, - const cmplx &c, const cmplx &d) +template inline void PMINPLACE(T &a, T &b) + { T t = a; a+=b; b=t-b; } +template void PMC(T &a, T &b, const T &c, const T &d) { a = c+d; b = c-d; } template cmplx conj(const cmplx &a) { return {a.r, -a.i}; } @@ -845,8 +852,6 @@ template void ROTX135(T &a) else { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } } -template inline void PMINPLACE(T &a, T &b) - { T t = a; a.r+=b.r; a.i+=b.i; b.r=t.r-b.r; b.i=t.i-b.i; } template void pass8 (size_t ido, size_t l1, const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch, @@ -2359,12 +2364,15 @@ template class T_dcst23 pocketfft_r fftplan; vector twiddle; - template POCKETFFT_NOINLINE void exec2c(T c[], T0 fct, - bool ortho) const + template POCKETFFT_NOINLINE void exec2(T c[], T0 fct, + bool ortho, bool cosine) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); size_t NS2 = (N+1)/2; + if (!cosine) + for (size_t k=1; k class T_dcst23 if ((N&1)==0) c[NS2] = twiddle[NS2-1]*(c[NS2]+c[NS2]); for (size_t k=1, kc=N-1; k POCKETFFT_NOINLINE void exec3c(T c[], T0 fct, - bool ortho) const + template POCKETFFT_NOINLINE void exec3(T c[], T0 fct, + bool ortho, bool cosine) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); if (ortho) c[0]*=sqrt2; size_t NS2 = (N+1)/2; + if (!cosine) + for (size_t k=0, kc=N-1; k class T_dcst23 c[i] += c[i-1]; c[i-1] = xim1; } - } - - template POCKETFFT_NOINLINE void exec2s(T c[], T0 fct, - bool ortho) const - { - constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); - size_t N=length(); - for (size_t k=1; k POCKETFFT_NOINLINE void exec3s(T c[], T0 fct, - bool ortho) const - { - constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); - size_t N=length(); - if (ortho) c[0]*=sqrt2; - size_t NS2 = N/2; - for (size_t k=0, kc=N-1; k class T_dcst23 template POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int type, bool cosine) const - { - if (type==2) - cosine ? exec2c(c, fct, ortho) : exec2s(c, fct, ortho); - else - cosine ? exec3c(c, fct, ortho) : exec3s(c, fct, ortho); - } + { type==2 ? exec2(c, fct, ortho, cosine) : exec3(c, fct, ortho, cosine); } size_t length() const { return fftplan.length(); } }; @@ -2480,9 +2457,30 @@ template class T_dcst4 unique_ptr> rfft; arr> C2; - template POCKETFFT_NOINLINE void exec4c(T c[], T0 fct) const + public: + POCKETFFT_NOINLINE T_dcst4(size_t length) + : N(length), + fft((N&1) ? nullptr : new pocketfft_c(N/2)), + rfft((N&1)? new pocketfft_r(N) : nullptr), + C2((N&1) ? 0 : N/2) + { + constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); + if ((N&1)==0) + for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t n2 = N/2; + if (!cosine) + for (size_t k=0, kc=N-1; k class T_dcst4 auto SGN_SET = [](T x, size_t i) {return (i%2) ? -x : x;}; arr y(N); - size_t n2 = N/2; { size_t i=0, m=n2; for (; m class T_dcst4 { // even length algorithm from // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/ - arr> y(N/2); - for(size_t i=0; i> y(n2); + for(size_t i=0; iforward(y.data(), fct); - for(size_t i=0; i POCKETFFT_NOINLINE void exec4s(T c[], T0 fct) const - { - size_t N=length(); - size_t NS2 = N/2; - for (size_t k=0, kc=N-1; k(N/2)), - rfft((N&1)? new pocketfft_r(N) : nullptr), - C2((N&1) ? 0 : N/2) - { - constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); - if ((N&1)==0) - for (size_t i=0; i POCKETFFT_NOINLINE void exec(T c[], T0 fct, - bool /*ortho*/, int /*type*/, bool cosine) const - { cosine ? exec4c(c, fct) : exec4s(c, fct); } - size_t length() const { return N; } }; -- GitLab