diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h index 6a94b114910f759a8165aaa7b605eae19ad52060..1923c15c3c27c2c4cac36dfc45a3693022fea91c 100644 --- a/pocketfft_hdronly.h +++ b/pocketfft_hdronly.h @@ -2276,6 +2276,144 @@ template<typename T0> class pocketfft_r size_t length() const { return len; } }; + +// +// sine/cosine transforms +// + +template<typename T0> class T_cosq + { + private: + pocketfft_r<T0> fftplan; + vector<T0> twiddle; + + public: + POCKETFFT_NOINLINE T_cosq(size_t length) + : fftplan(length), twiddle(length) + { + constexpr T0 pi = T0(3.141592653589793238462643383279502884197L); + for (size_t i=0; i<length; ++i) + twiddle[i] = T0(cos(0.5*pi*T0(i+1)/T0(length))); + } + + template<typename T> POCKETFFT_NOINLINE void backward(T c[], T0 fct) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + if (N==1) { c[0]*=2*fct; return; } + if (N==2) + { + T x1 = 2*fct*(c[0]+c[1]); + c[1] = sqrt2*fct*(c[0]-c[1]); + c[0] = x1; + return; + } + size_t NS2 = (N+1)/2; + for (size_t i=2; i<N; i+=2) + { + T xim1 = T0(0.5)*(c[i-1]+c[i]); + c[i] = T0(0.5)*(c[i]-c[i-1]); + c[i-1] = xim1; + } +// c[0] *= 2; +// if ((N&1)==0) +// c[N-1] *= 2; + fftplan.backward(c, fct); + for (size_t k=1, kc=N-1; k<NS2; ++k, --kc) + { + T tmp = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k]; + c[kc] = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc]; + c[k] = tmp; + } + if ((N&1)==0) + c[NS2] = twiddle[NS2-1]*(c[NS2]+c[NS2]); + for (size_t k=1, kc=N-1; k<NS2; ++k, --kc) + { + T tmp = c[k]+c[kc]; + c[kc] = c[k]-c[kc]; + c[k] = tmp; + } + c[0] *= 2; + } + + template<typename T> POCKETFFT_NOINLINE void forward(T c[], T0 fct) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + if (N==1) { c[0]*=fct; return; } + if (N==2) + { + T TSQX = sqrt2*c[1]; + c[1] = c[0]-TSQX; + c[0] = c[0]+TSQX; + return; + } + size_t NS2 = (N+1)/2; + for (size_t k=1, kc=N-1; k<NS2; ++k, --kc) + { + T tmp = c[k]-c[kc]; + c[k] = c[k]+c[kc]; + c[kc] = tmp; + } + if ((N&1)==0) + c[NS2] = c[NS2]+c[NS2]; + for (size_t k=1, kc=N-1; k<NS2; ++k, --kc) + { + T tmp = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc]; + c[k] = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k]; + c[kc] = tmp; + } + if ((N&1)==0) + c[NS2] = twiddle[NS2-1]*c[NS2]; + fftplan.forward(c, fct); + for (size_t i=2; i<N; i+=2) + { + T xim1 = c[i-1]-c[i]; + c[i] += c[i-1]; + c[i-1] = xim1; + } + } + + size_t length() const { return fftplan.length(); } + }; + +template<typename T0> class T_sinq + { + private: + T_cosq<T0> cosq; + + public: + POCKETFFT_NOINLINE T_sinq(size_t length) + : cosq(length) {} + + template<typename T> POCKETFFT_NOINLINE void backward(T c[], T0 fct) + { + size_t N=length(); + if (N==1) { c[0]*=2*fct; return; } + + for (size_t k=1; k<N; k+=2) + c[k] = -c[k]; + cosq.backward(c, fct); + for (size_t k=0, kc=N-1; k<kc; ++k, --kc) + swap(c[k], c[kc]); + } + + template<typename T> POCKETFFT_NOINLINE void forward(T c[], T0 fct) + { + size_t N=length(); + if (N==1) { c[0]*=fct; return; } + size_t NS2 = N/2; + for (size_t k=0, kc=N-1; k<NS2; ++k, --kc) + swap(c[k], c[kc]); + cosq.forward(c, fct); + for (size_t k=1; k<N; k+=2) + c[k] = -c[k]; + } + + size_t length() const { return cosq.length(); } + }; + + // // multi-D infrastructure // @@ -2719,6 +2857,133 @@ template<typename T> POCKETFFT_NOINLINE void general_hartley( } } +template<typename T> POCKETFFT_NOINLINE void general_dct23( + const cndarr<T> &in, ndarr<T> &out, const shape_t &axes, bool forward, T fct, + size_t POCKETFFT_NTHREADS) + { + shared_ptr<T_cosq<T>> plan; + + for (size_t iax=0; iax<axes.size(); ++iax) + { + constexpr auto vlen = VLEN<T>::val; + size_t len=in.shape(axes[iax]); + if ((!plan) || (len!=plan->length())) + plan = get_plan<T_cosq<T>>(len); + +#ifdef POCKETFFT_OPENMP +#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) +#endif +{ + auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T)); + const auto &tin(iax==0 ? in : out); + multi_iter<vlen> it(tin, out, axes[iax]); +#ifndef POCKETFFT_NO_VECTORS + if (vlen>1) + while (it.remaining()>=vlen) + { + using vtype = typename VTYPE<T>::type; + it.advance(vlen); + auto tdatav = reinterpret_cast<vtype *>(storage.data()); + for (size_t i=0; i<len; ++i) + for (size_t j=0; j<vlen; ++j) + tdatav[i][j] = tin[it.iofs(j,i)]; + forward ? plan->forward (tdatav, fct) : plan->backward(tdatav, fct); + for (size_t i=0; i<len; ++i) + for (size_t j=0; j<vlen; ++j) + out[it.oofs(j,i)] = tdatav[i][j]; + } +#endif + while (it.remaining()>0) + { + it.advance(1); + auto tdata = reinterpret_cast<T *>(storage.data()); + if ((&tin[0]==&out[0]) && (it.stride_out()==sizeof(T))) // fully in-place + forward ? plan->forward (&out[it.oofs(0)], fct) + : plan->backward(&out[it.oofs(0)], fct); + else if (it.stride_out()==sizeof(T)) // compute FFT in output location + { + for (size_t i=0; i<len; ++i) + out[it.oofs(i)] = tin[it.iofs(i)]; + forward ? plan->forward (&out[it.oofs(0)], fct) + : plan->backward(&out[it.oofs(0)], fct); + } + else + { + for (size_t i=0; i<len; ++i) + tdata[i] = tin[it.iofs(i)]; + forward ? plan->forward (tdata, fct) : plan->backward(tdata, fct); + for (size_t i=0; i<len; ++i) + out[it.oofs(i)] = tdata[i]; + } + } +} // end of parallel region + fct = T(1); // factor has been applied, use 1 for remaining axes + } + } +template<typename T> POCKETFFT_NOINLINE void general_dst23( + const cndarr<T> &in, ndarr<T> &out, const shape_t &axes, bool forward, T fct, + size_t POCKETFFT_NTHREADS) + { + shared_ptr<T_sinq<T>> plan; + + for (size_t iax=0; iax<axes.size(); ++iax) + { + constexpr auto vlen = VLEN<T>::val; + size_t len=in.shape(axes[iax]); + if ((!plan) || (len!=plan->length())) + plan = get_plan<T_sinq<T>>(len); + +#ifdef POCKETFFT_OPENMP +#pragma omp parallel num_threads(util::thread_count(nthreads, in.shape(), axes[iax])) +#endif +{ + auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T)); + const auto &tin(iax==0 ? in : out); + multi_iter<vlen> it(tin, out, axes[iax]); +#ifndef POCKETFFT_NO_VECTORS + if (vlen>1) + while (it.remaining()>=vlen) + { + using vtype = typename VTYPE<T>::type; + it.advance(vlen); + auto tdatav = reinterpret_cast<vtype *>(storage.data()); + for (size_t i=0; i<len; ++i) + for (size_t j=0; j<vlen; ++j) + tdatav[i][j] = tin[it.iofs(j,i)]; + forward ? plan->forward (tdatav, fct) : plan->backward(tdatav, fct); + for (size_t i=0; i<len; ++i) + for (size_t j=0; j<vlen; ++j) + out[it.oofs(j,i)] = tdatav[i][j]; + } +#endif + while (it.remaining()>0) + { + it.advance(1); + auto tdata = reinterpret_cast<T *>(storage.data()); + if ((&tin[0]==&out[0]) && (it.stride_out()==sizeof(T))) // fully in-place + forward ? plan->forward (&out[it.oofs(0)], fct) + : plan->backward(&out[it.oofs(0)], fct); + else if (it.stride_out()==sizeof(T)) // compute FFT in output location + { + for (size_t i=0; i<len; ++i) + out[it.oofs(i)] = tin[it.iofs(i)]; + forward ? plan->forward (&out[it.oofs(0)], fct) + : plan->backward(&out[it.oofs(0)], fct); + } + else + { + for (size_t i=0; i<len; ++i) + tdata[i] = tin[it.iofs(i)]; + forward ? plan->forward (tdata, fct) : plan->backward(tdata, fct); + for (size_t i=0; i<len; ++i) + out[it.oofs(i)] = tdata[i]; + } + } +} // end of parallel region + fct = T(1); // factor has been applied, use 1 for remaining axes + } + } + template<typename T> POCKETFFT_NOINLINE void general_r2c( const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct, size_t POCKETFFT_NTHREADS) @@ -2963,6 +3228,28 @@ template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in, general_c(ain, aout, axes, forward, fct, nthreads); } +template<typename T> void r2r_dct23(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + bool forward, const T *data_in, T *data_out, T fct, size_t nthreads=1) + { + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr<T> ain(data_in, shape, stride_in); + ndarr<T> aout(data_out, shape, stride_out); + general_dct23(ain, aout, axes, forward, fct, nthreads); + } + +template<typename T> void r2r_dst23(const shape_t &shape, + const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, + bool forward, const T *data_in, T *data_out, T fct, size_t nthreads=1) + { + if (util::prod(shape)==0) return; + util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes); + cndarr<T> ain(data_in, shape, stride_in); + ndarr<T> aout(data_out, shape, stride_out); + general_dst23(ain, aout, axes, forward, fct, nthreads); + } + template<typename T> void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, size_t axis, bool forward, const T *data_in, complex<T> *data_out, T fct, @@ -3069,6 +3356,8 @@ using detail::c2r; using detail::r2c; using detail::r2r_fftpack; using detail::r2r_separable_hartley; +using detail::r2r_dct23; +using detail::r2r_dst23; } // namespace pocketfft diff --git a/pypocketfft.cc b/pypocketfft.cc index efe85f0bbf551044dc78fc5772541470446925d4..608001ee6ed87fbd2220eb42d063858fc3f3676d 100644 --- a/pypocketfft.cc +++ b/pypocketfft.cc @@ -226,6 +226,60 @@ py::array r2r_fftpack(const py::array &in, const py::object &axes_, real2hermitian, forward, inorm, out_, nthreads)) } +template<typename T> py::array r2r_dct23_internal(const py::array &in, + const py::object &axes_, bool forward, int inorm, py::object &out_, + size_t nthreads) + { + auto axes = makeaxes(in, axes_); + auto dims(copy_shape(in)); + py::array res = prepare_output<T>(out_, dims); + auto s_in=copy_strides(in); + auto s_out=copy_strides(res); + auto d_in=reinterpret_cast<const T *>(in.data()); + auto d_out=reinterpret_cast<T *>(res.mutable_data()); + { + py::gil_scoped_release release; + T fct = norm_fct<T>(inorm, dims, axes); + pocketfft::r2r_dct23(dims, s_in, s_out, axes, forward, d_in, d_out, fct, + nthreads); + } + return res; + } + +py::array r2r_dct23(const py::array &in, const py::object &axes_, + bool forward, int inorm, py::object &out_, size_t nthreads) + { + DISPATCH(in, f64, f32, flong, r2r_dct23_internal, (in, axes_, + forward, inorm, out_, nthreads)) + } + +template<typename T> py::array r2r_dst23_internal(const py::array &in, + const py::object &axes_, bool forward, int inorm, py::object &out_, + size_t nthreads) + { + auto axes = makeaxes(in, axes_); + auto dims(copy_shape(in)); + py::array res = prepare_output<T>(out_, dims); + auto s_in=copy_strides(in); + auto s_out=copy_strides(res); + auto d_in=reinterpret_cast<const T *>(in.data()); + auto d_out=reinterpret_cast<T *>(res.mutable_data()); + { + py::gil_scoped_release release; + T fct = norm_fct<T>(inorm, dims, axes); + pocketfft::r2r_dst23(dims, s_in, s_out, axes, forward, d_in, d_out, fct, + nthreads); + } + return res; + } + +py::array r2r_dst23(const py::array &in, const py::object &axes_, + bool forward, int inorm, py::object &out_, size_t nthreads) + { + DISPATCH(in, f64, f32, flong, r2r_dst23_internal, (in, axes_, + forward, inorm, out_, nthreads)) + } + template<typename T> py::array c2r_internal(const py::array &in, const py::object &axes_, size_t lastsize, bool forward, int inorm, py::object &out_, size_t nthreads) @@ -543,4 +597,8 @@ PYBIND11_MODULE(pypocketfft, m) "axes"_a=None, "inorm"_a=0, "out"_a=None, "nthreads"_a=1); m.def("genuine_hartley", genuine_hartley, genuine_hartley_DS, "a"_a, "axes"_a=None, "inorm"_a=0, "out"_a=None, "nthreads"_a=1); + m.def("r2r_dct23", r2r_dct23, /*r2r_dct23_DS,*/ "a"_a, + "axes"_a=None, "forward"_a, "inorm"_a=0, "out"_a=None, "nthreads"_a=1); + m.def("r2r_dst23", r2r_dst23, /*r2r_dst23_DS,*/ "a"_a, + "axes"_a=None, "forward"_a, "inorm"_a=0, "out"_a=None, "nthreads"_a=1); }