/* * Copyright (C) 2020 Max-Planck-Society * Author: Martin Reinecke */ #ifndef MRUTIL_INTERPOL_NG_H #define MRUTIL_INTERPOL_NG_H #include #include #include #include "mr_util/math/constants.h" #include "mr_util/math/gl_integrator.h" #include "mr_util/math/es_kernel.h" #include "mr_util/infra/mav.h" #include "mr_util/sharp/sharp.h" #include "mr_util/sharp/sharp_almhelpers.h" #include "mr_util/sharp/sharp_geomhelpers.h" #include "alm.h" #include "mr_util/math/fft.h" #include "mr_util/bindings/pybind_utils.h" namespace mr { #if 0 namespace detail_fft { using std::vector; template aligned_array alloc_tmp_conv (const fmav_info &info, size_t axis, size_t len) { auto othersize = info.size()/info.shape(axis); constexpr auto vlen = native_simd::size(); auto tmpsize = len*((othersize>=vlen) ? vlen : 1); return aligned_array(tmpsize); } template MRUTIL_NOINLINE void general_convolve(const fmav &in, fmav &out, const size_t axis, const vector &kernel, size_t nthreads, const Exec &exec, const bool allow_inplace=true) { std::shared_ptr plan1, plan2; size_t l_in=in.shape(axis), l_out=out.shape(axis); size_t l_min=std::min(l_in, l_out), l_max=std::max(l_in, l_out); MR_assert(kernel.size()==l_min/2+1, "bad kernel size"); plan1 = get_plan(l_in); plan2 = get_plan(l_out); execParallel( util::thread_count(nthreads, in, axis, native_simd::size()), [&](Scheduler &sched) { constexpr auto vlen = native_simd::size(); auto storage = alloc_tmp_conv(in, axis, l_max); //FIXME! multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); #ifndef MRUTIL_NO_SIMD if (vlen>1) while (it.remaining()>=vlen) { it.advance(vlen); auto tdatav = reinterpret_cast *>(storage.data()); exec(it, in, out, tdatav, *plan1, *plan2, kernel); } #endif while (it.remaining()>0) { it.advance(1); auto buf = allow_inplace && it.stride_out() == 1 ? &out.vraw(it.oofs(0)) : reinterpret_cast(storage.data()); exec(it, in, out, buf, *plan1, *plan2, kernel); } }); // end of parallel region } struct ExecConvR1 { template void operator() ( const multi_iter &it, const fmav &in, fmav &out, T * buf, const pocketfft_r &plan1, const pocketfft_r &plan2, const vector &kernel) const { size_t l_in = plan1.length(), l_out = plan2.length(), l_min = std::min(l_in, l_out); copy_input(it, in, buf); plan1.exec(buf, T0(1), true); buf[0] *= kernel[0]; for (size_t i=1; i void convolve_1d(const fmav &in, fmav &out, size_t axis, const vector &kernel, size_t nthreads=1) { // util::sanity_check_onetype(in, out, in.data()==out.data(), axes); MR_assert(axis>(in, out, axis, kernel, nthreads, ExecConvR1()); } } #endif namespace detail_interpol_ng { using namespace std; template class Interpolator { protected: bool adjoint; size_t lmax, kmax, nphi0, ntheta0, nphi, ntheta; int nthreads; T ofactor; size_t supp; ES_Kernel kernel; size_t ncomp; mav cube; // the data cube (theta, phi, 2*mbeam+1, TGC) void correct(mav &arr, int spin) { T sfct = (spin&1) ? -1 : 1; mav tmp({nphi,nphi}); tmp.apply([](T &v){v=0.;}); auto tmp0=tmp.template subarray<2>({0,0},{nphi0, nphi0}); fmav ftmp0(tmp0); for (size_t i=0; i=nphi0) j2-=nphi0; tmp0.v(i2,j) = sfct*tmp0(i,j2); } // FFT to frequency domain on minimal grid // one bad FFT axis r2r_fftpack(ftmp0,ftmp0,{0,1},true,true,T(1./(nphi0*nphi0)),nthreads); // correct amplitude at Nyquist frequency for (size_t i=0; i({0,0},{nphi, nphi0}); fmav ftmp1(tmp1); // zero-padded FFT in theta direction // one bad FFT axis r2r_fftpack(ftmp1,ftmp1,{0},false,false,T(1),nthreads); auto tmp2=tmp.template subarray<2>({0,0},{ntheta, nphi}); fmav ftmp2(tmp2); fmav farr(arr); // zero-padded FFT in phi direction r2r_fftpack(ftmp2,farr,{1},false,false,T(1),nthreads); } void decorrect(mav &arr, int spin) { T sfct = (spin&1) ? -1 : 1; mav tmp({nphi,nphi}); fmav ftmp(tmp); for (size_t i=0; i=nphi) j2-=nphi; tmp.v(i2,j) = sfct*tmp(i,j2); } r2r_fftpack(ftmp,ftmp,{1},true,true,T(1),nthreads); auto tmp1=tmp.template subarray<2>({0,0},{nphi, nphi0}); fmav ftmp1(tmp1); r2r_fftpack(ftmp1,ftmp1,{0},true,true,T(1),nthreads); auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads); auto tmp0=tmp.template subarray<2>({0,0},{nphi0, nphi0}); fmav ftmp0(tmp0); for (size_t i=0; i getIdx(const mav &ptg) const { vector idx(ptg.shape(0)); constexpr size_t cellsize=16; size_t nct = ntheta/cellsize+1, ncp = nphi/cellsize+1; vector> mapper(nct*ncp); for (size_t i=0; i>> &slm, const vector>> &blm, bool separate, T epsilon, T ofmin, int nthreads_) : adjoint(false), lmax(slm.at(0).Lmax()), kmax(blm.at(0).Mmax()), nphi0(2*good_size_real(lmax+1)), ntheta0(nphi0/2+1), nphi(std::max(20,2*good_size_real(size_t((2*lmax+1)*ofmin/2.)))), ntheta(nphi/2+1), nthreads(nthreads_), ofactor(T(nphi)/(2*lmax+1)), supp(ES_Kernel::get_supp(epsilon, ofactor)), kernel(supp, ofactor, nthreads), ncomp(separate ? slm.size() : 1), cube({ntheta+2*supp, nphi+2*supp, 2*kmax+1, ncomp}) { MR_assert((ncomp==1)||(ncomp==3), "currently only 1 or 3 components allowed"); MR_assert(slm.size()==blm.size(), "inconsistent slm and blm vectors"); for (size_t i=0; i> a1(lmax, lmax), a2(lmax,lmax); auto ginfo = sharp_make_cc_geom_info(ntheta0,nphi0,0.,cube.stride(1),cube.stride(0)); auto ainfo = sharp_make_triangular_alm_info(lmax,lmax,1); vectorlnorm(lmax+1); for (size_t i=0; i<=lmax; ++i) lnorm[i]=std::sqrt(4*pi/(2*i+1.)); for (size_t icomp=0; icomp({supp,supp,0,icomp},{ntheta,nphi,0,0}); sharp_alm2map(a1.Alms().data(), m1.vdata(), *ginfo, *ainfo, 0, nthreads); correct(m1,0); for (size_t k=1; k<=kmax; ++k) { for (size_t m=0; m<=lmax; ++m) for (size_t l=m; l<=lmax; ++l) { if (l({supp,supp,2*k-1,icomp},{ntheta,nphi,0,0}); auto m2 = cube.template subarray<2>({supp,supp,2*k ,icomp},{ntheta,nphi,0,0}); sharp_alm2map_spin(k, a1.Alms().data(), a2.Alms().data(), m1.vdata(), m2.vdata(), *ginfo, *ainfo, 0, nthreads); correct(m1,k); correct(m2,k); } } // fill border regions for (size_t i=0; i=nphi) j2-=nphi; for (size_t l=0; l(20,2*good_size_real(size_t((2*lmax+1)*ofmin/2.)))), ntheta(nphi/2+1), nthreads(nthreads_), ofactor(T(nphi)/(2*lmax+1)), supp(ES_Kernel::get_supp(epsilon, ofactor)), kernel(supp, ofactor, nthreads), ncomp(ncomp_), cube({ntheta+2*supp, nphi+2*supp, 2*kmax+1, ncomp_}) { MR_assert((ncomp==1)||(ncomp==3), "currently only 1 or 3 components allowed"); MR_assert((supp<=ntheta) && (supp<=nphi), "support too large!"); cube.apply([](T &v){v=0.;}); } void interpol (const mav &ptg, mav &res) const { MR_assert(!adjoint, "cannot be called in adjoint mode"); MR_assert(ptg.shape(0)==res.shape(0), "dimension mismatch"); MR_assert(ptg.shape(1)==3, "second dimension must have length 3"); MR_assert(res.shape(1)==ncomp, "# of components mismatch"); T delta = T(2)/supp; T xdtheta = T((ntheta-1)/pi), xdphi = T(nphi/(2*pi)); auto idx = getIdx(ptg); execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) { vector wt(supp), wp(supp); vector psiarr(2*kmax+1); while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind &ptg, const mav &data) { MR_assert(adjoint, "can only be called in adjoint mode"); MR_assert(ptg.shape(0)==data.shape(0), "dimension mismatch"); MR_assert(ptg.shape(1)==3, "second dimension must have length 3"); MR_assert(data.shape(1)==ncomp, "# of components mismatch"); T delta = T(2)/supp; T xdtheta = T((ntheta-1)/pi), xdphi = T(nphi/(2*pi)); auto idx = getIdx(ptg); constexpr size_t cellsize=16; size_t nct = ntheta/cellsize+5, ncp = nphi/cellsize+5; mav locks({nct,ncp}); execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) { size_t b_theta=99999999999999, b_phi=9999999999999999; vector wt(supp), wp(supp); vector psiarr(2*kmax+1); while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind>> &blm, vector>> &slm) { MR_assert(adjoint, "can only be called in adjoint mode"); MR_assert((blm.size()==ncomp) || (ncomp==1), "incorrect number of beam a_lm sets"); MR_assert((slm.size()==ncomp) || (ncomp==1), "incorrect number of sky a_lm sets"); Alm> a1(lmax, lmax), a2(lmax,lmax); auto ginfo = sharp_make_cc_geom_info(ntheta0,nphi0,0.,cube.stride(1),cube.stride(0)); auto ainfo = sharp_make_triangular_alm_info(lmax,lmax,1); // move stuff from border regions onto the main grid for (size_t i=0; i=nphi) j2-=nphi; for (size_t l=0; l=nphi) j2-=nphi; T tval = (cube(supp,j+supp,k,l) + fct*cube(supp,j2+supp,k,l)); cube.v(supp,j+supp,k,l) = tval; cube.v(supp,j2+supp,k,l) = fct*tval; tval = (cube(supp+ntheta-1,j+supp,k,l) + fct*cube(supp+ntheta-1,j2+supp,k,l)); cube.v(supp+ntheta-1,j+supp,k,l) = tval; cube.v(supp+ntheta-1,j2+supp,k,l) = fct*tval; } vectorlnorm(lmax+1); for (size_t i=0; i<=lmax; ++i) lnorm[i]=std::sqrt(4*pi/(2*i+1.)); for (size_t j=0; j1; { auto m1 = cube.template subarray<2>({supp,supp,0,icomp},{ntheta,nphi,0,0}); decorrect(m1,0); sharp_alm2map_adjoint(a1.Alms().vdata(), m1.data(), *ginfo, *ainfo, 0, nthreads); for (size_t m=0; m<=lmax; ++m) for (size_t l=m; l<=lmax; ++l) if (separate) slm[icomp](l,m) += conj(a1(l,m))*blm[icomp](l,0).real()*T(lnorm[l]); else for (size_t j=0; j({supp,supp,2*k-1,icomp},{ntheta,nphi,0,0}); auto m2 = cube.template subarray<2>({supp,supp,2*k ,icomp},{ntheta,nphi,0,0}); decorrect(m1,k); decorrect(m2,k); sharp_alm2map_spin_adjoint(k, a1.Alms().vdata(), a2.Alms().vdata(), m1.data(), m2.data(), *ginfo, *ainfo, 0, nthreads); for (size_t m=0; m<=lmax; ++m) for (size_t l=m; l<=lmax; ++l) if (l>=k) { if (separate) { auto tmp = conj(blm[icomp](l,k))*T(-2*lnorm[l]); slm[icomp](l,m) += conj(a1(l,m))*tmp.real(); slm[icomp](l,m) -= conj(a2(l,m))*tmp.imag(); } else for (size_t j=0; j