/*
 *  This code is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This code is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this code; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 *  Copyright (C) 2020 Max-Planck-Society
 *  Author: Martin Reinecke
 */

#ifndef DUCC0_INTERPOL_NG_H
#define DUCC0_INTERPOL_NG_H

#define SIMD_INTERPOL
#define SPECIAL_CASING

#include <algorithm>
#include <array>
#include <cmath>
#include <complex>
#include <cstring>
#include <memory>
#include <mutex>
#include <vector>
#include "ducc0/math/constants.h"
#include "ducc0/math/gl_integrator.h"
#include "ducc0/math/es_kernel.h"
#include "ducc0/infra/mav.h"
#include "ducc0/infra/simd.h"
#include "ducc0/sharp/sharp.h"
#include "ducc0/sharp/sharp_almhelpers.h"
#include "ducc0/sharp/sharp_geomhelpers.h"
#include "python/alm.h"
#include "ducc0/math/fft.h"

namespace ducc0 {

namespace detail_fft {

using std::vector;

template<typename T, typename T0> aligned_array<T> alloc_tmp_conv
  (const fmav_info &info, size_t axis, size_t len)
  {
  auto othersize = info.size()/info.shape(axis);
  constexpr auto vlen = native_simd<T0>::size();
  auto tmpsize = len*((othersize>=vlen) ? vlen : 1);
  return aligned_array<T>(tmpsize);
  }

// 1D FFT-based resampling of one array axis, executed in parallel over all
// other axes; SIMD batches are processed where possible.
template<typename Tplan, typename T, typename T0, typename Exec>
DUCC0_NOINLINE void general_convolve(const fmav<T> &in, fmav<T> &out,
  const size_t axis, const vector<T0> &kernel, size_t nthreads,
  const Exec &exec)
  {
  std::unique_ptr<Tplan> plan1, plan2;

  size_t l_in=in.shape(axis), l_out=out.shape(axis);
  size_t l_min=std::min(l_in, l_out), l_max=std::max(l_in, l_out);
  MR_assert(kernel.size()==l_min/2+1, "bad kernel size");
  plan1 = std::make_unique<Tplan>(l_in);
  plan2 = std::make_unique<Tplan>(l_out);

  execParallel(
    util::thread_count(nthreads, in, axis, native_simd<T0>::size()),
    [&](Scheduler &sched) {
      constexpr auto vlen = native_simd<T0>::size();
      auto storage = alloc_tmp_conv<T,T0>(in, axis, l_max);
      multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
#ifndef DUCC0_NO_SIMD
      if (vlen>1)
        while (it.remaining()>=vlen)
          {
          it.advance(vlen);
          auto tdatav = reinterpret_cast<native_simd<T0> *>(storage.data());
          exec(it, in, out, tdatav, *plan1, *plan2, kernel);
          }
#endif
      while (it.remaining()>0)
        {
        it.advance(1);
        auto buf = reinterpret_cast<T *>(storage.data());
        exec(it, in, out, buf, *plan1, *plan2, kernel);
        }
    });  // end of parallel region
  }

// executor: forward real FFT, multiplication with the (half-complex) kernel,
// zero-padding or truncation to the output length, inverse real FFT
struct ExecConvR1
  {
  template <typename T0, typename T, size_t vlen> void operator() (
    const multi_iter<vlen> &it, const fmav<T0> &in, fmav<T0> &out,
    T * buf, const pocketfft_r<T0> &plan1, const pocketfft_r<T0> &plan2,
    const vector<T0> &kernel) const
    {
    size_t l_in = plan1.length(),
           l_out = plan2.length(),
           l_min = std::min(l_in, l_out);
    copy_input(it, in, buf);
    plan1.exec(buf, T0(1), true);
    for (size_t i=0; i<l_min; ++i) buf[i] *= kernel[(i+1)/2];
    for (size_t i=l_in; i<l_out; ++i) buf[i] = T(0);
    plan2.exec(buf, T0(1), false);
    copy_output(it, buf, out);
    }
  };

template<typename T> void convolve_1d(const fmav<T> &in,
  fmav<T> &out, size_t axis, const vector<T> &kernel, size_t nthreads=1)
  {
  MR_assert(axis<in.ndim(), "bad axis number");
  MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");
  for (size_t i=0; i<in.ndim(); ++i)
    if (i!=axis)
      MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
  general_convolve<pocketfft_r<T>>(in, out, axis, kernel, nthreads,
    ExecConvR1());
  }

}

using detail_fft::convolve_1d;

namespace detail_totalconvolve {

using namespace std;

template<typename T> class Interpolator
  {
  protected:
    bool adjoint;
    size_t lmax, kmax, nphi0, ntheta0, nphi, ntheta;
    int nthreads;
    T ofactor;
    size_t supp;
    ES_Kernel kernel;
    size_t ncomp;
#ifdef SIMD_INTERPOL
    mav<native_simd<T>,4> scube;
#endif
    mav<T,4> cube; // the data cube (theta, phi, component, 2*mbeam+1)

    // upsample arr in place from the (ntheta0, nphi0) Clenshaw-Curtis grid to
    // the oversampled (ntheta, nphi) grid, dividing out the kernel transform
    void correct(mav<T,2> &arr, int spin)
      {
      T sfct = (spin&1) ? -1 : 1;
      mav<T,2> tmp({nphi,nphi0});
      // copy and extend to second half
      for (size_t j=0; j<nphi0; ++j)
        tmp.v(0,j) = arr(0,j);
      for (size_t i=1, i2=nphi0-1; i+1<ntheta0; ++i,--i2)
        for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2)
          {
          if (j2>=nphi0) j2-=nphi0;
          tmp.v(i,j2) = arr(i,j2);
          tmp.v(i2,j) = sfct*tmp(i,j2);
          }
      for (size_t j=0; j<nphi0; ++j)
        tmp.v(ntheta0-1,j) = arr(ntheta0-1,j);
      auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
      vector<T> k2(fct.size());
      for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi0);
      fmav<T> ftmp(tmp);
      fmav<T> ftmp0(tmp.template subarray<2>({0,0},{nphi0, nphi0}));
      convolve_1d(ftmp0, ftmp, 0, k2, nthreads);
      fmav<T> ftmp2(tmp.template subarray<2>({0,0},{ntheta, nphi0}));
      fmav<T> farr(arr);
      convolve_1d(ftmp2, farr, 1, k2, nthreads);
      }
    // adjoint of correct(): downsample arr in place from the oversampled grid
    // back to the (ntheta0, nphi0) Clenshaw-Curtis grid
    void decorrect(mav<T,2> &arr, int spin)
      {
      T sfct = (spin&1) ? -1 : 1;
      mav<T,2> tmp({nphi,nphi0});
      auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
      vector<T> k2(fct.size());
      for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi);
      fmav<T> farr(arr);
      fmav<T> ftmp2(tmp.template subarray<2>({0,0},{ntheta, nphi0}));
      convolve_1d(farr, ftmp2, 1, k2, nthreads);
      // extend to second half
      for (size_t i=1, i2=nphi-1; i+1<ntheta; ++i,--i2)
        for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2)
          {
          if (j2>=nphi0) j2-=nphi0;
          tmp.v(i2,j) = sfct*tmp(i,j2);
          }
      fmav<T> ftmp(tmp);
      fmav<T> ftmp0(tmp.template subarray<2>({0,0},{nphi0, nphi0}));
      convolve_1d(ftmp, ftmp0, 0, k2, nthreads);
      for (size_t j=0; j<nphi0; ++j)
        {
        arr.v(0,j) = T(0.5)*tmp(0,j);
        arr.v(ntheta0-1,j) = T(0.5)*tmp(ntheta0-1,j);
        }
      for (size_t i=1; i+1<ntheta0; ++i)
        for (size_t j=0; j<nphi0; ++j)
          arr.v(i,j) = tmp(i,j);
      }

    // sort the pointing indices by the grid cell they fall into, to improve
    // memory locality during (de)interpolation
    vector<size_t> getIdx(const mav<T,2> &ptg) const
      {
      vector<size_t> idx(ptg.shape(0));
      constexpr size_t cellsize=16;
      size_t nct = ntheta/cellsize+1,
             ncp = nphi/cellsize+1;
      vector<vector<size_t>> mapper(nct*ncp);
      for (size_t i=0; i<ptg.shape(0); ++i)
        {
        size_t itheta=min(nct-1,size_t(ptg(i,0)/pi*nct)),
               iphi=min(ncp-1,size_t(ptg(i,1)/(2*pi)*ncp));
        mapper[itheta*ncp+iphi].push_back(i);
        }
      size_t cnt=0;
      for (const auto &vec: mapper)
        for (auto i: vec)
          idx[cnt++] = i;
      return idx;
      }

  public:
    Interpolator(const vector<Alm<complex<T>>> &slm,
      const vector<Alm<complex<T>>> &blm, bool separate, T epsilon, T ofmin,
      int nthreads_)
      : adjoint(false),
        lmax(slm.at(0).Lmax()),
        kmax(blm.at(0).Mmax()),
        nphi0(2*good_size_real(lmax+1)),
        ntheta0(nphi0/2+1),
        nphi(std::max<size_t>(20,2*good_size_real(size_t((2*lmax+1)*ofmin/2.)))),
        ntheta(nphi/2+1),
        nthreads(nthreads_),
        ofactor(T(nphi)/(2*lmax+1)),
        supp(ES_Kernel::get_supp(epsilon, ofactor)),
        kernel(supp, ofactor, nthreads),
        ncomp(separate ? slm.size() : 1),
#ifdef SIMD_INTERPOL
        scube({ntheta+2*supp, nphi+2*supp, ncomp,
               (2*kmax+1+native_simd<T>::size()-1)/native_simd<T>::size()}),
        cube(reinterpret_cast<T *>(scube.vdata()),
             {ntheta+2*supp, nphi+2*supp, ncomp,
              ((2*kmax+1+native_simd<T>::size()-1)/native_simd<T>::size())*native_simd<T>::size()},
             true)
#else
        cube({ntheta+2*supp, nphi+2*supp, ncomp, 2*kmax+1})
#endif
      {
      MR_assert((ncomp==1)||(ncomp==3), "currently only 1 or 3 components allowed");
      MR_assert(slm.size()==blm.size(), "inconsistent slm and blm vectors");
      for (size_t i=0; i<slm.size(); ++i)
        {
        MR_assert(slm[i].Lmax()==lmax, "inconsistent Sky lmax");
        MR_assert(slm[i].Mmax()==lmax, "Sky lmax must be equal to Sky mmax");
        MR_assert(blm[i].Lmax()==lmax, "Sky and beam lmax must be equal");
        MR_assert(blm[i].Mmax()==kmax, "inconsistent beam mmax");
        }
      MR_assert((supp<=ntheta) && (supp<=nphi), "support too large!");
      cube.apply([](T &v){v=0.;});

      Alm<complex<T>> a1(lmax, lmax), a2(lmax,lmax);
      auto ginfo = sharp_make_cc_geom_info(ntheta0,nphi0,0.,cube.stride(1),cube.stride(0));
      auto ainfo = sharp_make_triangular_alm_info(lmax,lmax,1);

      vector<T> lnorm(lmax+1);
      for (size_t i=0; i<=lmax; ++i)
        lnorm[i]=T(std::sqrt(4*pi/(2*i+1.)));

      for (size_t icomp=0; icomp<ncomp; ++icomp)
        {
        // combine sky and beam a_lm for each azimuthal beam mode and
        // synthesize the resulting maps on the small grid
        for (size_t m=0; m<=lmax; ++m)
          for (size_t l=m; l<=lmax; ++l)
            a1(l,m) = slm[icomp](l,m)*blm[icomp](l,0).real()*lnorm[l];
        auto m1 = cube.template subarray<2>({supp,supp,icomp,0},{ntheta,nphi,0,0});
        sharp_alm2map(a1.Alms().data(), m1.vdata(), *ginfo, *ainfo, 0, nthreads);
        correct(m1,0);
        for (size_t k=1; k<=kmax; ++k)
          {
          for (size_t m=0; m<=lmax; ++m)
            for (size_t l=m; l<=lmax; ++l)
              {
              if (l<k)
                a1(l,m)=a2(l,m)=0.;
              else
                {
                auto tmp = conj(blm[icomp](l,k))*(-2*lnorm[l]);
                a1(l,m) = slm[icomp](l,m)*tmp.real();
                a2(l,m) = slm[icomp](l,m)*tmp.imag();
                }
              }
          auto m1 = cube.template subarray<2>({supp,supp,icomp,2*k-1},{ntheta,nphi,0,0});
          auto m2 = cube.template subarray<2>({supp,supp,icomp,2*k  },{ntheta,nphi,0,0});
          sharp_alm2map_spin(k, a1.Alms().data(), a2.Alms().data(), m1.vdata(),
            m2.vdata(), *ginfo, *ainfo, 0, nthreads);
          correct(m1,k);
          correct(m2,k);
          }
        }

      // fill border regions
      for (size_t i=0; i<supp; ++i)
        for (size_t j=0, j2=nphi/2; j<nphi; ++j,++j2)
          {
          if (j2>=nphi) j2-=nphi;
          for (size_t l=0; l<cube.shape(2); ++l)
            for (size_t k=0; k<cube.shape(3); ++k)
              {
              T fct = (((k+1)/2)&1) ? -1 : 1;
              cube.v(supp-1-i,j2+supp,l,k) = fct*cube(supp+1+i,j+supp,l,k);
              cube.v(supp+ntheta+i,j2+supp,l,k) = fct*cube(supp+ntheta-2-i,j+supp,l,k);
              }
          }
      for (size_t i=0; i<ntheta+2*supp; ++i)
        for (size_t j=0; j<supp; ++j)
          for (size_t l=0; l<cube.shape(2); ++l)
            for (size_t k=0; k<cube.shape(3); ++k)
              {
              cube.v(i,j,l,k) = cube(i,j+nphi,l,k);
              cube.v(i,j+nphi+supp,l,k) = cube(i,j+supp,l,k);
              }
      }

    // constructor for adjoint mode: only the geometry is set up; the data
    // cube is filled by deinterpol() and evaluated by getSlm()
    Interpolator(size_t lmax_, size_t kmax_, size_t ncomp_, T epsilon, T ofmin,
      int nthreads_)
      : adjoint(true),
        lmax(lmax_),
        kmax(kmax_),
        nphi0(2*good_size_real(lmax+1)),
        ntheta0(nphi0/2+1),
        nphi(std::max<size_t>(20,2*good_size_real(size_t((2*lmax+1)*ofmin/2.)))),
        ntheta(nphi/2+1),
        nthreads(nthreads_),
        ofactor(T(nphi)/(2*lmax+1)),
        supp(ES_Kernel::get_supp(epsilon, ofactor)),
        kernel(supp, ofactor, nthreads),
        ncomp(ncomp_),
#ifdef SIMD_INTERPOL
        scube({ntheta+2*supp, nphi+2*supp, ncomp,
               (2*kmax+1+native_simd<T>::size()-1)/native_simd<T>::size()}),
        cube(reinterpret_cast<T *>(scube.vdata()),
             {ntheta+2*supp, nphi+2*supp, ncomp,
              ((2*kmax+1+native_simd<T>::size()-1)/native_simd<T>::size())*native_simd<T>::size()},
             true)
#else
        cube({ntheta+2*supp, nphi+2*supp, ncomp, 2*kmax+1})
#endif
      {
      MR_assert((ncomp==1)||(ncomp==3), "currently only 1 or 3 components allowed");
      MR_assert((supp<=ntheta) && (supp<=nphi), "support too large!");
      cube.apply([](T &v){v=0.;});
      }

#ifdef SIMD_INTERPOL
    // fully unrolled SIMD interpolation kernel for a compile-time number of
    // SIMD vectors nv along the psi axis and nc components
    template<size_t nv, size_t nc> void interpol_help0(
      const T * DUCC0_RESTRICT wt, const T * DUCC0_RESTRICT wp,
      const native_simd<T> * DUCC0_RESTRICT p, size_t d0, size_t d1,
      const native_simd<T> * DUCC0_RESTRICT psiarr2, mav<T,2> &res, size_t idx) const
      {
      array<native_simd<T>,nc> vv;
      for (auto &vvv:vv) vvv=0;
      for (size_t j=0; j<supp; ++j, p+=d0)
        {
        auto p1=p;
        T wtj = wt[j];
        for (size_t k=0; k<supp; ++k, p1+=d1)
          {
          array<native_simd<T>,nc> tvv;
          for (auto &vvv:tvv) vvv=0;
          for (size_t l=0; l<nv; ++l)
            for (size_t c=0; c<nc; ++c)
              tvv[c] += p1[l+c*nv]*psiarr2[l];
          for (size_t c=0; c<nc; ++c)
            vv[c] += wtj*wp[k]*tvv[c];
          }
        }
      for (size_t c=0; c<nc; ++c)
        res.v(idx,c) = reduce(vv[c], std::plus<>());
      }
    template<size_t nv, size_t nc> void deinterpol_help0(
      const T * DUCC0_RESTRICT wt, const T * DUCC0_RESTRICT wp,
      native_simd<T> * DUCC0_RESTRICT p, size_t d0, size_t d1,
      const native_simd<T> * DUCC0_RESTRICT psiarr2, const mav<T,2> &data, size_t idx) const
      {
      array<native_simd<T>,nc> vv;
      for (size_t i=0; i<nc; ++i) vv[i] = data(idx,i);
      for (size_t j=0; j<supp; ++j, p+=d0)
        {
        auto p1=p;
        T wtj = wt[j];
        for (size_t k=0; k<supp; ++k, p1+=d1)
          {
          native_simd<T> wtjwpk = wtj*wp[k];
          for (size_t l=0; l<nv; ++l)
            {
            auto tmp = wtjwpk*psiarr2[l];
            for (size_t c=0; c<nc; ++c)
              p1[l+c*nv] += tmp*vv[c];
            }
          }
        }
      }
#endif

    void interpol (const mav<T,2> &ptg, mav<T,2> &res) const
      {
#ifdef SIMD_INTERPOL
      constexpr size_t vl=native_simd<T>::size();
      MR_assert(scube.stride(3)==1, "bad stride");
      size_t nv=scube.shape(3);
#endif
      MR_assert(!adjoint, "cannot be called in adjoint mode");
      MR_assert(ptg.shape(0)==res.shape(0), "dimension mismatch");
      MR_assert(ptg.shape(1)==3, "second dimension must have length 3");
      MR_assert(res.shape(1)==ncomp, "# of components mismatch");
      T delta = T(2)/supp;
      T xdtheta = T((ntheta-1)/pi),
        xdphi = T(nphi/(2*pi));
      auto idx = getIdx(ptg);
      execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
        {
        // scratch space for the kernel weights in theta and phi direction
        union {
          native_simd<T> simd[64/vl];
          T scalar[64];
          } kdata;
        T *wt(kdata.scalar), *wp(kdata.scalar+supp);
        size_t nvec = (2*supp+vl-1)/vl;
        for (auto &v: kdata.simd) v=0;
        MR_assert(2*supp<=64, "support too large");
        vector<T> psiarr(2*kmax+1);
#ifdef SIMD_INTERPOL
        vector<native_simd<T>> psiarr2((2*kmax+1+vl-1)/vl);
        for (auto &v:psiarr2) v=0;
#endif
        while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind)
          {
          size_t i=idx[ind];
          T f0=T(0.5)*supp+ptg(i,0)*xdtheta;
          size_t i0 = size_t(f0+T(1));
          for (size_t t=0; t<supp; ++t)
            wt[t] = kernel((t+i0-f0)*delta - 1);
          T f1=T(0.5)*supp+ptg(i,1)*xdphi;
          size_t i1 = size_t(f1+T(1));
          for (size_t t=0; t<supp; ++t)
            wp[t] = kernel((t+i1-f1)*delta - 1);
          // recurrence for cos(k*psi), sin(k*psi)
          psiarr[0]=1.;
          double psi=ptg(i,2);
          double cpsi=cos(psi), spsi=sin(psi);
          double cnpsi=cpsi, snpsi=spsi;
          for (size_t l=1; l<=kmax; ++l)
            {
            psiarr[2*l-1]=T(cnpsi);
            psiarr[2*l]=T(snpsi);
            const double tmp = snpsi*cpsi + cnpsi*spsi;
            cnpsi=cnpsi*cpsi - snpsi*spsi;
            snpsi=tmp;
            }
#ifdef SIMD_INTERPOL
          memcpy(reinterpret_cast<T *>(psiarr2.data()), psiarr.data(),
            (2*kmax+1)*sizeof(T));
#endif
          if (ncomp==1)
            {
#ifndef SIMD_INTERPOL
            T vv=0;
            for (size_t j=0; j<supp; ++j)
              for (size_t k=0; k<supp; ++k)
                {
                T tmp=0;
                for (size_t l=0; l<2*kmax+1; ++l)
                  tmp += cube(i0+j,i1+k,0,l)*psiarr[l];
                vv += wt[j]*wp[k]*tmp;
                }
            res.v(i,0) = vv;
#else
            const native_simd<T> *p=&scube(i0,i1,0,0);
            ptrdiff_t d0 = scube.stride(0);
            ptrdiff_t d1 = scube.stride(1);
            switch (nv)
              {
#ifdef SPECIAL_CASING
              case 1: interpol_help0<1,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 2: interpol_help0<2,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 3: interpol_help0<3,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 4: interpol_help0<4,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 5: interpol_help0<5,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 6: interpol_help0<6,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 7: interpol_help0<7,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
#endif
              default:
                {
                native_simd<T> vv=0.;
                for (size_t j=0; j<supp; ++j, p+=d0)
                  {
                  auto p1=p;
                  T wtj = wt[j];
                  for (size_t k=0; k<supp; ++k, p1+=d1)
                    {
                    native_simd<T> tvv=0;
                    for (size_t l=0; l<nv; ++l)
                      tvv += p1[l]*psiarr2[l];
                    vv += wtj*wp[k]*tvv;
                    }
                  }
                res.v(i,0) = reduce(vv, std::plus<>());
                }
              }
#endif
            }
          else // ncomp==3
            {
#ifndef SIMD_INTERPOL
            T v0=0., v1=0., v2=0.;
            for (size_t j=0; j<supp; ++j)
              for (size_t k=0; k<supp; ++k)
                {
                T t0=0, t1=0, t2=0;
                for (size_t l=0; l<2*kmax+1; ++l)
                  {
                  t0 += cube(i0+j,i1+k,0,l)*psiarr[l];
                  t1 += cube(i0+j,i1+k,1,l)*psiarr[l];
                  t2 += cube(i0+j,i1+k,2,l)*psiarr[l];
                  }
                v0 += wt[j]*wp[k]*t0;
                v1 += wt[j]*wp[k]*t1;
                v2 += wt[j]*wp[k]*t2;
                }
            res.v(i,0) = v0;
            res.v(i,1) = v1;
            res.v(i,2) = v2;
#else
            const native_simd<T> *p=&scube(i0,i1,0,0);
            ptrdiff_t d0 = scube.stride(0);
            ptrdiff_t d1 = scube.stride(1);
            switch (nv)
              {
#ifdef SPECIAL_CASING
              case 1: interpol_help0<1,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 2: interpol_help0<2,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 3: interpol_help0<3,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 4: interpol_help0<4,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 5: interpol_help0<5,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 6: interpol_help0<6,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
              case 7: interpol_help0<7,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i); break;
#endif
              default:
                {
                ptrdiff_t d2 = scube.stride(2);
                native_simd<T> v0=0., v1=0., v2=0.;
                for (size_t j=0; j<supp; ++j, p+=d0)
                  {
                  auto p1=p;
                  T wtj = wt[j];
                  for (size_t k=0; k<supp; ++k, p1+=d1)
                    {
                    native_simd<T> wtjwpk = wtj*wp[k];
                    for (size_t l=0; l<nv; ++l)
                      {
                      auto tmp = wtjwpk*psiarr2[l];
                      v0 += p1[l]*tmp;
                      v1 += p1[l+d2]*tmp;
                      v2 += p1[l+2*d2]*tmp;
                      }
                    }
                  }
                res.v(i,0) = reduce(v0, std::plus<>());
                res.v(i,1) = reduce(v1, std::plus<>());
                res.v(i,2) = reduce(v2, std::plus<>());
                }
              }
#endif
            }
          }
        });
      }

    size_t support() const
      { return supp; }

    void deinterpol (const mav<T,2> &ptg, const mav<T,2> &data)
      {
#ifdef SIMD_INTERPOL
      constexpr size_t vl=native_simd<T>::size();
      MR_assert(scube.stride(3)==1, "bad stride");
      MR_assert(scube.stride(2)==ptrdiff_t(scube.shape(3)), "bad stride");
      size_t nv=scube.shape(3);
#endif
      MR_assert(adjoint, "can only be called in adjoint mode");
      MR_assert(ptg.shape(0)==data.shape(0), "dimension mismatch");
      MR_assert(ptg.shape(1)==3, "second dimension must have length 3");
      MR_assert(data.shape(1)==ncomp, "# of components mismatch");
      T delta = T(2)/supp;
      T xdtheta = T((ntheta-1)/pi),
        xdphi = T(nphi/(2*pi));
      auto idx = getIdx(ptg);

      constexpr size_t cellsize=16;
      size_t nct = ntheta/cellsize+5,
             ncp = nphi/cellsize+5;
      mav<std::mutex,2> locks({nct,ncp});

      execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
        {
        size_t b_theta=99999999999999, b_phi=9999999999999999;
        union {
          native_simd<T> simd[64/vl];
          T scalar[64];
          } kdata;
        T *wt(kdata.scalar), *wp(kdata.scalar+supp);
        size_t nvec = (2*supp+vl-1)/vl;
        for (auto &v: kdata.simd) v=0;
        MR_assert(2*supp<=64, "support too large");
        vector<T> psiarr(2*kmax+1);
#ifdef SIMD_INTERPOL
        vector<native_simd<T>> psiarr2((2*kmax+1+vl-1)/vl);
        for (auto &v:psiarr2) v=0;
#endif
        while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind)
          {
          size_t i=idx[ind];
          T f0=T(0.5)*supp+ptg(i,0)*xdtheta;
          size_t i0 = size_t(f0+T(1));
          for (size_t t=0; t<supp; ++t)
            wt[t] = kernel((t+i0-f0)*delta - 1);
          T f1=T(0.5)*supp+ptg(i,1)*xdphi;
          size_t i1 = size_t(f1+T(1));
          for (size_t t=0; t<supp; ++t)
            wp[t] = kernel((t+i1-f1)*delta - 1);
          psiarr[0]=1.;
          double psi=ptg(i,2);
          double cpsi=cos(psi), spsi=sin(psi);
          double cnpsi=cpsi, snpsi=spsi;
          for (size_t l=1; l<=kmax; ++l)
            {
            psiarr[2*l-1]=T(cnpsi);
            psiarr[2*l]=T(snpsi);
            const double tmp = snpsi*cpsi + cnpsi*spsi;
            cnpsi=cnpsi*cpsi - snpsi*spsi;
            snpsi=tmp;
            }
          // lock the 2x2 block of grid cells we are about to update
          size_t b_theta_new = i0/cellsize, b_phi_new = i1/cellsize;
          if ((b_theta_new!=b_theta) || (b_phi_new!=b_phi))
            {
            if (b_theta<locks.shape(0))  // unlock
              {
              locks.v(b_theta,b_phi).unlock();
              locks.v(b_theta,b_phi+1).unlock();
              locks.v(b_theta+1,b_phi).unlock();
              locks.v(b_theta+1,b_phi+1).unlock();
              }
            b_theta = b_theta_new;
            b_phi = b_phi_new;
            locks.v(b_theta,b_phi).lock();
            locks.v(b_theta,b_phi+1).lock();
            locks.v(b_theta+1,b_phi).lock();
            locks.v(b_theta+1,b_phi+1).lock();
            }
#ifdef SIMD_INTERPOL
          memcpy(reinterpret_cast<T *>(psiarr2.data()), psiarr.data(),
            (2*kmax+1)*sizeof(T));
#endif
          if (ncomp==1)
            {
#ifndef SIMD_INTERPOL
            T val = data(i,0);
            for (size_t j=0; j<supp; ++j)
              for (size_t k=0; k<supp; ++k)
                {
                T t0 = wt[j]*wp[k]*val;
                for (size_t l=0; l<2*kmax+1; ++l)
                  cube.v(i0+j,i1+k,0,l) += t0*psiarr[l];
                }
#else
            native_simd<T> *p=&scube.v(i0,i1,0,0);
            ptrdiff_t d0 = scube.stride(0);
            ptrdiff_t d1 = scube.stride(1);
            switch (nv)
              {
#ifdef SPECIAL_CASING
              case 1: deinterpol_help0<1,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 2: deinterpol_help0<2,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 3: deinterpol_help0<3,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 4: deinterpol_help0<4,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 5: deinterpol_help0<5,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 6: deinterpol_help0<6,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 7: deinterpol_help0<7,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
#endif
              default:
                {
                native_simd<T> val = data(i,0);
                for (size_t j=0; j<supp; ++j, p+=d0)
                  {
                  auto p1=p;
                  T wtj = wt[j];
                  for (size_t k=0; k<supp; ++k, p1+=d1)
                    {
                    native_simd<T> tv = wtj*wp[k]*val;
                    for (size_t l=0; l<nv; ++l)
                      p1[l] += tv*psiarr2[l];
                    }
                  }
                }
              }
#endif
            }
          else // ncomp==3
            {
#ifndef SIMD_INTERPOL
            T v0=data(i,0), v1=data(i,1), v2=data(i,2);
            for (size_t j=0; j<supp; ++j)
              for (size_t k=0; k<supp; ++k)
                {
                T t0 = wt[j]*wp[k];
                for (size_t l=0; l<2*kmax+1; ++l)
                  {
                  T t1 = t0*psiarr[l];
                  cube.v(i0+j,i1+k,0,l) += t1*v0;
                  cube.v(i0+j,i1+k,1,l) += t1*v1;
                  cube.v(i0+j,i1+k,2,l) += t1*v2;
                  }
                }
#else
            native_simd<T> *p=&scube.v(i0,i1,0,0);
            ptrdiff_t d0 = scube.stride(0);
            ptrdiff_t d1 = scube.stride(1);
            switch (nv)
              {
#ifdef SPECIAL_CASING
              case 1: deinterpol_help0<1,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 2: deinterpol_help0<2,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 3: deinterpol_help0<3,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 4: deinterpol_help0<4,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 5: deinterpol_help0<5,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 6: deinterpol_help0<6,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
              case 7: deinterpol_help0<7,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i); break;
#endif
              default:
                {
                native_simd<T> v0=data(i,0), v1=data(i,1), v2=data(i,2);
                ptrdiff_t d2 = scube.stride(2);
                for (size_t j=0; j<supp; ++j, p+=d0)
                  {
                  auto p1=p;
                  T wtj = wt[j];
                  for (size_t k=0; k<supp; ++k, p1+=d1)
                    {
                    native_simd<T> wtjwpk = wtj*wp[k];
                    for (size_t l=0; l<nv; ++l)
                      {
                      auto tmp = wtjwpk*psiarr2[l];
                      p1[l] += tmp*v0;
                      p1[l+d2] += tmp*v1;
                      p1[l+2*d2] += tmp*v2;
                      }
                    }
                  }
                }
              }
#endif
            }
          }
        if (b_theta<locks.shape(0))  // unlock
          {
          locks.v(b_theta,b_phi).unlock();
          locks.v(b_theta,b_phi+1).unlock();
          locks.v(b_theta+1,b_phi).unlock();
          locks.v(b_theta+1,b_phi+1).unlock();
          }
        });
      }

    void getSlm (const vector<Alm<complex<T>>> &blm,
      vector<Alm<complex<T>>> &slm)
      {
      MR_assert(adjoint, "can only be called in adjoint mode");
      MR_assert((blm.size()==ncomp) || (ncomp==1), "incorrect number of beam a_lm sets");
      MR_assert((slm.size()==ncomp) || (ncomp==1), "incorrect number of sky a_lm sets");
      Alm<complex<T>> a1(lmax, lmax), a2(lmax,lmax);
      auto ginfo = sharp_make_cc_geom_info(ntheta0,nphi0,0.,cube.stride(1),cube.stride(0));
      auto ainfo = sharp_make_triangular_alm_info(lmax,lmax,1);

      // move stuff from border regions onto the main grid
      for (size_t i=0; i<ntheta+2*supp; ++i)
        for (size_t j=0; j<supp; ++j)
          for (size_t l=0; l<cube.shape(2); ++l)
            for (size_t k=0; k<cube.shape(3); ++k)
              {
              cube.v(i,j+nphi,l,k) += cube(i,j,l,k);
              cube.v(i,j+supp,l,k) += cube(i,j+nphi+supp,l,k);
              }
      for (size_t i=0; i<supp; ++i)
        for (size_t j=0, j2=nphi/2; j<nphi; ++j,++j2)
          {
          if (j2>=nphi) j2-=nphi;
          for (size_t l=0; l<cube.shape(2); ++l)
            for (size_t k=0; k<cube.shape(3); ++k)
              {
              T fct = (((k+1)/2)&1) ? -1 : 1;
              cube.v(supp+1+i,j+supp,l,k) += fct*cube(supp-1-i,j2+supp,l,k);
              cube.v(supp+ntheta-2-i,j+supp,l,k) += fct*cube(supp+ntheta+i,j2+supp,l,k);
              }
          }

      // special treatment for the pole rows
      for (size_t j=0,j2=nphi/2; j<nphi/2; ++j,++j2)
        for (size_t l=0; l<cube.shape(2); ++l)
          for (size_t k=0; k<cube.shape(3); ++k)
            {
            T fct = (((k+1)/2)&1) ? -1 : 1;
            if (j2>=nphi) j2-=nphi;
            T tval = (cube(supp,j+supp,l,k) + fct*cube(supp,j2+supp,l,k));
            cube.v(supp,j+supp,l,k) = tval;
            cube.v(supp,j2+supp,l,k) = fct*tval;
            tval = (cube(supp+ntheta-1,j+supp,l,k) + fct*cube(supp+ntheta-1,j2+supp,l,k));
            cube.v(supp+ntheta-1,j+supp,l,k) = tval;
            cube.v(supp+ntheta-1,j2+supp,l,k) = fct*tval;
            }

      vector<T> lnorm(lmax+1);
      for (size_t i=0; i<=lmax; ++i)
        lnorm[i]=T(std::sqrt(4*pi/(2*i+1.)));

      for (size_t j=0; j<slm.size(); ++j)
        slm[j].SetToZero();

      for (size_t icomp=0; icomp<ncomp; ++icomp)
        {
        bool separate = ncomp>1;
        {
        auto m1 = cube.template subarray<2>({supp,supp,icomp,0},{ntheta,nphi,0,0});
        decorrect(m1,0);
        sharp_alm2map_adjoint(a1.Alms().vdata(), m1.data(), *ginfo, *ainfo, 0, nthreads);
        for (size_t m=0; m<=lmax; ++m)
          for (size_t l=m; l<=lmax; ++l)
            if (separate)
              slm[icomp](l,m) += conj(a1(l,m))*blm[icomp](l,0).real()*lnorm[l];
            else
              for (size_t j=0; j<blm.size(); ++j)
                slm[j](l,m) += conj(a1(l,m))*blm[j](l,0).real()*lnorm[l];
        }
        for (size_t k=1; k<=kmax; ++k)
          {
          auto m1 = cube.template subarray<2>({supp,supp,icomp,2*k-1},{ntheta,nphi,0,0});
          auto m2 = cube.template subarray<2>({supp,supp,icomp,2*k  },{ntheta,nphi,0,0});
          decorrect(m1,k);
          decorrect(m2,k);
          sharp_alm2map_spin_adjoint(k, a1.Alms().vdata(), a2.Alms().vdata(), m1.data(),
            m2.data(), *ginfo, *ainfo, 0, nthreads);
          for (size_t m=0; m<=lmax; ++m)
            for (size_t l=m; l<=lmax; ++l)
              if (l>=k)
                {
                if (separate)
                  {
                  auto tmp = conj(blm[icomp](l,k))*(-2*lnorm[l]);
                  slm[icomp](l,m) += conj(a1(l,m))*tmp.real();
                  slm[icomp](l,m) -= conj(a2(l,m))*tmp.imag();
                  }
                else
                  for (size_t j=0; j<blm.size(); ++j)
                    {
                    auto tmp = conj(blm[j](l,k))*(-2*lnorm[l]);
                    slm[j](l,m) += conj(a1(l,m))*tmp.real();
                    slm[j](l,m) -= conj(a2(l,m))*tmp.imag();
                    }
                }
          }
        }
      }
  };

}

using detail_totalconvolve::Interpolator;

}

#endif
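/*
 * Minimal usage sketch (not part of the original header; "lmax", "kmax",
 * "npoints" and "nthreads" are placeholder values, and the epsilon/ofmin
 * arguments shown are assumptions, not recommended settings). Forward mode
 * convolves a set of sky a_lm with a beam a_lm and interpolates the result
 * at arbitrary (theta, phi, psi) pointings:
 *
 *   using T = double;
 *   std::vector<Alm<std::complex<T>>> slm, blm;  // Alm from "python/alm.h"
 *   slm.emplace_back(lmax, lmax);   // sky coefficients, mmax == lmax
 *   blm.emplace_back(lmax, kmax);   // beam coefficients, mmax == kmax
 *   // ... fill slm[0] and blm[0] ...
 *   ducc0::Interpolator<T> interp(slm, blm, false, T(1e-5), T(1.5), nthreads);
 *   ducc0::mav<T,2> ptg({npoints,3});   // columns: theta, phi, psi
 *   ducc0::mav<T,2> res({npoints,1});
 *   // ... fill ptg ...
 *   interp.interpol(ptg, res);
 *
 * The adjoint path instead uses the Interpolator(lmax, kmax, ncomp, epsilon,
 * ofmin, nthreads) constructor, scatters samples into the data cube with
 * deinterpol(), and finally extracts the adjoint sky coefficients with
 * getSlm().
 */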