#ifndef GRIDDER_CXX_H
#define GRIDDER_CXX_H

/*
 *  This file is part of nifty_gridder.
 *
 *  nifty_gridder is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  nifty_gridder is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with nifty_gridder; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* Copyright (C) 2019 Max-Planck-Society
   Author: Martin Reinecke */

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <cmath>
#include <complex>
#include <algorithm>
#include <cstdint>
#include <vector>
#include <array>
#include <mutex>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "pocketfft_hdronly.h"

#if defined(__GNUC__)
#define NOINLINE __attribute__((noinline))
#define ALIGNED(align) __attribute__ ((aligned(align)))
#define RESTRICT __restrict__
#else
#define NOINLINE
#define ALIGNED(align)
#define RESTRICT
#endif

namespace gridder {

namespace detail {

using namespace std;

#ifdef _OPENMP
inline size_t my_max_threads() { return size_t(omp_get_max_threads()); }
inline size_t my_thread_num() { return size_t(omp_get_thread_num()); }
#else
constexpr size_t my_max_threads() { return 1; }
constexpr size_t my_thread_num() { return 0; }
#endif

template<typename T> struct VLEN { static constexpr size_t val=1; };

#if (defined(__AVX512F__))
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
#elif (defined(__AVX__))
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
#elif (defined(__SSE2__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#elif (defined(__VSX__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#endif

// uv coordinates are binned into squares of 2^logsquare x 2^logsquare grid
// cells; this constant is used by the index sorting and the Helper buffers.
constexpr int logsquare=4;

// "mav" stands for "multidimensional array view"
template<typename T, size_t ndim> class mav
  {
  static_assert((ndim>0) && (ndim<3), "only supports 1D and 2D arrays");

  private:
    T *d;
    array<size_t, ndim> shp;
    array<ptrdiff_t, ndim> str;

  public:
    mav(T *d_, const array<size_t,ndim> &shp_,
        const array<ptrdiff_t,ndim> &str_)
      : d(d_), shp(shp_), str(str_) {}
    mav(T *d_, const array<size_t,ndim> &shp_)
      : d(d_), shp(shp_)
      {
      str[ndim-1]=1;
      for (size_t d=2; d<=ndim; ++d)
        str[ndim-d] = str[ndim-d+1]*ptrdiff_t(shp[ndim-d+1]);
      }
    T &operator[](size_t i) const
      { return operator()(i); }
    T &operator()(size_t i) const
      {
      static_assert(ndim==1, "ndim must be 1");
      return d[str[0]*ptrdiff_t(i)];
      }
    T &operator()(size_t i, size_t j) const
      {
      static_assert(ndim==2, "ndim must be 2");
      return d[str[0]*ptrdiff_t(i) + str[1]*ptrdiff_t(j)];
      }
    size_t shape(size_t i) const { return shp[i]; }
    const array<size_t,ndim> &shape() const { return shp; }
    size_t size() const
      {
      size_t res=1;
      for (auto v: shp) res*=v;
      return res;
      }
    ptrdiff_t stride(size_t i) const { return str[i]; }
    T *data() const
      { return d; }
    bool contiguous() const
      {
      ptrdiff_t stride=1;
      for (size_t i=0; i<ndim; ++i)
        {
        if (str[ndim-1-i]!=stride) return false;
        stride *= ptrdiff_t(shp[ndim-1-i]);
        }
      return true;
      }
    void fill(const T &val) const
      {
      // not optimized, but this is never performance-critical
      if (ndim==1)
        for (size_t i=0; i<shp[0]; ++i)
          d[str[0]*ptrdiff_t(i)]=val;
      else if (ndim==2)
        for (size_t i=0; i<shp[0]; ++i)
          for (size_t j=0; j<shp[1]; ++j)
            d[str[0]*ptrdiff_t(i) + str[1]*ptrdiff_t(j)]=val;
      }
  };

template<typename T, size_t ndim> using const_mav = mav<const T, ndim>;
template<typename T, size_t ndim> const_mav<T, ndim> cmav
  (const mav<T, ndim> &mav)
  { return const_mav<T, ndim>(mav.data(), mav.shape()); }
template<typename T, size_t ndim> const_mav<T, ndim> nullmav()
  {
  array<size_t,ndim> shp;
  shp.fill(0);
  return const_mav<T, ndim>(nullptr, shp);
  }

template<typename T, size_t ndim> class tmpStorage
  {
  private:
    vector<T> d;
    mav<T,ndim> mav_;

    static size_t prod(const array<size_t,ndim> &shp)
      {
      size_t res=1;
      for (auto v: shp) res*=v;
      return res;
      }

  public:
    tmpStorage(const array<size_t,ndim> &shp)
      : d(prod(shp)), mav_(d.data(), shp) {}
    mav<T,ndim> &getMav() { return mav_; }
    const_mav<T,ndim> getCmav() { return cmav(mav_); }
    void fill(const T & val)
      { std::fill(d.begin(), d.end(), val); }
  };
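/* Example (illustrative sketch, not part of the library interface):
   mav objects are non-owning views, so the caller provides the storage.
   A contiguous 2D view over a flat buffer looks like this:

     std::vector<double> buf(4*5);
     mav<double,2> m(buf.data(), {4,5});  // row-major strides {5,1}
     m.fill(0.);
     m(2,3) = 1.;        // writes buf[2*5+3]
     auto c = cmav(m);   // read-only view of the same data

   tmpStorage simply bundles a std::vector with such a view whenever the
   gridder needs scratch space of its own. */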
//
// basic utilities
//

#if defined (__GNUC__)
#define LOC_ CodeLocation(__FILE__, __LINE__, __PRETTY_FUNCTION__)
#else
#define LOC_ CodeLocation(__FILE__, __LINE__)
#endif

#define myfail(...) \
  do { \
    std::ostringstream os; \
    streamDump__(os, LOC_, "\n", ##__VA_ARGS__, "\n"); \
    throw std::runtime_error(os.str()); \
    } while(0)

#define myassert(cond,...) \
  do { \
    if (cond); \
    else { myfail("Assertion failure\n", ##__VA_ARGS__); } \
    } while(0)

template<typename T>
inline void streamDump__(std::ostream &os, const T& value)
  { os << value; }

template<typename T, typename ... Args>
inline void streamDump__(std::ostream &os, const T& value,
  const Args& ... args)
  {
  os << value;
  streamDump__(os, args...);
  }

// to be replaced with std::source_location once available
class CodeLocation
  {
  private:
    const char *file, *func;
    int line;

  public:
    CodeLocation(const char *file_, int line_, const char *func_=nullptr)
      : file(file_), func(func_), line(line_) {}

    ostream &print(ostream &os) const
      {
      os << "file: " << file << ", line: " << line;
      if (func) os << ", function: " << func;
      return os;
      }
  };

inline std::ostream &operator<<(std::ostream &os, const CodeLocation &loc)
  { return loc.print(os); }

template<size_t ndim> void checkShape
  (const array<size_t, ndim> &shp1, const array<size_t, ndim> &shp2)
  {
  for (size_t i=0; i<ndim; ++i)
    myassert(shp1[i]==shp2[i], "shape mismatch");
  }

template<typename T> inline T fmod1 (T v)
  { return v-floor(v); }

//
// Utilities for Gauss-Legendre quadrature
//

static inline double one_minus_x2 (double x)
  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }

void legendre_prep(int n, vector<double> &x, vector<double> &w,
  size_t nthreads)
  {
  constexpr double pi = 3.141592653589793238462643383279502884197;
  constexpr double eps = 3e-14;
  int m = (n+1)>>1;
  x.resize(m);
  w.resize(m);

  double t0 = 1 - (1-1./n) / (8.*n*n);
  double t1 = 1./(4.*n+2.);

#pragma omp parallel num_threads(nthreads)
{
  int i;
#pragma omp for schedule(dynamic,100)
  for (i=1; i<=m; ++i)
    {
    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;

    int dobreak=0;
    int j=0;
    double dpdx;
    while(1)
      {
      double P_1 = 1.0;
      double P0 = x0;
      double dx, x1;

      for (int k=2; k<=n; k++)
        {
        double P_2 = P_1;
        P_1 = P0;
//        P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
        }

      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);

      /* Newton step */
      x1 = x0 - P0/dpdx;
      dx = x0-x1;
      x0 = x1;
      if (dobreak) break;

      if (abs(dx)<=eps) dobreak=1;
      myassert(++j<100, "convergence problem");
      }

    x[m-i] = x0;
    w[m-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
    }
} // end of parallel region
  }
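/* Usage sketch (illustrative): legendre_prep computes the nonnegative
   Gauss-Legendre nodes x and weights w of order n; the negative nodes
   follow by symmetry. A quick sanity check integrates t^2 over [-1,1]
   (exact value 2/3):

     std::vector<double> x, w;
     legendre_prep(10, x, w, 1);
     double s = 0;
     for (size_t i=0; i<x.size(); ++i)
       s += 2*w[i]*x[i]*x[i];  // factor 2: nodes come in +/- pairs
     // s == 2/3 up to rounding

   (For odd n one node lies at t=0, and this doubling shortcut would count
   it twice; n=10 is a safe choice here.) */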
//
// Start of real gridder functionality
//

struct RowChan
  { size_t row, chan; };

template<typename T> void complex2hartley
  (const const_mav<complex<T>, 2> &grid, const mav<T,2> &grid2,
  size_t nthreads)
  {
  checkShape(grid.shape(), grid2.shape());
  size_t nu=grid.shape(0), nv=grid.shape(1);

#pragma omp parallel for num_threads(nthreads)
  for (size_t u=0; u<nu; ++u)
    {
    size_t xu = (u==0) ? 0 : nu-u;
    for (size_t v=0; v<nv; ++v)
      {
      size_t xv = (v==0) ? 0 : nv-v;
      grid2(u,v) = T(0.5)*(grid( u, v).real()+grid( u, v).imag()+
                           grid(xu,xv).real()-grid(xu,xv).imag());
      }
    }
  }

template<typename T> void hartley2complex
  (const const_mav<T,2> &grid, const mav<complex<T>,2> &grid2,
  size_t nthreads)
  {
  checkShape(grid.shape(), grid2.shape());
  size_t nu=grid.shape(0), nv=grid.shape(1);

#pragma omp parallel for num_threads(nthreads)
  for (size_t u=0; u<nu; ++u)
    {
    size_t xu = (u==0) ? 0 : nu-u;
    for (size_t v=0; v<nv; ++v)
      {
      size_t xv = (v==0) ? 0 : nv-v;
      T v1 = T(0.5)*grid( u, v);
      T v2 = T(0.5)*grid(xu,xv);
      grid2(u,v) = complex<T>(v1+v2, v1-v2);
      }
    }
  }

template<typename T> void hartley2_2D(const const_mav<T,2> &in,
  const mav<T,2> &out, size_t nthreads)
  {
  checkShape(in.shape(), out.shape());
  size_t nu=in.shape(0), nv=in.shape(1);
  ptrdiff_t sz=ptrdiff_t(sizeof(T));
  pocketfft::stride_t stri{sz*in.stride(0), sz*in.stride(1)};
  pocketfft::stride_t stro{sz*out.stride(0), sz*out.stride(1)};
  auto d_i = in.data();
  auto ptmp = out.data();
  pocketfft::r2r_separable_hartley({nu, nv}, stri, stro, {0,1}, d_i, ptmp,
    T(1), nthreads);
#pragma omp parallel for num_threads(nthreads)
  for(size_t i=1; i<(nu+1)/2; ++i)
    for(size_t j=1; j<(nv+1)/2; ++j)
      {
      T a = ptmp[i*nv+j];
      T b = ptmp[(nu-i)*nv+j];
      T c = ptmp[i*nv+nv-j];
      T d = ptmp[(nu-i)*nv+nv-j];
      ptmp[i*nv+j] = T(0.5)*(a+b+c-d);
      ptmp[(nu-i)*nv+j] = T(0.5)*(a+b+d-c);
      ptmp[i*nv+nv-j] = T(0.5)*(a+c+d-b);
      ptmp[(nu-i)*nv+nv-j] = T(0.5)*(b+c+d-a);
      }
  }

class ES_Kernel
  {
  private:
    static constexpr double pi = 3.141592653589793238462643383279502884197;
    double beta;
    int p;
    vector<double> x, wgt, psi;
    size_t supp;

  public:
    ES_Kernel(size_t supp_, size_t nthreads)
      : beta(get_beta(supp_)*supp_), p(int(1.5*supp_+2)), supp(supp_)
      {
      legendre_prep(2*p,x,wgt,nthreads);
      psi=x;
      for (auto &v:psi)
        v=operator()(v);
      }
    double operator()(double v) const
      { return exp(beta*(sqrt(1.-v*v)-1.)); }
    /* Compute correction factors for the ES gridding kernel
       This implementation follows eqs. (3.8) to (3.10) of Barnett et al.
       2018 */
    double corfac(double v) const
      {
      double tmp=0;
      for (int i=0; i<p; ++i)
        tmp += wgt[i]*psi[i]*cos(pi*supp*v*x[i]);
      return 2./(supp*tmp);
      }
    static double get_beta(size_t supp)
      {
      static const vector<double> opt_beta {-1, 0.14, 1.70, 2.08, 2.205,
        2.26, 2.29, 2.307, 2.316, 2.3265, 2.3324, 2.282, 2.294, 2.304,
        2.3138, 2.317};
      myassert(supp<opt_beta.size(), "unsupported support size");
      return opt_beta[supp];
      }

    static size_t get_supp(double epsilon)
      {
      static const vector<double> maxmaperr { 1e8, 0.19, 2.98e-3, 5.98e-5,
        1.11e-6, 2.01e-8, 3.55e-10, 5.31e-12, 8.81e-14, 1.34e-15, 2.17e-17,
        2.12e-19, 2.88e-21, 3.92e-23, 8.21e-25, 7.13e-27 };

      double epssq = epsilon*epsilon;

      for (size_t i=1; i<maxmaperr.size(); ++i)
        if (epssq>maxmaperr[i]) return i;
      myfail("requested epsilon too small - minimum is 1e-13");
      }
  };
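/* The "exponential of semicircle" kernel of Barnett et al. 2018 is
   phi(x) = exp(beta*(sqrt(1-x^2)-1)) for |x|<=1, with beta chosen per
   support size from the table in get_beta. An illustrative evaluation
   (not part of the library interface):

     ES_Kernel k(6, 1);               // support 6, single-threaded setup
     double centre = k(0.);           // == 1, the kernel maximum
     double edge   = k(0.99);         // small but nonzero near the edge
     double c      = k.corfac(0.25);  // grid correction factor

   corfac(v) evaluates the reciprocal Fourier transform of the kernel by
   Gauss-Legendre quadrature and is used to undo the taper that gridding
   imprints on the dirty image. */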
/* Compute correction factors for the ES gridding kernel
   This implementation follows eqs. (3.8) to (3.10) of Barnett et al.
   2018 */
vector<double> correction_factors(size_t n, size_t nval, size_t supp,
  size_t nthreads)
  {
  ES_Kernel kernel(supp, nthreads);
  vector<double> res(nval);
  double xn = 1./n;
#pragma omp parallel for schedule(static) num_threads(nthreads)
  for (size_t k=0; k<nval; ++k)
    res[k] = kernel.corfac(k*xn);
  return res;
  }

template<typename T> struct UVW
  {
  T u, v, w;
  UVW() {}
  UVW(T u_, T v_, T w_) : u(u_), v(v_), w(w_) {}
  UVW operator* (T fct) const
    { return UVW(u*fct, v*fct, w*fct); }
  void Flip() { u=-u; v=-v; w=-w; }
  bool FixW()
    {
    bool flip = w<0;
    if (flip) Flip();
    return flip;
    }
  };

class Baselines
  {
  protected:
    vector<UVW<double>> coord;
    vector<double> f_over_c;
    size_t nrows, nchan;
    size_t shift, mask;

  public:
    template<typename T> Baselines(const const_mav<T,2> &coord_,
      const const_mav<T,1> &freq)
      {
      constexpr double speedOfLight = 299792458.;
      myassert(coord_.shape(1)==3, "dimension mismatch");
      nrows = coord_.shape(0);
      nchan = freq.shape(0);
      shift=0;
      while((size_t(1)<<shift)<nchan) ++shift;
      mask=(size_t(1)<<shift)-1;
      myassert(nrows*(mask+1)<(size_t(1)<<32), "too many entries in MS");
      f_over_c.resize(nchan);
      for (size_t i=0; i<nchan; ++i)
        {
        myassert(freq(i)>0, "negative channel frequency encountered");
        f_over_c[i] = freq(i)/speedOfLight;
        }
      coord.resize(nrows);
      for (size_t i=0; i<coord.size(); ++i)
        coord[i] = UVW<double>(coord_(i,0), coord_(i,1), coord_(i,2));
      }

    RowChan getRowChan(uint32_t index) const
      { return RowChan{index>>shift, index&mask}; }

    UVW<double> effectiveCoord(const RowChan &rc) const
      { return coord[rc.row]*f_over_c[rc.chan]; }
    UVW<double> effectiveCoord(uint32_t index) const
      { return effectiveCoord(getRowChan(index)); }
    size_t Nrows() const { return nrows; }
    size_t Nchannels() const { return nchan; }
    uint32_t getIdx(size_t irow, size_t ichan) const
      { return uint32_t(ichan+(irow<<shift)); }

    void effectiveUVW(const mav<const uint32_t,1> &idx,
      mav<double,2> &res) const
      {
      size_t nvis = idx.shape(0);
      checkShape(res.shape(), {idx.shape(0), 3});
      for (size_t i=0; i<nvis; i++)
        {
        auto uvw = effectiveCoord(idx(i));
        res(i,0) = uvw.u;
        res(i,1) = uvw.v;
        res(i,2) = uvw.w;
        }
      }

    template<typename T> void ms2vis(const mav<const T,2> &ms,
      const mav<const uint32_t,1> &idx, mav<T,1> &vis,
      size_t nthreads) const
      {
      checkShape(ms.shape(), {nrows, nchan});
      size_t nvis = idx.shape(0);
      checkShape(vis.shape(), {nvis});
#pragma omp parallel for num_threads(nthreads)
      for (size_t i=0; i<nvis; ++i)
        {
        auto rc = getRowChan(idx(i));
        vis[i] = ms(rc.row, rc.chan);
        }
      }

    template<typename T> void vis2ms(const mav<const T,1> &vis,
      const mav<const uint32_t,1> &idx, mav<T,2> &ms,
      size_t nthreads) const
      {
      size_t nvis = vis.shape(0);
      checkShape(idx.shape(), {nvis});
      checkShape(ms.shape(), {nrows, nchan});
#pragma omp parallel for num_threads(nthreads)
      for (size_t i=0; i<nvis; ++i)
        {
        auto rc = getRowChan(idx(i));
        ms(rc.row, rc.chan) += vis(i);
        }
      }
  };

class GridderConfig
  {
  protected:
    size_t nx_dirty, ny_dirty;
    double eps, psx, psy;
    size_t supp, nsafe, nu, nv;
    double beta;
    vector<double> cfu, cfv;
    size_t nthreads;
    double ushift, vshift;
    int maxiu0, maxiv0;

    complex<double> wscreen(double x, double y, double w,
      bool adjoint) const
      {
      constexpr double pi = 3.141592653589793238462643383279502884197;
      double tmp = 1-x-y;
      if (tmp<0) return 0.;
      double nm1 = (-x-y)/(sqrt(tmp)+1); // more accurate form of sqrt(1-x-y)-1
      double n = nm1+1., xn = 1./n;
      double phase = 2*pi*w*nm1;
      if (adjoint) phase *= -1;
      return complex<double>(cos(phase)*xn, sin(phase)*xn);
      }

  public:
    GridderConfig(size_t nxdirty, size_t nydirty, double epsilon,
      double pixsize_x, double pixsize_y, size_t nthreads_)
      : nx_dirty(nxdirty), ny_dirty(nydirty), eps(epsilon),
        psx(pixsize_x), psy(pixsize_y),
        supp(ES_Kernel::get_supp(epsilon)), nsafe((supp+1)/2),
        nu(max(2*nsafe,2*nx_dirty)), nv(max(2*nsafe,2*ny_dirty)),
        beta(ES_Kernel::get_beta(supp)*supp),
        cfu(nx_dirty), cfv(ny_dirty), nthreads(nthreads_),
        ushift(supp*(-0.5)+1+nu), vshift(supp*(-0.5)+1+nv),
        maxiu0((nu+nsafe)-supp), maxiv0((nv+nsafe)-supp)
      {
      myassert((nx_dirty&1)==0, "nx_dirty must be even");
      myassert((ny_dirty&1)==0, "ny_dirty must be even");
      myassert(epsilon>0, "epsilon must be positive");
      myassert(pixsize_x>0, "pixsize_x must be positive");
      myassert(pixsize_y>0, "pixsize_y must be positive");

      auto tmp = correction_factors(nu, nx_dirty/2+1, supp, nthreads);
      cfu[nx_dirty/2]=tmp[0];
      cfu[0]=tmp[nx_dirty/2];
      for (size_t i=1; i<nx_dirty/2; ++i)
        cfu[nx_dirty/2-i] = cfu[nx_dirty/2+i] = tmp[i];
      tmp = correction_factors(nv, ny_dirty/2+1, supp, nthreads);
      cfv[ny_dirty/2]=tmp[0];
      cfv[0]=tmp[ny_dirty/2];
      for (size_t i=1; i<ny_dirty/2; ++i)
        cfv[ny_dirty/2-i] = cfv[ny_dirty/2+i] = tmp[i];
      }
    size_t Nxdirty() const { return nx_dirty; }
    size_t Nydirty() const { return ny_dirty; }
    double Epsilon() const { return eps; }
    double Pixsize_x() const { return psx; }
    double Pixsize_y() const { return psy; }
    size_t Nu() const { return nu; }
    size_t Nv() const { return nv; }
    size_t Supp() const { return supp; }
    size_t Nsafe() const { return nsafe; }
    double Beta() const { return beta; }
    size_t Nthreads() const { return nthreads; }

    template<typename T> void apply_taper(const mav<const T,2> &img,
      mav<T,2> &img2, bool divide) const
      {
      checkShape(img.shape(), {nx_dirty, ny_dirty});
      checkShape(img2.shape(), {nx_dirty, ny_dirty});
      if (divide)
        for (size_t i=0; i<nx_dirty; ++i)
          for (size_t j=0; j<ny_dirty; ++j)
            img2(i,j) = img(i,j)/T(cfu[i]*cfv[j]);
      else
        for (size_t i=0; i<nx_dirty; ++i)
          for (size_t j=0; j<ny_dirty; ++j)
            img2(i,j) = img(i,j)*T(cfu[i]*cfv[j]);
      }

    template<typename T> void grid2dirty_post(const mav<T,2> &tmav,
      const mav<T,2> &dirty) const
      {
      checkShape(dirty.shape(), {nx_dirty,ny_dirty});
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          {
          size_t i2 = nu-nx_dirty/2+i;
          if (i2>=nu) i2-=nu;
          size_t j2 = nv-ny_dirty/2+j;
          if (j2>=nv) j2-=nv;
          // FIXME: for some reason g++ warns about double-to-float
          // conversion here, even though there is an explicit cast...
          dirty(i,j) = tmav(i2,j2)*T(cfu[i]*cfv[j]);
          }
      }
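/* Example (illustrative): Baselines packs a (row, channel) pair into one
   32-bit index, with the channel in the lowest ceil(log2(nchan)) bits:

     // assuming a Baselines object "bl" with >=6 rows and >=4 channels
     uint32_t id = bl.getIdx(5, 3);       // row 5, channel 3
     RowChan rc  = bl.getRowChan(id);     // rc.row==5, rc.chan==3
     auto uvw    = bl.effectiveCoord(id); // uvw scaled by freq[3]/c

   Because the row occupies the high bits, numerically sorted indices are
   ordered by row first, then by channel. */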
    template<typename T> void grid2dirty(const const_mav<T,2> &grid,
      const mav<T,2> &dirty) const
      {
      checkShape(grid.shape(), {nu,nv});
      tmpStorage<T,2> tmpdat({nu,nv});
      auto tmav = tmpdat.getMav();
      hartley2_2D<T>(grid, tmav, nthreads);
      grid2dirty_post(tmav, dirty);
      }

    template<typename T> void grid2dirty_c(const mav<const complex<T>,2> &grid,
      mav<complex<T>,2> &dirty) const
      {
      checkShape(grid.shape(), {nu,nv});
      tmpStorage<complex<T>,2> tmpdat({nu,nv});
      auto tmp = tmpdat.getMav();
      constexpr auto sc = ptrdiff_t(sizeof(complex<T>));
      pocketfft::c2c({nu,nv},{grid.stride(0)*sc,grid.stride(1)*sc},
        {tmp.stride(0)*sc, tmp.stride(1)*sc}, {0,1}, pocketfft::BACKWARD,
        grid.data(), tmp.data(), T(1), nthreads);
      grid2dirty_post(tmp, dirty);
      }

    template<typename T> void dirty2grid_pre(const const_mav<T,2> &dirty,
      mav<T,2> &grid) const
      {
      checkShape(dirty.shape(), {nx_dirty, ny_dirty});
      checkShape(grid.shape(), {nu, nv});
      grid.fill(0);
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          {
          size_t i2 = nu-nx_dirty/2+i;
          if (i2>=nu) i2-=nu;
          size_t j2 = nv-ny_dirty/2+j;
          if (j2>=nv) j2-=nv;
          // FIXME: for some reason g++ warns about double-to-float
          // conversion here, even though there is an explicit cast...
          grid(i2,j2) = dirty(i,j)*T(cfu[i]*cfv[j]);
          }
      }

    template<typename T> void dirty2grid(const const_mav<T,2> &dirty,
      mav<T,2> &grid) const
      {
      dirty2grid_pre(dirty, grid);
      hartley2_2D<T>(cmav(grid), grid, nthreads);
      }

    template<typename T> void dirty2grid_c(const const_mav<complex<T>,2> &dirty,
      mav<complex<T>,2> &grid) const
      {
      dirty2grid_pre(dirty, grid);
      constexpr auto sc = ptrdiff_t(sizeof(complex<T>));
      pocketfft::stride_t strides{grid.stride(0)*sc,grid.stride(1)*sc};
      pocketfft::c2c({nu,nv}, strides, strides, {0,1}, pocketfft::FORWARD,
        grid.data(), grid.data(), T(1), nthreads);
      }

    void getpix(double u_in, double v_in, double &u, double &v,
      int &iu0, int &iv0) const
      {
      u=fmod1(u_in*psx)*nu;
      iu0 = min(int(u+ushift)-int(nu), maxiu0);
      v=fmod1(v_in*psy)*nv;
      iv0 = min(int(v+vshift)-int(nv), maxiv0);
      }

    template<typename T> void apply_wscreen(const const_mav<complex<T>,2> &dirty,
      mav<complex<T>,2> &dirty2, double w, bool adjoint) const
      {
      checkShape(dirty.shape(), {nx_dirty, ny_dirty});
      checkShape(dirty2.shape(), {nx_dirty, ny_dirty});

      double x0 = -0.5*nx_dirty*psx,
             y0 = -0.5*ny_dirty*psy;
#pragma omp parallel for num_threads(nthreads) schedule(static)
      for (size_t i=0; i<=nx_dirty/2; ++i)
        {
        double fx = x0+i*psx;
        fx *= fx;
        for (size_t j=0; j<=ny_dirty/2; ++j)
          {
          double fy = y0+j*psy;
          auto ws = complex<T>(wscreen(fx, fy*fy, w, adjoint));
          dirty2(i,j) = dirty(i,j)*ws; // lower left
          size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
          if ((i>0)&&(i<i2))
            {
            dirty2(i2,j) = dirty(i2,j)*ws; // lower right
            if ((j>0)&&(j<j2))
              dirty2(i2,j2) = dirty(i2,j2)*ws; // upper right
            }
          if ((j>0)&&(j<j2))
            dirty2(i,j2) = dirty(i,j2)*ws; // upper left
          }
        }
      }
  };
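/* Example (illustrative): a GridderConfig bundles image geometry, the
   requested accuracy and the derived oversampled uv grid:

     GridderConfig gc(512, 512, 1e-7, 1e-5, 1e-5, 4);
     // gc.Supp(): kernel support selected from epsilon via ES_Kernel
     // gc.Nu()==1024, gc.Nv()==1024: the 2x oversampled uv grid

   All gridding and degridding entry points below operate on such an
   object; the numbers above are made up for the example. */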
template<typename T, typename T2=complex<T>> class Helper
  {
  private:
    const GridderConfig &gconf;
    int nu, nv, nsafe, supp;
    T beta;
    const T2 *grid_r;
    T2 *grid_w;
    int su, sv;
    int iu0, iv0; // start index of the current visibility
    int bu0, bv0; // start index of the current buffer

    vector<T2> rbuf, wbuf;
    bool do_w_gridding;
    double w0, xdw;
    size_t nexp;
    size_t nvecs;
    vector<std::mutex> &locks;

    void dump() const
      {
      if (bu0<-nsafe) return; // nothing written into buffer yet

#pragma omp critical (gridder_writing_to_grid)
{
      int idxu = (bu0+nu)%nu;
      int idxv0 = (bv0+nv)%nv;
      for (int iu=0; iu<su; ++iu)
        {
        int idxv = idxv0;
        locks[idxu].lock();
        for (int iv=0; iv<sv; ++iv)
          {
          grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv];
          if (++idxv>=nv) idxv=0;
          }
        locks[idxu].unlock();
        if (++idxu>=nu) idxu=0;
        }
}
      }

    void load()
      {
      int idxu = (bu0+nu)%nu;
      int idxv0 = (bv0+nv)%nv;
      for (int iu=0; iu<su; ++iu)
        {
        int idxv = idxv0;
        for (int iv=0; iv<sv; ++iv)
          {
          rbuf[iu*sv + iv] = grid_r[idxu*nv + idxv];
          if (++idxv>=nv) idxv=0;
          }
        if (++idxu>=nu) idxu=0;
        }
      }

  public:
    const T2 *p0r;
    T2 *p0w;
    T kernel[64] ALIGNED(64);

    Helper(const GridderConfig &gconf_, const T2 *grid_r_, T2 *grid_w_,
      vector<std::mutex> &locks_, double w0_=-1, double dw_=-1)
      : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
        supp(gconf.Supp()), beta(T(gconf.Beta())), grid_r(grid_r_),
        grid_w(grid_w_),
        su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
        bu0(-1000000), bv0(-1000000),
        rbuf(su*sv*(grid_r!=nullptr),T(0)),
        wbuf(su*sv*(grid_w!=nullptr),T(0)),
        do_w_gridding(dw_>0),
        w0(w0_), xdw(T(1)/dw_),
        nexp(2*supp + do_w_gridding),
        nvecs(VLEN<T>::val*((nexp+VLEN<T>::val-1)/VLEN<T>::val)),
        locks(locks_)
      {}
    ~Helper() { if (grid_w) dump(); }

    int lineJump() const { return sv; }
    T Wfac() const { return kernel[2*supp]; }

    void prep(const UVW<double> &in)
      {
      double u, v;
      gconf.getpix(in.u, in.v, u, v, iu0, iv0);
      double xsupp=2./supp;
      double x0 = xsupp*(iu0-u);
      double y0 = xsupp*(iv0-v);
      for (int i=0; i<supp; ++i)
        {
        kernel[i     ] = T(x0+i*xsupp);
        kernel[i+supp] = T(y0+i*xsupp);
        }
      if (do_w_gridding)
        kernel[2*supp] = T(xdw*xsupp*abs(w0-in.w));
      for (size_t i=nexp; i<nvecs; ++i)
        kernel[i] = 0;
      for (size_t i=0; i<nvecs; ++i)
        kernel[i] = exp(beta*(sqrt(T(1)-kernel[i]*kernel[i])-T(1)));
      if ((iu0<bu0) || (iv0<bv0)
       || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv))
        {
        if (grid_w)
          {
          dump();
          fill(wbuf.begin(), wbuf.end(), T(0));
          }
        bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe;
        bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe;
        if (grid_r) load();
        }
      p0r = grid_r ? rbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
      p0w = grid_w ? wbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
      }
  };

template<class T, class Serv> class SubServ
  {
  private:
    const Serv &srv;
    const_mav<uint32_t,1> subidx;

  public:
    SubServ(const Serv &orig, const const_mav<uint32_t,1> &subidx_)
      : srv(orig), subidx(subidx_){}
    size_t Nvis() const { return subidx.size(); }
    const Baselines &getBaselines() const { return srv.getBaselines(); }
    UVW<double> getCoord(size_t i) const
      { return srv.getCoord(subidx[i]); }
    complex<T> getVis(size_t i) const
      { return srv.getVis(subidx[i]); }
    uint32_t getIdx(size_t i) const { return srv.getIdx(subidx[i]); }
    void setVis (size_t i, const complex<T> &v) const
      { srv.setVis(subidx[i], v); }
    void addVis (size_t i, const complex<T> &v) const
      { srv.addVis(subidx[i], v); }
  };

template<class T, class T2> class VisServ
  {
  private:
    const Baselines &baselines;
    const_mav<uint32_t,1> idx;
    T2 vis;
    const_mav<T,1> wgt;
    size_t nvis;
    bool have_wgt;

  public:
    VisServ(const Baselines &baselines_,
      const const_mav<uint32_t,1> &idx_, T2 vis_,
      const const_mav<T,1> &wgt_)
      : baselines(baselines_), idx(idx_), vis(vis_), wgt(wgt_),
        nvis(vis.shape(0)), have_wgt(wgt.size()!=0)
      {
      checkShape(idx.shape(), {nvis});
      if (have_wgt) checkShape(wgt.shape(), {nvis});
      }
    SubServ<T, VisServ> getSubserv(const const_mav<uint32_t,1> &subidx) const
      { return SubServ<T, VisServ>(*this, subidx); }
    size_t Nvis() const { return nvis; }
    const Baselines &getBaselines() const { return baselines; }
    UVW<double> getCoord(size_t i) const
      { return baselines.effectiveCoord(idx[i]); }
    complex<T> getVis(size_t i) const
      { return have_wgt ? vis(i)*wgt(i) : vis(i); }
    uint32_t getIdx(size_t i) const { return idx[i]; }
    void setVis (size_t i, const complex<T> &v) const
      { vis(i) = have_wgt ? v*wgt(i) : v; }
    void addVis (size_t i, const complex<T> &v) const
      { vis(i) += have_wgt ? v*wgt(i) : v; }
  };
template<class T, class T2> VisServ<T, T2> makeVisServ
  (const Baselines &baselines,
   const const_mav<uint32_t,1> &idx, const T2 &vis,
   const const_mav<T,1> &wgt)
  { return VisServ<T, T2>(baselines, idx, vis, wgt); }

template<class T, class T2> class MsServ
  {
  private:
    const Baselines &baselines;
    const_mav<uint32_t,1> idx;
    T2 ms;
    const_mav<T,2> wgt;
    size_t nvis;
    bool have_wgt;

  public:
    MsServ(const Baselines &baselines_,
      const const_mav<uint32_t,1> &idx_, T2 ms_,
      const const_mav<T,2> &wgt_)
      : baselines(baselines_), idx(idx_), ms(ms_), wgt(wgt_),
        nvis(idx.shape(0)), have_wgt(wgt.size()!=0)
      {
      checkShape(ms.shape(), {baselines.Nrows(), baselines.Nchannels()});
      if (have_wgt) checkShape(wgt.shape(), ms.shape());
      }
    SubServ<T, MsServ> getSubserv(const const_mav<uint32_t,1> &subidx) const
      { return SubServ<T, MsServ>(*this, subidx); }
    size_t Nvis() const { return nvis; }
    const Baselines &getBaselines() const { return baselines; }
    UVW<double> getCoord(size_t i) const
      { return baselines.effectiveCoord(idx(i)); }
    complex<T> getVis(size_t i) const
      {
      auto rc = baselines.getRowChan(idx(i));
      return have_wgt ? ms(rc.row, rc.chan)*wgt(rc.row, rc.chan)
                      : ms(rc.row, rc.chan);
      }
    uint32_t getIdx(size_t i) const { return idx[i]; }
    void setVis (size_t i, const complex<T> &v) const
      {
      auto rc = baselines.getRowChan(idx(i));
      ms(rc.row, rc.chan) = have_wgt ? v*wgt(rc.row, rc.chan) : v;
      }
    void addVis (size_t i, const complex<T> &v) const
      {
      auto rc = baselines.getRowChan(idx(i));
      ms(rc.row, rc.chan) += have_wgt ? v*wgt(rc.row, rc.chan) : v;
      }
  };
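/* The gridding/degridding kernels below are written against a small "Serv"
   concept (Nvis/getCoord/getVis/setVis/addVis), so the same code can read
   visibilities from a flat vector (VisServ) or straight out of a
   measurement set (MsServ). Illustrative use, with all inputs assumed to
   be set up already:

     auto srv = makeVisServ(bl, idx, vis, wgt);
     for (size_t i=0; i<srv.Nvis(); ++i)
       {
       auto coord = srv.getCoord(i);  // effective (u,v,w) of sample i
       auto value = srv.getVis(i);    // weighted visibility
       }
*/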
template<class T, class T2> MsServ<T, T2> makeMsServ
  (const Baselines &baselines,
   const const_mav<uint32_t,1> &idx, const T2 &ms,
   const const_mav<T,2> &wgt)
  { return MsServ<T, T2>(baselines, idx, ms, wgt); }

template<typename T, typename Serv> void x2grid_c
  (const GridderConfig &gconf, const Serv &srv, mav<complex<T>,2> &grid,
  double w0=-1, double dw=-1)
  {
  checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()});
  myassert(grid.contiguous(), "grid is not contiguous");
  size_t supp = gconf.Supp();
  size_t nthreads = gconf.Nthreads();
  bool do_w_gridding = dw>0;
  vector<std::mutex> locks(gconf.Nu());

#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, nullptr, grid.data(), locks, w0, dw);
  int jump = hlp.lineJump();
  const T * RESTRICT ku = hlp.kernel;
  const T * RESTRICT kv = hlp.kernel+supp;

  size_t np = srv.Nvis();
// Loop over sampling points
#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<np; ++ipart)
    {
    UVW<double> coord = srv.getCoord(ipart);
    auto flip = coord.FixW();
    hlp.prep(coord);
    auto v(srv.getVis(ipart));
    if (do_w_gridding) v*=hlp.Wfac();
    if (flip) v=conj(v);

    auto * RESTRICT ptr = hlp.p0w;
    for (size_t cu=0; cu<supp; ++cu)
      {
      complex<T> tmp(v*ku[cu]);
      size_t cv=0;
      for (; cv+3<supp; cv+=4)
        {
        ptr[cv  ] += tmp*kv[cv  ];
        ptr[cv+1] += tmp*kv[cv+1];
        ptr[cv+2] += tmp*kv[cv+2];
        ptr[cv+3] += tmp*kv[cv+3];
        }
      for (; cv<supp; ++cv)
        ptr[cv] += tmp*kv[cv];
      ptr += jump;
      }
    }
}
  }

template<typename T> void vis2grid_c
  (const Baselines &baselines, const GridderConfig &gconf,
  const const_mav<uint32_t,1> &idx, const const_mav<complex<T>,1> &vis,
  mav<complex<T>,2> &grid, const const_mav<T,1> &wgt,
  double w0=-1, double dw=-1)
  { x2grid_c(gconf, makeVisServ(baselines, idx, vis, wgt), grid, w0, dw); }

template<typename T> void ms2grid_c
  (const Baselines &baselines, const GridderConfig &gconf,
  const const_mav<uint32_t,1> &idx, const const_mav<complex<T>,2> &ms,
  mav<complex<T>,2> &grid, const const_mav<T,2> &wgt)
  { x2grid_c(gconf, makeMsServ(baselines, idx, ms, wgt), grid); }

template<typename T, typename Serv> void grid2x_c
  (const GridderConfig &gconf, const const_mav<complex<T>,2> &grid,
  const Serv &srv, double w0=-1, double dw=-1)
  {
  checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()});
  myassert(grid.contiguous(), "grid is not contiguous");
  size_t supp = gconf.Supp();
  size_t nthreads = gconf.Nthreads();
  bool do_w_gridding = dw>0;
  vector<std::mutex> locks(gconf.Nu());

// Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
  int jump = hlp.lineJump();
  const T * RESTRICT ku = hlp.kernel;
  const T * RESTRICT kv = hlp.kernel+supp;

  size_t np = srv.Nvis();
#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<np; ++ipart)
    {
    UVW<double> coord = srv.getCoord(ipart);
    auto flip = coord.FixW();
    hlp.prep(coord);
    complex<T> r = 0;
    const auto * RESTRICT ptr = hlp.p0r;
    for (size_t cu=0; cu<supp; ++cu)
      {
      complex<T> tmp(0);
      size_t cv=0;
      for (; cv+3<supp; cv+=4)
        tmp += ptr[cv  ]*kv[cv  ] + ptr[cv+1]*kv[cv+1]
             + ptr[cv+2]*kv[cv+2] + ptr[cv+3]*kv[cv+3];
      for (; cv<supp; ++cv)
        tmp += ptr[cv]*kv[cv];
      r += tmp*ku[cu];
      ptr += jump;
      }
    if (flip) r=conj(r);
    if (do_w_gridding) r*=hlp.Wfac();
    srv.addVis(ipart, r);
    }
}
  }

template<typename T> void grid2vis_c
  (const Baselines &baselines, const GridderConfig &gconf,
  const const_mav<uint32_t,1> &idx, const const_mav<complex<T>,2> &grid,
  mav<complex<T>,1> &vis, const const_mav<T,1> &wgt,
  double w0=-1, double dw=-1)
  { grid2x_c(gconf, grid, makeVisServ(baselines, idx, vis, wgt), w0, dw); }

template<typename T> void grid2ms_c
  (const Baselines &baselines, const GridderConfig &gconf,
  const const_mav<uint32_t,1> &idx, const const_mav<complex<T>,2> &grid,
  mav<complex<T>,2> &ms, const const_mav<T,2> &wgt)
  { grid2x_c(gconf, grid, makeMsServ(baselines, idx, ms, wgt)); }

template<typename T> void apply_holo
  (const Baselines &baselines, const GridderConfig &gconf,
  const const_mav<uint32_t,1> &idx, const const_mav<complex<T>,2> &grid,
  mav<complex<T>,2> &ogrid, const const_mav<T,1> &wgt)
  {
  checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()});
  size_t nvis = idx.shape(0);
  size_t nthreads = gconf.Nthreads();
  bool have_wgt = wgt.size()!=0;
  if (have_wgt) checkShape(wgt.shape(), {nvis});
  checkShape(ogrid.shape(), grid.shape());
  ogrid.fill(0);
  size_t supp = gconf.Supp();
  vector<std::mutex> locks(gconf.Nu());

// Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, grid.data(), ogrid.data(), locks);
  int jump = hlp.lineJump();
  const T * RESTRICT ku = hlp.kernel;
  const T * RESTRICT kv = hlp.kernel+supp;

#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    UVW<double> coord = baselines.effectiveCoord(idx(ipart));
    hlp.prep(coord);
    complex<T> r = 0;
    const auto * RESTRICT ptr = hlp.p0r;
    for (size_t cu=0; cu<supp; ++cu)
      {
      complex<T> tmp(0);
      size_t cv=0;
      for (;cv<supp; ++cv)
        tmp += ptr[cv]*kv[cv];
      r += tmp*ku[cu];
      ptr += jump;
      }
    if (have_wgt)
      {
      auto twgt = wgt(ipart);
      r *= twgt*twgt;
      }
    auto * RESTRICT wptr = hlp.p0w;
    for (size_t cu=0; cu<supp; ++cu)
      {
      complex<T> tmp(r*ku[cu]);
      size_t cv=0;
      for (;cv<supp; ++cv)
        wptr[cv] += tmp*kv[cv];
      wptr += jump;
      }
    }
}
  }
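/* apply_holo applies the "holographic" normal operator G^H W^2 G entirely
   in grid space: each sample is interpolated from the input grid
   (degridding), multiplied by its squared weight, and immediately
   re-gridded onto the output grid. Conceptually (sketch, not code to run):

     // for gridded data x:
     //   apply_holo(bl, gc, idx, x, y, wgt)
     // produces the same y as
     //   grid2vis_c(bl, gc, idx, x, v, wgt);  // v = W G x
     //   vis2grid_c(bl, gc, idx, v, y, wgt);  // y = G^H W v
     // but without materializing the visibility vector v.
*/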
template<typename T> void get_correlations
  (const Baselines &baselines, const GridderConfig &gconf,
  const const_mav<uint32_t,1> &idx, int du, int dv,
  mav<T,2> &ogrid, const const_mav<T,1> &wgt)
  {
  size_t nvis = idx.shape(0);
  bool have_wgt = wgt.size()!=0;
  if (have_wgt) checkShape(wgt.shape(), {nvis});
  checkShape(ogrid.shape(), {gconf.Nu(), gconf.Nv()});
  size_t supp = gconf.Supp();
  myassert(size_t(abs(du))<supp, "|du| must be smaller than Supp()");
  myassert(size_t(abs(dv))<supp, "|dv| must be smaller than Supp()");
  size_t nthreads = gconf.Nthreads();
  ogrid.fill(0);
  vector<std::mutex> locks(gconf.Nu());

  size_t u0, u1, v0, v1;
  if (du>=0)
    { u0=0; u1=supp-du; }
  else
    { u0=-du; u1=supp; }
  if (dv>=0)
    { v0=0; v1=supp-dv; }
  else
    { v0=-dv; v1=supp; }

// Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T,T> hlp(gconf, nullptr, ogrid.data(), locks);
  int jump = hlp.lineJump();
  const T * RESTRICT ku = hlp.kernel;
  const T * RESTRICT kv = hlp.kernel+supp;

#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    UVW<double> coord = baselines.effectiveCoord(idx(ipart));
    hlp.prep(coord);
    auto * RESTRICT wptr = hlp.p0w + u0*jump;
    T f0 = T(1);
    if (have_wgt)
      {
      auto twgt = wgt(ipart);
      f0 *= twgt*twgt;
      }
    for (size_t cu=u0; cu<u1; ++cu)
      {
      T f1 = ku[cu]*ku[cu+du]*f0;
      for (size_t cv=v0; cv<v1; ++cv)
        wptr[cv] += f1*kv[cv]*kv[cv+dv];
      wptr += jump;
      }
    }
}
  }

template<typename T> void apply_wcorr(const GridderConfig &gconf,
  const mav<T,2> &dirty, const ES_Kernel &kernel, double dw)
  {
  auto nx_dirty=gconf.Nxdirty();
  auto ny_dirty=gconf.Nydirty();
  size_t nthreads = gconf.Nthreads();
  auto psx=gconf.Pixsize_x();
  auto psy=gconf.Pixsize_y();
  double x0 = -0.5*nx_dirty*psx,
         y0 = -0.5*ny_dirty*psy;
#pragma omp parallel for schedule(static) num_threads(nthreads)
  for (size_t i=0; i<=nx_dirty/2; ++i)
    {
    double fx = x0+i*psx;
    fx *= fx;
    for (size_t j=0; j<=ny_dirty/2; ++j)
      {
      double fy = y0+j*psy;
      fy*=fy;
      T fct = 0;
      double tmp = 1.-fx-fy;
      if (tmp>=0)
        {
        auto nm1 = (-fx-fy)/(sqrt(1.-fx-fy)+1.); // accurate form of sqrt(1-x-y)-1
        fct = T((nm1<=-1) ? 0. : kernel.corfac(nm1*dw));
        }
      size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
      dirty(i,j)*=fct;
      if ((i>0)&&(i<i2))
        {
        dirty(i2,j)*=fct;
        if ((j>0)&&(j<j2))
          dirty(i2,j2)*=fct;
        }
      if ((j>0)&&(j<j2))
        dirty(i,j2)*=fct;
      }
    }
  }

template<typename T, typename Serv> void wminmax(const GridderConfig &gconf,
  const Serv &srv, double &wmin, double &wmax)
  {
  size_t nvis = srv.Nvis();
  size_t nthreads = gconf.Nthreads();
  wmin= 1e38;
  wmax=-1e38;
// FIXME maybe this can be done more intelligently
#pragma omp parallel for num_threads(nthreads) reduction(min:wmin) reduction(max:wmax)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    double w = abs(srv.getCoord(ipart).w);
    wmin = min(wmin, w);
    wmax = max(wmax, w);
    }
  }

template<typename T> void update_idx(vector<T> &v, vector<T> &vold,
  const vector<T> &add, const vector<T> &del)
  {
  vold.resize(0);
  vold.reserve((v.size()+add.size())-del.size());
  auto iin=v.begin(), ein=v.end();
  auto iadd=add.begin(), eadd=add.end();
  auto irem=del.begin(), erem=del.end();

  while(iin!=ein)
    {
    if (irem!=erem)
      {
      while ((iin!=ein) && (*iin==*irem))
        { ++irem; ++iin; } // skip the entries to be removed
      if (iin==ein) break;
      }
    if (iadd!=eadd)
      while ((iadd!=eadd) && (*iadd<*iin))
        vold.push_back(*(iadd++));
    vold.push_back(*(iin++));
    }
  while(iadd!=eadd)
    vold.push_back(*(iadd++));
  v.swap(vold);
  }
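/* update_idx maintains the sorted list of visibility indices active on the
   current w plane: entries in "del" leave the plane window, entries in
   "add" enter it, and all three inputs must be sorted. A small worked
   example (illustrative):

     std::vector<uint32_t> v{0,2,4}, scratch;
     update_idx(v, scratch, std::vector<uint32_t>{1,5},
                std::vector<uint32_t>{2});
     // v is now {0,1,4,5}
*/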
template<typename T, typename Serv> void wstack_common(
  const GridderConfig &gconf, const Serv &srv, double &wmin, double &dw,
  size_t &nplanes, vector<size_t> &nvis_plane,
  vector<vector<uint32_t>> &minplane, size_t verbosity)
  {
  size_t nvis = srv.Nvis();
  size_t nthreads = gconf.Nthreads();
  double wmax;

  wminmax<T>(gconf, srv, wmin, wmax);
#ifdef _OPENMP
  if (verbosity>0) cout << "Using " << nthreads << " threads" << endl;
#else
  if (verbosity>0) cout << "Using single-threaded mode" << endl;
#endif
  if (verbosity>0) cout << "W range: " << wmin << " to " << wmax << endl;

  double x0 = -0.5*gconf.Nxdirty()*gconf.Pixsize_x(),
         y0 = -0.5*gconf.Nydirty()*gconf.Pixsize_y();
  double nmin = sqrt(max(1.-x0*x0-y0*y0,0.))-1.;
  dw = 0.25/abs(nmin);
  nplanes = size_t((wmax-wmin)/dw+2);
  dw = (1.+1e-13)*(wmax-wmin)/(nplanes-1);

  auto supp = gconf.Supp();
  wmin -= (0.5*supp-1)*dw;
  wmax += (0.5*supp-1)*dw;
  nplanes += supp-2;
  if (verbosity>0) cout << "Kernel support: " << supp << endl;
  if (verbosity>0) cout << "nplanes: " << nplanes << endl;

  nvis_plane.resize(nplanes,0);
  minplane.resize(nplanes);
  vector<size_t> tcnt(my_max_threads()*nplanes,0);
#pragma omp parallel num_threads(nthreads)
{
  vector<size_t> mytcnt(nplanes,0);
  vector<size_t> nvp(nplanes,0);
  auto mythread = my_thread_num();
#pragma omp for schedule(static)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    double w = abs(srv.getCoord(ipart).w);
    size_t plane0 = size_t(max(0,int(1+(w-wmin)/dw)-int(supp)));
    ++mytcnt[plane0];
    for (size_t i=plane0; i<min<size_t>(nplanes,plane0+supp); ++i)
      ++nvp[i];
    }
#pragma omp critical (wstack_common)
{
  for (size_t i=0; i<nplanes; ++i)
    {
    tcnt[mythread*nplanes+i] = mytcnt[i];
    nvis_plane[i] += nvp[i];
    }
}
}
  {
  vector<size_t> myofs(nplanes, 0);
  for (size_t j=0; j<nplanes; ++j)
    for (size_t i=0; i<my_max_threads(); ++i)
      {
      auto tmp = tcnt[i*nplanes+j];
      tcnt[i*nplanes+j] = myofs[j];
      myofs[j] += tmp;
      }
  for (size_t j=0; j<nplanes; ++j)
    minplane[j].resize(myofs[j]);
  }
#pragma omp parallel num_threads(nthreads)
{
  vector<size_t> myofs(nplanes, 0);
  auto mythread = my_thread_num();
#pragma omp for schedule(static)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    double w = abs(srv.getCoord(ipart).w);
    size_t plane0 = size_t(max(0,int(1+(w-wmin)/dw)-int(supp)));
    minplane[plane0][tcnt[mythread*nplanes+plane0]+myofs[plane0]++]
      = uint32_t(ipart);
    }
}
  }

template<typename T, typename Serv> void x2dirty(
  const GridderConfig &gconf, const Serv &srv, const mav<T,2> &dirty,
  bool do_wstacking, size_t verbosity)
  {
  if (do_wstacking)
    {
    auto nx_dirty=gconf.Nxdirty();
    auto ny_dirty=gconf.Nydirty();
    auto supp = gconf.Supp();
    size_t nthreads = gconf.Nthreads();

    double wmin;
    double dw;
    size_t nplanes;
    vector<size_t> nvis_plane;
    vector<vector<uint32_t>> minplane;
    if (verbosity>0) cout << "Gridding using improved w-stacking" << endl;
    wstack_common<T>(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane,
      verbosity);

    dirty.fill(0);
    vector<uint32_t> subidx, subidx_old;
    tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()});
    auto grid=grid_.getMav();
    tmpStorage<complex<T>,2> tdirty_(dirty.shape());
    auto tdirty=tdirty_.getMav();
    for (size_t iw=0; iw<nplanes; ++iw)
      {
      if (verbosity>1)
        cout << "Working on plane " << iw << " containing "
             << nvis_plane[iw] << " visibilities" << endl;

      double wcur = wmin+iw*dw;
      update_idx(subidx, subidx_old, minplane[iw],
        iw>=supp ? minplane[iw-supp] : vector<uint32_t>());
      auto subidx2 = const_mav<uint32_t, 1>(subidx.data(), {subidx.size()});
      auto subsrv = srv.getSubserv(subidx2);
      grid.fill(0);
      x2grid_c(gconf, subsrv, grid, wcur, dw);
      gconf.grid2dirty_c(cmav(grid), tdirty);
      gconf.apply_wscreen(cmav(tdirty), tdirty, wcur, true);
#pragma omp parallel for num_threads(nthreads)
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          dirty(i,j) += tdirty(i,j).real();
      }
    // correct for w gridding
    apply_wcorr(gconf, dirty, ES_Kernel(supp, nthreads), dw);
    }
  else
    {
    if (verbosity>0)
      cout << "Gridding without w-stacking: " << srv.Nvis()
           << " visibilities" << endl;
    if (verbosity>0) cout << "Using " << gconf.Nthreads() << " threads" << endl;

    tmpStorage<complex<T>,2> grid_({gconf.Nu(), gconf.Nv()});
    auto grid=grid_.getMav();
    grid_.fill(0.);
    x2grid_c(gconf, srv, grid);
    tmpStorage<T,2> rgrid_(grid.shape());
    auto rgrid=rgrid_.getMav();
    complex2hartley(cmav(grid), rgrid, gconf.Nthreads());
    gconf.grid2dirty(cmav(rgrid), dirty);
    }
  }

template<typename T> void vis2dirty(const Baselines &baselines,
  const GridderConfig &gconf, const const_mav<uint32_t,1> &idx,
  const const_mav<complex<T>,1> &vis, const const_mav<T,1> &wgt,
  mav<T,2> &dirty, bool do_wstacking, size_t verbosity)
  {
  x2dirty(gconf, makeVisServ(baselines, idx, vis, wgt), dirty,
    do_wstacking, verbosity);
  }
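/* Illustrative numbers for the plane spacing chosen by wstack_common:
   with nmin the most negative n(l,m)-1 in the field, the spacing is
   dw = 0.25/|nmin|, i.e. at most a quarter period of the w phase between
   adjacent planes, and the w range is padded by supp/2-1 planes on each
   side so that every visibility gets full kernel support. For instance:

     // wmax-wmin = 1000 wavelengths, |nmin| = 0.05
     //   => dw = 5,  nplanes ~ (wmax-wmin)/dw + supp = 200 + supp
*/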
template<typename T, typename Serv> void dirty2x(
  const GridderConfig &gconf, const const_mav<T,2> &dirty,
  const Serv &srv, bool do_wstacking, size_t verbosity)
  {
  if (do_wstacking)
    {
    auto nx_dirty=gconf.Nxdirty();
    auto ny_dirty=gconf.Nydirty();
    auto supp = gconf.Supp();
    size_t nthreads = gconf.Nthreads();

    double wmin;
    double dw;
    size_t nplanes;
    vector<size_t> nvis_plane;
    vector<vector<uint32_t>> minplane;
    if (verbosity>0) cout << "Degridding using improved w-stacking" << endl;
    wstack_common<T>(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane,
      verbosity);

    tmpStorage<T,2> tdirty_({nx_dirty,ny_dirty});
    auto tdirty=tdirty_.getMav();
    for (size_t i=0; i<nx_dirty; ++i)
      for (size_t j=0; j<ny_dirty; ++j)
        tdirty(i,j) = dirty(i,j);
    // correct for w gridding
    apply_wcorr(gconf, tdirty, ES_Kernel(supp, nthreads), dw);

    vector<uint32_t> subidx, subidx_old;
    tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()});
    auto grid=grid_.getMav();
    tmpStorage<complex<T>,2> tdirty2_({nx_dirty,ny_dirty});
    auto tdirty2=tdirty2_.getMav();
    for (size_t iw=0; iw<nplanes; ++iw)
      {
      if (verbosity>1)
        cout << "Working on plane " << iw << " containing "
             << nvis_plane[iw] << " visibilities" << endl;

      double wcur = wmin+iw*dw;
#pragma omp parallel for num_threads(nthreads)
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          tdirty2(i,j) = tdirty(i,j);
      gconf.apply_wscreen(cmav(tdirty2), tdirty2, wcur, false);
      gconf.dirty2grid_c(cmav(tdirty2), grid);

      update_idx(subidx, subidx_old, minplane[iw],
        iw>=supp ? minplane[iw-supp] : vector<uint32_t>());
      auto subidx2 = const_mav<uint32_t, 1>(subidx.data(), {subidx.size()});
      auto subsrv = srv.getSubserv(subidx2);
      grid2x_c(gconf, cmav(grid), subsrv, wcur, dw);
      }
    }
  else
    {
    if (verbosity>0)
      cout << "Degridding without w-stacking: " << srv.Nvis()
           << " visibilities" << endl;
    if (verbosity>0) cout << "Using " << gconf.Nthreads() << " threads" << endl;

    tmpStorage<T,2> grid_({gconf.Nu(), gconf.Nv()});
    auto grid=grid_.getMav();
    gconf.dirty2grid(dirty, grid);
    tmpStorage<complex<T>,2> grid2_(grid.shape());
    auto grid2=grid2_.getMav();
    hartley2complex(cmav(grid), grid2, gconf.Nthreads());
    grid2x_c(gconf, cmav(grid2), srv);
    }
  }

template<typename T> void dirty2vis(const Baselines &baselines,
  const GridderConfig &gconf, const const_mav<uint32_t,1> &idx,
  const const_mav<T,2> &dirty, const const_mav<T,1> &wgt,
  mav<complex<T>,1> &vis, bool do_wstacking, size_t verbosity)
  {
  dirty2x(gconf, dirty, makeVisServ(baselines, idx, vis, wgt),
    do_wstacking, verbosity);
  }

size_t getIdxSize(const Baselines &baselines,
  const const_mav<bool,2> &flags, int chbegin, int chend,
  double wmin, double wmax, size_t nthreads)
  {
  size_t nrow=baselines.Nrows(),
         nchan=baselines.Nchannels();
  if (chbegin<0) chbegin=0;
  if (chend<0) chend=nchan;
  myassert(chend>chbegin, "empty channel range selected");
  myassert(chend<=int(nchan), "chend too large");
  myassert(wmax>wmin, "empty w range selected");
  checkShape(flags.shape(), {nrow, nchan});
  size_t res=0;
#pragma omp parallel for num_threads(nthreads) reduction(+:res)
  for (size_t irow=0; irow<nrow; ++irow)
    for (int ichan=chbegin; ichan<chend; ++ichan)
      if (!flags(irow,ichan))
        {
        auto w = baselines.effectiveCoord(RowChan{irow,size_t(ichan)}).w;
        if ((w>=wmin) && (w<wmax))
          ++res;
        }
  return res;
  }

void fillIdx(const Baselines &baselines,
  const GridderConfig &gconf, const const_mav<bool,2> &flags, int chbegin,
  int chend, double wmin, double wmax, mav<uint32_t,1> &res)
  {
  size_t nrow=baselines.Nrows(),
         nchan=baselines.Nchannels(),
         nsafe=gconf.Nsafe();
  if (chbegin<0) chbegin=0;
  if (chend<0) chend=nchan;
  myassert(chend>chbegin, "empty channel range selected");
  myassert(chend<=int(nchan), "chend too large");
  myassert(wmax>wmin, "empty w range selected");
  checkShape(flags.shape(), {nrow, nchan});

  constexpr int side=1<<logsquare;
  size_t nbu = (gconf.Nu()+1+side-1) >> logsquare,
         nbv = (gconf.Nv()+1+side-1) >> logsquare;
  vector<uint32_t> acc(nbu*nbv+1, 0);
  vector<uint32_t> tmp(nrow*(chend-chbegin));

  for (size_t irow=0, idx=0; irow<nrow; ++irow)
    for (int ichan=chbegin; ichan<chend; ++ichan)
      if (!flags(irow,ichan))
        {
        auto uvw = baselines.effectiveCoord(RowChan{irow,size_t(ichan)});
        if ((uvw.w>=wmin) && (uvw.w<wmax))
          {
          double u, v;
          int iu0, iv0;
          gconf.getpix(uvw.u, uvw.v, u, v, iu0, iv0);
          iu0 = (iu0+nsafe)>>logsquare;
          iv0 = (iv0+nsafe)>>logsquare;
          ++acc[nbv*iu0 + iv0 + 1];
          tmp[idx++] = nbv*iu0 + iv0;
          }
        }

  for (size_t i=1; i<acc.size(); ++i)
    acc[i] += acc[i-1];

  for (size_t irow=0, idx=0; irow<nrow; ++irow)
    for (int ichan=chbegin; ichan<chend; ++ichan)
      if (!flags(irow,ichan))
        {
        auto w = baselines.effectiveCoord(RowChan{irow,size_t(ichan)}).w;
        if ((w>=wmin) && (w<wmax))
          res(acc[tmp[idx++]]++) = baselines.getIdx(irow, ichan);
        }
  }

template<typename T> vector<uint32_t> getWgtIndices(const Baselines &baselines,
  const GridderConfig &gconf, const const_mav<T,2> &wgt)
  {
  size_t nrow=baselines.Nrows(),
         nchan=baselines.Nchannels(),
         nsafe=gconf.Nsafe();
  bool have_wgt=wgt.size()!=0;
  if (have_wgt) checkShape(wgt.shape(),{nrow,nchan});
  constexpr int side=1<<logsquare;
  size_t nbu = (gconf.Nu()+1+side-1) >> logsquare,
         nbv = (gconf.Nv()+1+side-1) >> logsquare;
  vector<uint32_t> acc(nbu*nbv+1, 0);
  vector<uint32_t> tmp(nrow*nchan);

  for (size_t irow=0, idx=0; irow<nrow; ++irow)
    for (size_t ichan=0; ichan<nchan; ++ichan)
      if ((!have_wgt) || (wgt(irow,ichan)!=0))
        {
        auto uvw = baselines.effectiveCoord(RowChan{irow,ichan});
        double u, v;
        int iu0, iv0;
        gconf.getpix(uvw.u, uvw.v, u, v, iu0, iv0);
        iu0 = (iu0+nsafe)>>logsquare;
        iv0 = (iv0+nsafe)>>logsquare;
        ++acc[nbv*iu0 + iv0 + 1];
        tmp[idx++] = nbv*iu0 + iv0;
        }

  for (size_t i=1; i<acc.size(); ++i)
    acc[i] += acc[i-1];

  vector<uint32_t> res(acc.back());
  for (size_t irow=0, idx=0; irow<nrow; ++irow)
    for (size_t ichan=0; ichan<nchan; ++ichan)
      if ((!have_wgt) || (wgt(irow,ichan)!=0))
        res[acc[tmp[idx++]]++] = baselines.getIdx(irow, ichan);
  return res;
  }

template<typename T> void ms2dirty(const const_mav<double,2> &uvw,
  const const_mav<double,1> &freq, const const_mav<complex<T>,2> &ms,
  const const_mav<T,2> &wgt, double pixsize_x, double pixsize_y,
  double epsilon, bool do_wstacking, size_t nthreads,
  const mav<T,2> &dirty, size_t verbosity)
  {
  Baselines baselines(uvw, freq);
  GridderConfig gconf(dirty.shape(0), dirty.shape(1), epsilon, pixsize_x,
    pixsize_y, nthreads);
  auto idx = getWgtIndices(baselines, gconf, wgt);
  auto idx2 = const_mav<uint32_t,1>(idx.data(),{idx.size()});
  x2dirty(gconf, makeMsServ(baselines,idx2,ms,wgt), dirty, do_wstacking,
    verbosity);
  }
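/* Example (illustrative sketch): a minimal end-to-end invocation of
   ms2dirty. All buffers are owned by the caller and wrapped in mav views;
   the sizes and pixel scales are made up for the example.

     using namespace gridder;
     size_t nrow=1000, nchan=8, npix=256;        // npix must be even
     std::vector<double> uvw(nrow*3), freq(nchan);
     std::vector<std::complex<float>> ms(nrow*nchan);
     std::vector<float> dirty(npix*npix);
     // ... fill uvw (meters), freq (Hz), ms ...
     ms2dirty(const_mav<double,2>(uvw.data(), {nrow,3}),
              const_mav<double,1>(freq.data(), {nchan}),
              const_mav<std::complex<float>,2>(ms.data(), {nrow,nchan}),
              detail::nullmav<float,2>(),  // empty view => no weights
              1e-5, 1e-5,                  // pixel sizes (radians)
              1e-5,                        // accuracy epsilon
              true, 4,                     // w-stacking on, 4 threads
              mav<float,2>(dirty.data(), {npix,npix}),
              0);                          // verbosity
*/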
template<typename T> void dirty2ms(const const_mav<double,2> &uvw,
  const const_mav<double,1> &freq, const const_mav<T,2> &dirty,
  const const_mav<T,2> &wgt, double pixsize_x, double pixsize_y,
  double epsilon, bool do_wstacking, size_t nthreads,
  const mav<complex<T>,2> &ms, size_t verbosity)
  {
  Baselines baselines(uvw, freq);
  GridderConfig gconf(dirty.shape(0), dirty.shape(1), epsilon, pixsize_x,
    pixsize_y, nthreads);
  auto idx = getWgtIndices(baselines, gconf, wgt);
  auto idx2 = const_mav<uint32_t,1>(idx.data(),{idx.size()});
  ms.fill(0);
  dirty2x(gconf, dirty, makeMsServ(baselines,idx2,ms,wgt), do_wstacking,
    verbosity);
  }

} // namespace detail

// public names
using detail::mav;
using detail::const_mav;
using detail::Baselines;
using detail::GridderConfig;
using detail::ms2grid_c;
using detail::grid2ms_c;
using detail::vis2grid_c;
using detail::grid2vis_c;
using detail::vis2dirty;
using detail::dirty2vis;
using detail::ms2dirty;
using detail::dirty2ms;
using detail::streamDump__;
using detail::CodeLocation;

} // namespace gridder

#endif