diff --git a/gridder_cxx.h b/gridder_cxx.h index fc312ce1b915ac9d317410f77cd7b564d65f5e56..af8e9047d980eea0dbe5549052a5638c3a5e8f92 100644 --- a/gridder_cxx.h +++ b/gridder_cxx.h @@ -1198,10 +1198,16 @@ template<typename T> void apply_wcorr(const GridderConfig &gconf, } } +struct idxhelper + { + uint32_t idx; + int minplane; + }; + template<typename Serv> void wstack_common( const GridderConfig &gconf, const Serv &srv, double &wmin, double &dw, size_t &nplanes, vector<size_t> &nvis_plane, - vector<int> &minplane, size_t verbosity) + vector<vector<idxhelper>> &minplane, size_t verbosity) { size_t nvis = srv.Nvis(); size_t nthreads = gconf.Nthreads(); @@ -1232,22 +1238,24 @@ template<typename Serv> void wstack_common( if (verbosity>0) cout << "Kernel support: " << supp << endl; if (verbosity>0) cout << "nplanes: " << nplanes << endl; nvis_plane.resize(nplanes,0); - minplane.resize(nvis); -#pragma omp parallel num_threads(nthreads) -{ - vector<size_t> nvploc(nplanes,0); -#pragma omp for + + // FIXME: this could still use some tweaking + int dbunch=supp; + int nbunch=(nplanes+dbunch-1)/dbunch; + minplane.resize(nbunch); + for (auto &pl: minplane) + pl.resize(0); for (size_t ipart=0; ipart<nvis; ++ipart) { int plane0 = int((abs(srv.getCoord(ipart).w)-wmin)/dw+0.5*supp)-int(supp-1); - minplane[ipart]=plane0; + for (int ibunch=0; ibunch<nbunch; ++ibunch) + if ((plane0+dbunch>ibunch*dbunch) && (plane0<(ibunch+1)*dbunch)) + minplane[ibunch].push_back({ipart,plane0}); for (int i=max<int>(0,plane0); i<min<int>(nplanes,plane0+supp); ++i) - ++nvploc[i]; + ++nvis_plane[i]; } -#pragma omp critical - for (size_t i=0; i<nplanes; ++i) - nvis_plane[i]+=nvploc[i]; -} + for(auto &pl:minplane) + pl.shrink_to_fit(); } template<typename T, typename Serv> void x2dirty( @@ -1258,21 +1266,20 @@ template<typename T, typename Serv> void x2dirty( { auto nx_dirty=gconf.Nxdirty(); auto ny_dirty=gconf.Nydirty(); - size_t nvis = srv.Nvis(); auto supp = gconf.Supp(); size_t nthreads = gconf.Nthreads(); double wmin; double dw; size_t nplanes; vector<size_t> nvis_plane; - vector<int> minplane; + vector<vector<idxhelper>> minplane; if (verbosity>0) cout << "Gridding using improved w-stacking" << endl; wstack_common(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane, verbosity); dirty.fill(0); vector<complex<T>> vis_loc_; vector<uint32_t> idx_loc_; - vector<uint32_t> newidx; + vector<uint32_t> mapper; tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()}); auto grid=grid_.getMav(); tmpStorage<complex<T>,2> tdirty_(dirty.shape()); @@ -1286,35 +1293,36 @@ template<typename T, typename Serv> void x2dirty( double wcur = wmin+iw*dw; size_t cnt=0; - newidx.resize(nvis_plane[iw]); + mapper.resize(nvis_plane[iw]); + const auto &bla(minplane[iw/supp]); + for (size_t i=0; i<bla.size(); ++i) + if ((int(iw)>=bla[i].minplane) && (int(iw)<bla[i].minplane+int(supp))) + mapper[cnt++] = bla[i].idx; + idx_loc_.resize(nvis_plane[iw]); if (srv.isMsServ()) { - for (size_t ipart=0; ipart<nvis; ++ipart) - if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp)) - newidx[cnt++] = srv.getIdx(ipart); +#pragma omp parallel for num_threads(nthreads) + for (size_t i=0; i<nvis_plane[iw]; ++i) + idx_loc_[i] = srv.getIdx(mapper[i]); grid.fill(0); - auto newidx2 = const_mav<uint32_t, 1>(newidx.data(), {newidx.size()}); - Serv newsrv(srv, newidx2); + auto idx_loc = const_mav<uint32_t, 1>(idx_loc_.data(), {idx_loc_.size()}); + Serv newsrv(srv, idx_loc); x2grid_c(gconf, newsrv, grid, wcur, dw); } else { - for (size_t ipart=0; ipart<nvis; ++ipart) - if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp)) - newidx[cnt++] = ipart; vis_loc_.resize(nvis_plane[iw]); - auto vis_loc = mav<complex<T>,1>(vis_loc_.data(), {nvis_plane[iw]}); - idx_loc_.resize(nvis_plane[iw]); - auto idx_loc = mav<uint32_t,1>(idx_loc_.data(), {nvis_plane[iw]}); #pragma omp parallel for num_threads(nthreads) for (size_t i=0; i<nvis_plane[iw]; ++i) { - auto ipart = newidx[i]; - vis_loc[i] = srv.getVis(ipart); - idx_loc[i] = srv.getIdx(ipart); + auto ipart = mapper[i]; + vis_loc_[i] = srv.getVis(ipart); + idx_loc_[i] = srv.getIdx(ipart); } grid.fill(0); - vis2grid_c(srv.getBaselines(), gconf, cmav(idx_loc), cmav(vis_loc), grid, + auto vis_loc = const_mav<complex<T>,1>(vis_loc_.data(), {nvis_plane[iw]}); + auto idx_loc = const_mav<uint32_t,1>(idx_loc_.data(), {nvis_plane[iw]}); + vis2grid_c(srv.getBaselines(), gconf, idx_loc, vis_loc, grid, nullmav<T,1>(), wcur, dw); } gconf.grid2dirty_c(cmav(grid), tdirty); @@ -1359,14 +1367,13 @@ template<typename T, typename Serv> void dirty2x( { auto nx_dirty=gconf.Nxdirty(); auto ny_dirty=gconf.Nydirty(); - size_t nvis = srv.Nvis(); auto supp = gconf.Supp(); size_t nthreads = gconf.Nthreads(); double wmin; double dw; size_t nplanes; vector<size_t> nvis_plane; - vector<int> minplane; + vector<vector<idxhelper>> minplane; if (verbosity>0) cout << "Degridding using improved w-stacking" << endl; wstack_common(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane, verbosity); @@ -1380,7 +1387,7 @@ template<typename T, typename Serv> void dirty2x( vector<complex<T>> vis_loc_; vector<uint32_t> idx_loc_; - vector<uint32_t> newidx; + vector<uint32_t> mapper; tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()}); auto grid=grid_.getMav(); tmpStorage<complex<T>,2> tdirty2_({nx_dirty,ny_dirty}); @@ -1400,35 +1407,36 @@ template<typename T, typename Serv> void dirty2x( gconf.apply_wscreen(cmav(tdirty2), tdirty2, wcur, false); gconf.dirty2grid_c(cmav(tdirty2), grid); - newidx.resize(nvis_plane[iw]); size_t cnt=0; + mapper.resize(nvis_plane[iw]); + const auto &bla(minplane[iw/supp]); + for (size_t i=0; i<bla.size(); ++i) + if ((int(iw)>=bla[i].minplane) && (int(iw)<bla[i].minplane+int(supp))) + mapper[cnt++] = bla[i].idx; + idx_loc_.resize(nvis_plane[iw]); if (srv.isMsServ()) { - for (size_t ipart=0; ipart<nvis; ++ipart) - if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp)) - newidx[cnt++] = srv.getIdx(ipart); - auto newidx2 = const_mav<uint32_t, 1>(newidx.data(), {newidx.size()}); - Serv newsrv(srv, newidx2); +#pragma omp parallel for num_threads(nthreads) + for (size_t i=0; i<nvis_plane[iw]; ++i) + idx_loc_[i] = srv.getIdx(mapper[i]); + auto idx_loc = const_mav<uint32_t, 1>(idx_loc_.data(), {idx_loc_.size()}); + Serv newsrv(srv, idx_loc); grid2x_c(gconf, cmav(grid), newsrv, wcur, dw); } else { - for (size_t ipart=0; ipart<nvis; ++ipart) - if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp)) - newidx[cnt++] = ipart; vis_loc_.resize(nvis_plane[iw]); auto vis_loc = mav<complex<T>,1>(vis_loc_.data(), {nvis_plane[iw]}); vis_loc.fill(0); - idx_loc_.resize(nvis_plane[iw]); auto idx_loc = mav<uint32_t,1>(idx_loc_.data(), {nvis_plane[iw]}); #pragma omp parallel for num_threads(nthreads) for (size_t i=0; i<nvis_plane[iw]; ++i) - idx_loc[i] = srv.getIdx(newidx[i]); + idx_loc[i] = srv.getIdx(mapper[i]); grid2vis_c(srv.getBaselines(), gconf, cmav(idx_loc), cmav(grid), vis_loc, nullmav<T,1>(), wcur, dw); #pragma omp parallel for num_threads(nthreads) for (size_t i=0; i<nvis_plane[iw]; ++i) - srv.addVis(newidx[i], vis_loc[i]); + srv.addVis(mapper[i], vis_loc[i]); } } }