From 3d5a6524671120037accb9f54098389af54c0adc Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Fri, 31 Jul 2020 08:59:02 +0200
Subject: [PATCH 01/21] first try

---
 src/ducc0/infra/threading.cc | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/ducc0/infra/threading.cc b/src/ducc0/infra/threading.cc
index 8948fb3..04994f3 100644
--- a/src/ducc0/infra/threading.cc
+++ b/src/ducc0/infra/threading.cc
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include <memory>
 #if __has_include(<pthread.h>)
 #include <pthread.h>
 #endif
@@ -159,6 +160,8 @@ class thread_pool

   ~thread_pool() { shutdown(); }

+  size_t size() const { return threads_.size(); }
+
   void submit(std::function<void()> work)
     {
     work_queue_.push(move(work));
@@ -196,6 +199,27 @@ inline thread_pool &get_pool()
   return pool;
   }

+inline thread_pool &get_pool2(size_t nthreads=0)
+  {
+  static std::unique_ptr<thread_pool> pool(std::make_unique<thread_pool>(1));
+  if ((!pool) || ((nthreads!=0) && (nthreads!=pool->size()))) // resize
+    {
+    pool = std::make_unique<thread_pool>(nthreads);
+    }
+#if __has_include(<pthread.h>)
+  static std::once_flag f;
+  call_once(f,
+    []{
+    pthread_atfork(
+      +[]{ get_pool2().shutdown(); },  // prepare
+      +[]{ get_pool2().restart(); },   // parent
+      +[]{ get_pool2().restart(); }    // child
+      );
+    });
+#endif
+
+  return *pool;
+  }

 class Distribution
   {
@@ -317,7 +341,8 @@ void Distribution::thread_map(std::function<void(Scheduler &)> f)
     return;
     }

-  auto & pool = get_pool();
+  auto & pool = get_pool2(nthreads_);
+//  auto & pool = get_pool();
   latch counter(nthreads_);
   std::exception_ptr ex;
   std::mutex ex_mut;
-- 
GitLab
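Patch 01 makes the worker pool resizable and, more importantly, fork-safe: pthread_atfork() registers handlers that shut the pool down before a fork() and restart it in both parent and child, so Python users of multiprocessing no longer inherit a pool whose worker threads did not survive the fork. Reduced to a standalone sketch; `pool` and `my_pool()` are illustrative stand-ins, not ducc0 API:

#include <iostream>
#include <memory>
#include <mutex>
#include <pthread.h>

struct pool   // stand-in for ducc0's thread_pool; only the lifecycle matters here
  {
  void shutdown() { std::cout << "pool: workers joined\n"; }
  void restart()  { std::cout << "pool: workers restarted\n"; }
  };

pool &my_pool()   // hypothetical accessor, analogous to get_pool2()
  {
  static std::unique_ptr<pool> p = std::make_unique<pool>();
  static std::once_flag once;
  std::call_once(once, []{
    pthread_atfork(
      +[]{ my_pool().shutdown(); },   // prepare: quiesce workers before fork()
      +[]{ my_pool().restart(); },    // parent: bring the workers back
      +[]{ my_pool().restart(); });   // child: fresh threads, no stale state
    });
  return *p;
  }

int main() { my_pool(); }

From fe2d663ffde17e22e83a1551d9719014868764cf Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Fri, 31 Jul 2020 15:38:38 +0200
Subject: [PATCH 02/21] performance tweaks

---
 python/gridder_cxx.h | 135 ++++++++++++++++++++-----------------------
 1 file changed, 62 insertions(+), 73 deletions(-)

diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h
index 9c48bc3..764ff21 100644
--- a/python/gridder_cxx.h
+++ b/python/gridder_cxx.h
@@ -237,15 +237,15 @@ template<typename T> class GridderConfig
     size_t vlim;
     bool uv_side_fast;

-    complex<T> wscreen(T x, T y, T w, bool adjoint) const
+    T phase (T x, T y, T w, bool adjoint) const
       {
       constexpr T pi = T(3.141592653589793238462643383279502884197);
       T tmp = 1-x-y;
       if (tmp<=0) return 1; // no phase factor beyond the horizon
       T nm1 = (-x-y)/(sqrt(tmp)+1); // more accurate form of sqrt(1-x-y)-1
-      T phase = 2*pi*w*nm1;
-      if (adjoint) phase *= -1;
-      return complex<T>(cos(phase), sin(phase));
+      T phs = 2*pi*w*nm1;
+      if (adjoint) phs *= -1;
+      return phs;
       }

   public:
@@ -305,8 +305,6 @@ template<typename T> class GridderConfig
         if (i2>=nu) i2-=nu;
         size_t j2 = nv-ny_dirty/2+j;
         if (j2>=nv) j2-=nv;
-        // FIXME: for some reason g++ warns about double-to-float conversion
-        // here, even though there is an explicit cast...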
dirty.v(i,j) = tmav(i2,j2)*T(cfu[icfu]*cfv[icfv]); } } @@ -322,33 +320,45 @@ template class GridderConfig y0 = -0.5*ny_dirty*psy; execStatic(nx_dirty/2+1, nthreads, 0, [&](Scheduler &sched) { + using vtype = native_simd; + constexpr size_t vlen=vtype::size(); + size_t nvec = (ny_dirty/2+1+(vlen-1))/vlen; + vector ph(nvec), sp(nvec), cp(nvec); while (auto rng=sched.getNext()) for(auto i=rng.lo; i=nu) ix-=nu; + size_t i2 = nx_dirty-i; + size_t ix2 = nu-nx_dirty/2+i2; + if (ix2>=nu) ix2-=nu; for (size_t j=0; j<=ny_dirty/2; ++j) { T fy = T(y0+j*psy); - auto ws = wscreen(fx, fy*fy, w, true); - size_t ix = nu-nx_dirty/2+i; - if (ix>=nu) ix-=nu; - size_t jx = nv-ny_dirty/2+j; - if (jx>=nv) jx-=nv; - dirty.v(i,j) += (tmav(ix,jx)*ws).real(); // lower left - size_t i2 = nx_dirty-i, j2 = ny_dirty-j; - size_t ix2 = nu-nx_dirty/2+i2; - if (ix2>=nu) ix2-=nu; - size_t jx2 = nv-ny_dirty/2+j2; - if (jx2>=nv) jx2-=nv; - if ((i>0)&&(i0)&&(i=nv)? jx+1-nv : jx+1) { + size_t j2 = min(j, ny_dirty-j); + auto ws = complex(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]); + dirty.v(i,j) += (tmav(ix,jx)*ws).real(); // lower left dirty.v(i2,j) += (tmav(ix2,jx)*ws).real(); // lower right - if ((j>0)&&(j0)&&(j=nv)? jx+1-nv : jx+1) + { + size_t j2 = min(j, ny_dirty-j); + auto ws = complex(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]); + dirty.v(i,j) += (tmav(ix,jx)*ws).real(); // lower left + } } }); } @@ -420,33 +430,45 @@ template class GridderConfig y0 = -0.5*ny_dirty*psy; execStatic(nx_dirty/2+1, nthreads, 0, [&](Scheduler &sched) { + using vtype = native_simd; + constexpr size_t vlen=vtype::size(); + size_t nvec = (ny_dirty/2+1+(vlen-1))/vlen; + vector ph(nvec), sp(nvec), cp(nvec); while (auto rng=sched.getNext()) for(auto i=rng.lo; i=nu) ix-=nu; + size_t i2 = nx_dirty-i; + size_t ix2 = nu-nx_dirty/2+i2; + if (ix2>=nu) ix2-=nu; for (size_t j=0; j<=ny_dirty/2; ++j) { T fy = T(y0+j*psy); - auto ws = wscreen(fx, fy*fy, w, false); - size_t ix = nu-nx_dirty/2+i; - if (ix>=nu) ix-=nu; - size_t jx = nv-ny_dirty/2+j; - if (jx>=nv) jx-=nv; - grid.v(ix,jx) = dirty(i,j)*ws; // lower left - size_t i2 = nx_dirty-i, j2 = ny_dirty-j; - size_t ix2 = nu-nx_dirty/2+i2; - if (ix2>=nu) ix2-=nu; - size_t jx2 = nv-ny_dirty/2+j2; - if (jx2>=nv) jx2-=nv; - if ((i>0)&&(i0)&&(i=nv)? jx+1-nv : jx+1) { + size_t j2 = min(j, ny_dirty-j); + auto ws = complex(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]); + grid.v(ix,jx) = dirty(i,j)*ws; // lower left grid.v(ix2,jx) = dirty(i2,j)*ws; // lower right - if ((j>0)&&(j0)&&(j=nv)? 
jx+1-nv : jx+1) + { + size_t j2 = min(j, ny_dirty-j); + auto ws = complex(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]); + grid.v(ix,jx) = dirty(i,j)*ws; // lower left + } } }); } @@ -485,39 +507,6 @@ template class GridderConfig v=fmod1(v_in*psy)*nv; iv0 = min(int(v+vshift)-int(nv), maxiv0); } - - void apply_wscreen(const mav,2> &dirty, - mav,2> &dirty2, double w, bool adjoint) const - { - checkShape(dirty.shape(), {nx_dirty, ny_dirty}); - checkShape(dirty2.shape(), {nx_dirty, ny_dirty}); - - double x0 = -0.5*nx_dirty*psx, - y0 = -0.5*ny_dirty*psy; - execStatic(nx_dirty/2+1, nthreads, 0, [&](Scheduler &sched) - { - while (auto rng=sched.getNext()) for(auto i=rng.lo; i0)&&(i0)&&(j0)&&(j Date: Fri, 31 Jul 2020 16:39:13 +0200 Subject: [PATCH 03/21] tiny savings --- python/gridder_cxx.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index 764ff21..7943ba1 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -348,16 +348,16 @@ template class GridderConfig for (size_t j=0, jx=nv-ny_dirty/2; j=nv)? jx+1-nv : jx+1) { size_t j2 = min(j, ny_dirty-j); - auto ws = complex(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]); - dirty.v(i,j) += (tmav(ix,jx)*ws).real(); // lower left - dirty.v(i2,j) += (tmav(ix2,jx)*ws).real(); // lower right + T re = cp[j2/vlen][j2%vlen], im = sp[j2/vlen][j2%vlen]; + dirty.v(i,j) += tmav(ix,jx).real()*re - tmav(ix,jx).imag()*im; + dirty.v(i2,j) += tmav(ix2,jx).real()*re - tmav(ix2,jx).imag()*im; } else for (size_t j=0, jx=nv-ny_dirty/2; j=nv)? jx+1-nv : jx+1) { size_t j2 = min(j, ny_dirty-j); - auto ws = complex(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]); - dirty.v(i,j) += (tmav(ix,jx)*ws).real(); // lower left + T re = cp[j2/vlen][j2%vlen], im = sp[j2/vlen][j2%vlen]; + dirty.v(i,j) += tmav(ix,jx).real()*re - tmav(ix,jx).imag()*im; // lower left } } }); -- GitLab From cc57eeb27b3909f9c110d6c6a2ca0acaa2f3cec4 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 31 Jul 2020 19:40:01 +0200 Subject: [PATCH 04/21] improve scaling --- python/gridder_cxx.h | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index 7943ba1..5155b1a 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -931,22 +931,23 @@ template class WgridHelper } #else // more efficient: precalculate final vector sizes and avoid reallocations + vector p0(nvis); + execStatic(nvis, nthreads, 0, [&](Scheduler &sched) + { + while (auto rng=sched.getNext()) for(auto i=rng.lo; i cnt(nplanes,0); for(size_t ipart=0; ipart ofs(nplanes, 0); for (size_t ipart=0; ipart vector getWgtIndices(const Baselines &baselines, { size_t nrow=baselines.Nrows(), nchan=baselines.Nchannels(), - nsafe=gconf.Nsafe(); + nsafe=gconf.Nsafe(), + nthreads=gconf.Nthreads(); bool have_wgt=wgt.size()!=0; if (have_wgt) checkShape(wgt.shape(),{nrow,nchan}); bool have_ms=ms.size()!=0; @@ -1074,10 +1076,13 @@ template vector getWgtIndices(const Baselines &baselines, vector acc(nbu*nbv+1,0); vector tmp(nrow*nchan); - for (idx_t irow=0, idx=0; irow vector getWgtIndices(const Baselines &baselines, gconf.getpix(uvw.u, uvw.v, u, v, iu0, iv0); iu0 = (iu0+nsafe)>>logsquare; iv0 = (iv0+nsafe)>>logsquare; - ++acc[nbv*iu0 + iv0 + 1]; tmp[idx] = nbv*iu0 + iv0; } else tmp[idx] = ~idx_t(0); + } + }); + for (idx_t idx=0; idx Date: Sat, 1 Aug 2020 11:14:32 +0200 Subject: [PATCH 05/21] reduce single-threaded sections --- python/gridder_cxx.h | 35 ++++++++++++++++++++++------------- 1 
file changed, 22 insertions(+), 13 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index 5155b1a..3e12046 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -1073,11 +1073,12 @@ template vector getWgtIndices(const Baselines &baselines, constexpr int side=1<> logsquare, nbv = (gconf.Nv()+1+side-1) >> logsquare; - vector acc(nbu*nbv+1,0); + mav acc({nthreads, (nbu*nbv+16)}); // the 16 is safety distance to avoid false sharing vector tmp(nrow*nchan); execStatic(nrow, nthreads, 0, [&](Scheduler &sched) { + idx_t tid = sched.thread_num(); while (auto rng=sched.getNext()) for(auto irow=idx_t(rng.lo); irow vector getWgtIndices(const Baselines &baselines, iu0 = (iu0+nsafe)>>logsquare; iv0 = (iv0+nsafe)>>logsquare; tmp[idx] = nbv*iu0 + iv0; + ++acc.v(tid, tmp[idx]); } else tmp[idx] = ~idx_t(0); } }); - for (idx_t idx=0; idx res(acc.back()); - for (size_t irow=0, idx=0; irow res(offset); + execStatic(nrow, nthreads, 0, [&](Scheduler &sched) + { + idx_t tid = sched.thread_num(); + while (auto rng=sched.getNext()) for(auto irow=idx_t(rng.lo); irow Date: Sat, 1 Aug 2020 14:30:21 +0200 Subject: [PATCH 06/21] more scaling --- python/gridder_cxx.h | 85 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index 3e12046..0ef23a5 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -844,12 +844,21 @@ template void apply_global_corrections(const GridderConfig &gconf }); } +auto calc_share(size_t nshares, size_t myshare, size_t nwork) + { + size_t nbase = nwork/nshares; + size_t additional = nwork%nshares; + size_t lo = myshare*nbase + ((myshare class WgridHelper { private: Serv &srv; double wmin, dw; - size_t nplanes, supp; + size_t nplanes, supp, nthreads; vector> minplane; size_t verbosity; @@ -872,11 +881,12 @@ template class WgridHelper } template static void update_idx(vector &v, const vector &add, - const vector &del) + const vector &del, size_t nthreads) { MR_assert(v.size()>=del.size(), "must not happen"); vector res; res.reserve((v.size()+add.size())-del.size()); +#if 0 auto iin=v.begin(), ein=v.end(); auto iadd=add.begin(), eadd=add.end(); auto irem=del.begin(), erem=del.end(); @@ -893,16 +903,51 @@ template class WgridHelper MR_assert(irem==erem, "must not happen"); while(iadd!=eadd) res.push_back(*(iadd++)); +#else + if (v.empty()) //special case + { + MR_assert(del.empty(), "must not happen"); + for (auto x: add) + res.push_back(x); + } + else + { + res.resize((v.size()+add.size())-del.size()); + execParallel(nthreads, [&](Scheduler &sched) { + auto tid = sched.thread_num(); + auto [lo, hi] = calc_share(nthreads, tid, v.size()); + if (lo==hi) return; // if interval is empty, do nothing + auto iin=v.begin()+lo, ein=v.begin()+hi; + auto iadd = (iin==v.begin()) ? add.begin() : lower_bound(add.begin(), add.end(), *iin); + auto eadd = (ein==v.end()) ? add.end() : lower_bound(add.begin(), add.end(), *ein); + auto irem = (iin==v.begin()) ? del.begin() : lower_bound(del.begin(), del.end(), *iin); + auto erem = (ein==v.end()) ? 
del.end() : lower_bound(del.begin(), del.end(), *ein);
+        auto iout = res.begin()+lo-(irem-del.begin())+(iadd-add.begin());
+        while(iin!=ein)
+          {
+          if ((irem!=erem) && (*iin==*irem))
+            { ++irem; ++iin; } // skip removed entry
+          else if ((iadd!=eadd) && (*iadd<*iin))
+            *(iout++) = *(iadd++); // add new entry
+          else
+            *(iout++) = *(iin++);
+          }
+        MR_assert(irem==erem, "must not happen");
+        while(iadd!=eadd)
+          *(iout++) = *(iadd++);
+        });
+      }
+#endif
     MR_assert(res.size()==(v.size()+add.size())-del.size(), "must not happen");
     v.swap(res);
     }

   public:
     WgridHelper(const GridderConfig &gconf, Serv &srv_, size_t verbosity_)
-      : srv(srv_), supp(gconf.Supp()), verbosity(verbosity_), curplane(-1)
+      : srv(srv_), supp(gconf.Supp()), nthreads(gconf.Nthreads()),
+        verbosity(verbosity_), curplane(-1)
       {
       size_t nvis = srv.Nvis();
-      size_t nthreads = gconf.Nthreads();
       double wmax;

       wminmax(srv, wmin, wmax);
@@ -932,22 +977,36 @@ template<typename Serv> class WgridHelper
 #else
       // more efficient: precalculate final vector sizes and avoid reallocations
       vector p0(nvis);
+      mav cnt({nthreads, nplanes+16}); // safety distance against false sharing
       execStatic(nvis, nthreads, 0, [&](Scheduler &sched)
         {
+        auto tid=sched.thread_num();
        while (auto rng=sched.getNext()) for(auto i=rng.lo; i cnt(nplanes,0);
-      for(size_t ipart=0; ipart ofs(nplanes, 0);
-      for (size_t ipart=0; ipart class WgridHelper
     bool advance()
       {
       if (++curplane>=int(nplanes)) return false;
-      update_idx(subidx, minplane[curplane], curplane>=int(supp) ? minplane[curplane-supp] : vector<idx_t>());
+      update_idx(subidx, minplane[curplane], curplane>=int(supp) ? minplane[curplane-supp] : vector<idx_t>(), nthreads);
       if (verbosity>1)
         cout << "Working on plane " << curplane << " containing " << subidx.size()
              << " visibilities" << endl;
-- 
GitLab
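The binning step above is the standard two-pass parallel counting pattern: every thread histograms its contiguous share of visibilities into its own padded row of cnt (the +16 keeps rows on distinct cache lines), a cheap serial scan converts the per-thread counts into start offsets, and a second parallel pass scatters each index into its final slot without any atomics or reallocations. A compact, hedged restatement of the pattern; the serial per-slice loops stand in for the execStatic regions and all names are illustrative:

#include <cstddef>
#include <vector>

// Two-pass parallel binning: count, exclusive-scan, scatter.
// key[i] is the target bin of item i; each of the nthreads slices is a
// contiguous block of items (shown serially here, one loop per slice).
std::vector<size_t> bin_indices(const std::vector<size_t> &key, size_t nbins,
                                size_t nthreads)
  {
  std::vector<std::vector<size_t>> cnt(nthreads, std::vector<size_t>(nbins, 0));
  size_t n = key.size();
  for (size_t t = 0; t < nthreads; ++t)            // pass 1: per-thread histograms
    for (size_t i = t*n/nthreads; i < (t+1)*n/nthreads; ++i)
      ++cnt[t][key[i]];
  size_t ofs = 0;                                  // serial exclusive scan
  for (size_t b = 0; b < nbins; ++b)
    for (size_t t = 0; t < nthreads; ++t)
      { size_t c = cnt[t][b]; cnt[t][b] = ofs; ofs += c; }
  std::vector<size_t> out(n);
  for (size_t t = 0; t < nthreads; ++t)            // pass 2: scatter to final slots
    for (size_t i = t*n/nthreads; i < (t+1)*n/nthreads; ++i)
      out[cnt[t][key[i]]++] = i;
  return out;
  }

-- 
GitLab

From 0f626efd67cf8f1d6538c41be6e4dd70f1f26fcc Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Sat, 1 Aug 2020 20:54:32 +0200
Subject: [PATCH 07/21] not yet beautiful but already quite fast

---
 python/gridder_cxx.h | 369 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 330 insertions(+), 39 deletions(-)

diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h
index 0ef23a5..ce08b0d 100644
--- a/python/gridder_cxx.h
+++ b/python/gridder_cxx.h
@@ -511,19 +511,20 @@ template<typename T> class GridderConfig

 constexpr int logsquare=4;

-template<typename T, typename T2=complex<T>> class Helper
+template<typename T> class Helper
   {
   private:
+    using T2=complex<T>;
     const GridderConfig<T> &gconf;
     int nu, nv, nsafe, supp;
     const T2 *grid_r;
     T2 *grid_w;
-    int su, sv;
+    int su, sv, svvec;
     int iu0, iv0; // start index of the current visibility
     int bu0, bv0; // start index of the current buffer

     T wfac;

-    vector<T2> rbuf, wbuf;
+    mav<T,2> rbufr, rbufi, wbufr, wbufi;
     bool do_w_gridding;
     double w0, xdw;
     vector<std::mutex> &locks;
@@ -541,7 +542,7 @@ template<typename T, typename T2=complex<T>> class Helper
       std::lock_guard<std::mutex> lock(locks[idxu]);
       for (int iv=0; iv(wbufr(iu,iv), wbufi(iu,iv));
         if (++idxv>=nv) idxv=0;
         }
       }
@@ -558,7 +559,8 @@ template<typename T, typename T2=complex<T>> class Helper
       int idxv = idxv0;
       for (int iv=0; iv=nv) idxv=0;
         }
       if (++idxu>=nu) idxu=0;
@@ -567,8 +569,8 @@ template<typename T, typename T2=complex<T>> class Helper
   public:
     size_t nvec;
-    const T2 *p0r;
-    T2 *p0w;
+    const T *p0rr, *p0ri;
+    T *p0wr, *p0wi;
    static constexpr size_t vlen=native_simd<T>::size();
     union kbuf {
       T scalar[64];
@@ -580,10 +582,15 @@ template<typename T, typename T2=complex<T>> class Helper
        vector<std::mutex> &locks_, double w0_=-1, double dw_=-1)
      : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
        supp(gconf.Supp()), grid_r(grid_r_),
-       grid_w(grid_w_), su(2*nsafe+(1<0), w0(w0_), xdw(T(1)/dw_),
@@ -594,7 +601,7 @@ template<typename T, typename T2=complex<T>> class Helper
       }
     ~Helper() { if (grid_w) dump(); }

-    int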
lineJump() const { return sv; } + int lineJump() const { return svvec; } T Wfac() const { return wfac; } void prep(const UVW &in) { @@ -610,13 +617,20 @@ template> class Helper wfac = krn.eval_single(T(xdw*xsupp*abs(w0-in.w))); if ((iu0bu0+su) || (iv0+supp>bv0+sv)) { - if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); } + if (grid_w) + { + dump(); + wbufr.apply([](T &v){v=0;}); + wbufi.apply([](T &v){v=0;}); + } bu0=((((iu0+nsafe)>>logsquare)<>logsquare)< void x2grid_c size_t nthreads = gconf.Nthreads(); bool do_w_gridding = dw>0; vector locks(gconf.Nu()); - + constexpr size_t vlen=native_simd::size(); + size_t nvec((supp+vlen-1)/vlen); size_t np = srv.Nvis(); + if (nvec==1) + execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) + { + Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); + for (size_t cv=0; cv<1; ++cv) + { + auto tr = native_simd::loadu(ptrr+cv*hlp.vlen); + tr += tmp.real()*kv[cv]; + tr.storeu(ptrr+cv*hlp.vlen); + auto ti = native_simd::loadu(ptri+cv*hlp.vlen); + ti += tmp.imag()*kv[cv]; + ti.storeu(ptri+cv*hlp.vlen); + } + ptrr+=jump; + ptri+=jump; + } + } + }); + else if (nvec==2) + execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) + { + Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); + for (size_t cv=0; cv<2; ++cv) + { + auto tr = native_simd::loadu(ptrr+cv*hlp.vlen); + tr += tmp.real()*kv[cv]; + tr.storeu(ptrr+cv*hlp.vlen); + auto ti = native_simd::loadu(ptri+cv*hlp.vlen); + ti += tmp.imag()*kv[cv]; + ti.storeu(ptri+cv*hlp.vlen); + } + ptrr+=jump; + ptri+=jump; + } + } + }); + else if (nvec==3) + execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) + { + Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); + for (size_t cv=0; cv<3; ++cv) + { + auto tr = native_simd::loadu(ptrr+cv*hlp.vlen); + tr += tmp.real()*kv[cv]; + tr.storeu(ptrr+cv*hlp.vlen); + auto ti = native_simd::loadu(ptri+cv*hlp.vlen); + ti += tmp.imag()*kv[cv]; + ti.storeu(ptri+cv*hlp.vlen); + } + ptrr+=jump; + ptri+=jump; + } + } + }); + else if (nvec==4) + execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) + { + Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); + for (size_t cv=0; cv<4; ++cv) + { + auto tr = native_simd::loadu(ptrr+cv*hlp.vlen); + tr += tmp.real()*kv[cv]; + tr.storeu(ptrr+cv*hlp.vlen); + auto ti = native_simd::loadu(ptri+cv*hlp.vlen); + ti += tmp.imag()*kv[cv]; + ti.storeu(ptri+cv*hlp.vlen); + } + ptrr+=jump; + ptri+=jump; + } + } + }); + else execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) { Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); int jump = hlp.lineJump(); const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const T * DUCC0_RESTRICT kv = 
hlp.buf.scalar+hlp.vlen*hlp.nvec; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); - size_t cv=0; - for (; cv+3::loadu(ptrr+cv*hlp.vlen); + tr += tmp.real()*kv[cv]; + tr.storeu(ptrr+cv*hlp.vlen); + auto ti = native_simd::loadu(ptri+cv*hlp.vlen); + ti += tmp.imag()*kv[cv]; + ti.storeu(ptri+cv*hlp.vlen); } - for (; cv void grid2x_c size_t nthreads = gconf.Nthreads(); bool do_w_gridding = dw>0; vector locks(gconf.Nu()); + constexpr size_t vlen=native_simd::size(); + size_t nvec((supp+vlen-1)/vlen); // Loop over sampling points size_t np = srv.Nvis(); + if (nvec==1) execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) { Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); int jump = hlp.lineJump(); const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const T * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart void grid2x_c auto flip = coord.FixW(); hlp.prep(coord); complex r = 0; - const auto * DUCC0_RESTRICT ptr = hlp.p0r; + const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; + const auto * DUCC0_RESTRICT ptri = hlp.p0ri; for (size_t cu=0; cu tmp(0); - size_t cv=0; - for (; cv+3 tmpr(0), tmpi(0); + for (size_t cv=0; cv<1; ++cv) + { + tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); + tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); + } + r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ptrr += jump; + ptri += jump; + } + if (flip) r=conj(r); + if (do_w_gridding) r*=hlp.Wfac(); + srv.addVis(ipart, r); + } + }); + else if (nvec==2) + execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) + { + Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart r = 0; + const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; + const auto * DUCC0_RESTRICT ptri = hlp.p0ri; + for (size_t cu=0; cu tmpr(0), tmpi(0); + for (size_t cv=0; cv<2; ++cv) + { + tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); + tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); + } + r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ptrr += jump; + ptri += jump; + } + if (flip) r=conj(r); + if (do_w_gridding) r*=hlp.Wfac(); + srv.addVis(ipart, r); + } + }); + else if (nvec==3) + execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) + { + Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart r = 0; + const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; + const auto * DUCC0_RESTRICT ptri = hlp.p0ri; + for (size_t cu=0; cu tmpr(0), tmpi(0); + for (size_t cv=0; cv<3; ++cv) + { + tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); + tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); + } + r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ptrr += jump; + ptri += jump; + } + if (flip) r=conj(r); + if (do_w_gridding) r*=hlp.Wfac(); + srv.addVis(ipart, r); + } + }); + else if (nvec==4) + execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) + { + Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); + int jump = hlp.lineJump(); + const T * 
DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart r = 0; + const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; + const auto * DUCC0_RESTRICT ptri = hlp.p0ri; + for (size_t cu=0; cu tmpr(0), tmpi(0); + for (size_t cv=0; cv<4; ++cv) + { + tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); + tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); + } + r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ptrr += jump; + ptri += jump; + } + if (flip) r=conj(r); + if (do_w_gridding) r*=hlp.Wfac(); + srv.addVis(ipart, r); + } + }); + else + execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) + { + Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); + int jump = hlp.lineJump(); + const T * DUCC0_RESTRICT ku = hlp.buf.scalar; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + + while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart r = 0; + const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; + const auto * DUCC0_RESTRICT ptri = hlp.p0ri; + for (size_t cu=0; cu tmpr(0), tmpi(0); + for (size_t cv=0; cv::loadu(ptrr+hlp.vlen*cv); + tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); + } + r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ptrr += jump; + ptri += jump; } if (flip) r=conj(r); if (do_w_gridding) r*=hlp.Wfac(); -- GitLab From 35aadb76aee60415b39c0ea090bccbbc425726d8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 1 Aug 2020 21:38:11 +0200 Subject: [PATCH 08/21] even faster --- python/gridder_cxx.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index ce08b0d..3d5575a 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -924,7 +924,7 @@ template void grid2x_c UVW coord = srv.getCoord(ipart); auto flip = coord.FixW(); hlp.prep(coord); - complex r = 0; + native_simd rr=0, ri=0; const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; const auto * DUCC0_RESTRICT ptri = hlp.p0ri; for (size_t cu=0; cu void grid2x_c tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } - r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + rr += ku[cu]*tmpr; + ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); ptrr += jump; ptri += jump; } + auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); if (flip) r=conj(r); if (do_w_gridding) r*=hlp.Wfac(); srv.addVis(ipart, r); @@ -957,7 +959,7 @@ template void grid2x_c UVW coord = srv.getCoord(ipart); auto flip = coord.FixW(); hlp.prep(coord); - complex r = 0; + native_simd rr=0, ri=0; const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; const auto * DUCC0_RESTRICT ptri = hlp.p0ri; for (size_t cu=0; cu void grid2x_c tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } - r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + rr += ku[cu]*tmpr; + ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); ptrr += jump; ptri += jump; } + auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); if (flip) r=conj(r); if (do_w_gridding) r*=hlp.Wfac(); srv.addVis(ipart, r); @@ -990,7 +994,7 @@ template void grid2x_c UVW coord = srv.getCoord(ipart); auto flip = coord.FixW(); hlp.prep(coord); - complex r = 0; + native_simd rr=0, ri=0; const auto * 
DUCC0_RESTRICT ptrr = hlp.p0rr; const auto * DUCC0_RESTRICT ptri = hlp.p0ri; for (size_t cu=0; cu void grid2x_c tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } - r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + rr += ku[cu]*tmpr; + ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); ptrr += jump; ptri += jump; } + auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); if (flip) r=conj(r); if (do_w_gridding) r*=hlp.Wfac(); srv.addVis(ipart, r); @@ -1023,7 +1029,7 @@ template void grid2x_c UVW coord = srv.getCoord(ipart); auto flip = coord.FixW(); hlp.prep(coord); - complex r = 0; + native_simd rr=0, ri=0; const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; const auto * DUCC0_RESTRICT ptri = hlp.p0ri; for (size_t cu=0; cu void grid2x_c tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } - r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + rr += ku[cu]*tmpr; + ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); ptrr += jump; ptri += jump; } + auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); if (flip) r=conj(r); if (do_w_gridding) r*=hlp.Wfac(); srv.addVis(ipart, r); @@ -1056,7 +1064,7 @@ template void grid2x_c UVW coord = srv.getCoord(ipart); auto flip = coord.FixW(); hlp.prep(coord); - complex r = 0; + native_simd rr=0, ri=0; const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; const auto * DUCC0_RESTRICT ptri = hlp.p0ri; for (size_t cu=0; cu void grid2x_c tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } - r += ku[cu]*complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + rr += ku[cu]*tmpr; + ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); ptrr += jump; ptri += jump; } + auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); if (flip) r=conj(r); if (do_w_gridding) r*=hlp.Wfac(); srv.addVis(ipart, r); -- GitLab From 2d5ea51965133f222cbc6dcdea8417b60d1e8300 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sun, 2 Aug 2020 11:02:17 +0200 Subject: [PATCH 09/21] shorter --- python/gridder_cxx.h | 293 ++++++++++--------------------------------- 1 file changed, 69 insertions(+), 224 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index 3d5575a..ccba0bc 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -706,26 +706,22 @@ template MsServ makeMsServ const mav &idx, T2 &ms, const mav &wgt) { return MsServ(baselines, idx, ms, wgt); } -template void x2grid_c + +template void x2grid_c_helper (const GridderConfig &gconf, Serv &srv, mav,2> &grid, double w0=-1, double dw=-1) { - checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()}); - MR_assert(grid.contiguous(), "grid is not contiguous"); size_t supp = gconf.Supp(); size_t nthreads = gconf.Nthreads(); bool do_w_gridding = dw>0; vector locks(gconf.Nu()); - constexpr size_t vlen=native_simd::size(); - size_t nvec((supp+vlen-1)/vlen); size_t np = srv.Nvis(); - if (nvec==1) execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) { Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); int jump = hlp.lineJump(); const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC; while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart void 
x2grid_c for (size_t cu=0; cu tmp(v*ku[cu]); - for (size_t cv=0; cv<1; ++cv) + for (size_t cv=0; cv::loadu(ptrr+cv*hlp.vlen); tr += tmp.real()*kv[cv]; @@ -754,42 +750,17 @@ template void x2grid_c } } }); - else if (nvec==2) - execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) - { - Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + } - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); - for (size_t cv=0; cv<2; ++cv) - { - auto tr = native_simd::loadu(ptrr+cv*hlp.vlen); - tr += tmp.real()*kv[cv]; - tr.storeu(ptrr+cv*hlp.vlen); - auto ti = native_simd::loadu(ptri+cv*hlp.vlen); - ti += tmp.imag()*kv[cv]; - ti.storeu(ptri+cv*hlp.vlen); - } - ptrr+=jump; - ptri+=jump; - } - } - }); - else if (nvec==3) +template void x2grid_c_helper_general + (const GridderConfig &gconf, Serv &srv, mav,2> &grid, + double w0=-1, double dw=-1) + { + size_t supp = gconf.Supp(); + size_t nthreads = gconf.Nthreads(); + bool do_w_gridding = dw>0; + vector locks(gconf.Nu()); + size_t np = srv.Nvis(); execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) { Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); @@ -810,7 +781,7 @@ template void x2grid_c for (size_t cu=0; cu tmp(v*ku[cu]); - for (size_t cv=0; cv<3; ++cv) + for (size_t cv=0; cv::loadu(ptrr+cv*hlp.vlen); tr += tmp.real()*kv[cv]; @@ -824,135 +795,46 @@ template void x2grid_c } } }); - else if (nvec==4) - execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) - { - Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + } - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); - for (size_t cv=0; cv<4; ++cv) - { - auto tr = native_simd::loadu(ptrr+cv*hlp.vlen); - tr += tmp.real()*kv[cv]; - tr.storeu(ptrr+cv*hlp.vlen); - auto ti = native_simd::loadu(ptri+cv*hlp.vlen); - ti += tmp.imag()*kv[cv]; - ti.storeu(ptri+cv*hlp.vlen); - } - ptrr+=jump; - ptri+=jump; - } - } - }); - else - execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) - { - Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; +template void x2grid_c + (const GridderConfig &gconf, Serv &srv, mav,2> &grid, + double w0=-1, double dw=-1) + { + checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()}); + MR_assert(grid.contiguous(), "grid is not contiguous"); + constexpr size_t vlen=native_simd::size(); + size_t nvec((gconf.Supp()+vlen-1)/vlen); - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); - for (size_t cv=0; cv::loadu(ptrr+cv*hlp.vlen); - tr += tmp.real()*kv[cv]; - tr.storeu(ptrr+cv*hlp.vlen); - auto ti = native_simd::loadu(ptri+cv*hlp.vlen); - ti += tmp.imag()*kv[cv]; - ti.storeu(ptri+cv*hlp.vlen); - } - ptrr+=jump; - ptri+=jump; - } - } - }); + if (nvec==1) + x2grid_c_helper<1>(gconf, srv, grid, w0, dw); + else if (nvec==2) + x2grid_c_helper<2>(gconf, srv, grid, w0, dw); + else if (nvec==3) + x2grid_c_helper<3>(gconf, srv, grid, w0, dw); + else if (nvec==4) + x2grid_c_helper<4>(gconf, srv, grid, w0, dw); + else + x2grid_c_helper_general(gconf, srv, grid, w0, dw); } -template void grid2x_c +template void grid2x_c_helper (const GridderConfig &gconf, const mav,2> &grid, Serv &srv, double w0=-1, 
double dw=-1) { - checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()}); - MR_assert(grid.contiguous(), "grid is not contiguous"); size_t supp = gconf.Supp(); size_t nthreads = gconf.Nthreads(); bool do_w_gridding = dw>0; vector locks(gconf.Nu()); - constexpr size_t vlen=native_simd::size(); - size_t nvec((supp+vlen-1)/vlen); // Loop over sampling points size_t np = srv.Nvis(); - if (nvec==1) - execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) - { - Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; - - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart rr=0, ri=0; - const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; - const auto * DUCC0_RESTRICT ptri = hlp.p0ri; - for (size_t cu=0; cu tmpr(0), tmpi(0); - for (size_t cv=0; cv<1; ++cv) - { - tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); - tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); - } - rr += ku[cu]*tmpr; - ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); - ptrr += jump; - ptri += jump; - } - auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); - if (flip) r=conj(r); - if (do_w_gridding) r*=hlp.Wfac(); - srv.addVis(ipart, r); - } - }); - else if (nvec==2) execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) { Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); int jump = hlp.lineJump(); const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC; while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart void grid2x_c for (size_t cu=0; cu tmpr(0), tmpi(0); - for (size_t cv=0; cv<2; ++cv) + for (size_t cv=0; cv::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); @@ -981,77 +863,19 @@ template void grid2x_c srv.addVis(ipart, r); } }); - else if (nvec==3) - execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) - { - Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + } - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart rr=0, ri=0; - const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; - const auto * DUCC0_RESTRICT ptri = hlp.p0ri; - for (size_t cu=0; cu tmpr(0), tmpi(0); - for (size_t cv=0; cv<3; ++cv) - { - tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); - tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); - } - rr += ku[cu]*tmpr; - ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); - ptrr += jump; - ptri += jump; - } - auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); - if (flip) r=conj(r); - if (do_w_gridding) r*=hlp.Wfac(); - srv.addVis(ipart, r); - } - }); - else if (nvec==4) - execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) - { - Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; +template void grid2x_c_helper_general + (const GridderConfig &gconf, const mav,2> &grid, + Serv &srv, double w0=-1, double dw=-1) + { + size_t supp = gconf.Supp(); + size_t nthreads = gconf.Nthreads(); + bool do_w_gridding = dw>0; + vector locks(gconf.Nu()); - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart rr=0, ri=0; - const auto * 
DUCC0_RESTRICT ptrr = hlp.p0rr; - const auto * DUCC0_RESTRICT ptri = hlp.p0ri; - for (size_t cu=0; cu tmpr(0), tmpi(0); - for (size_t cv=0; cv<4; ++cv) - { - tmpr += kv[cv]*native_simd::loadu(ptrr+hlp.vlen*cv); - tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); - } - rr += ku[cu]*tmpr; - ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); - ptrr += jump; - ptri += jump; - } - auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); - if (flip) r=conj(r); - if (do_w_gridding) r*=hlp.Wfac(); - srv.addVis(ipart, r); - } - }); - else + // Loop over sampling points + size_t np = srv.Nvis(); execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) { Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); @@ -1088,6 +912,27 @@ template void grid2x_c }); } +template void grid2x_c + (const GridderConfig &gconf, const mav,2> &grid, + Serv &srv, double w0=-1, double dw=-1) + { + checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()}); + MR_assert(grid.contiguous(), "grid is not contiguous"); + constexpr size_t vlen=native_simd::size(); + size_t nvec((gconf.Supp()+vlen-1)/vlen); + + if (nvec==1) + grid2x_c_helper<1>(gconf, grid, srv, w0, dw); + else if (nvec==2) + grid2x_c_helper<2>(gconf, grid, srv, w0, dw); + else if (nvec==3) + grid2x_c_helper<3>(gconf, grid, srv, w0, dw); + else if (nvec==4) + grid2x_c_helper<4>(gconf, grid, srv, w0, dw); + else + grid2x_c_helper_general(gconf, grid, srv, w0, dw); + } + template void apply_global_corrections(const GridderConfig &gconf, mav &dirty, double dw, bool divide_by_n) { -- GitLab From 0595029670cd5439bde9dfe8dd0b25dc1aa697d9 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sun, 2 Aug 2020 16:34:39 +0200 Subject: [PATCH 10/21] remove no-op statements --- python/gridder_cxx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index ccba0bc..628b098 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -853,7 +853,7 @@ template void grid2x_c_helper tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } rr += ku[cu]*tmpr; - ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ri += ku[cu]*tmpi; ptrr += jump; ptri += jump; } @@ -900,7 +900,7 @@ template void grid2x_c_helper_general tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); } rr += ku[cu]*tmpr; - ri += ku[cu]*tmpi;complex(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>())); + ri += ku[cu]*tmpi; ptrr += jump; ptri += jump; } -- GitLab From 74437e10b1857ae3d1dd5627a4231c36a88a464e Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sun, 2 Aug 2020 16:38:14 +0200 Subject: [PATCH 11/21] see if clang works on Debian buster --- .gitlab-ci.yml | 10 +++++----- Dockerfile | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8abc3de..2594f95 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -22,11 +22,11 @@ test_gcc: - python3 setup.py install --user -f - pytest-3 -q python/test -#test_clang: -# stage: testing -# script: -# - CC="clang -fsized-deallocation" python3 setup.py install --user -f -# - pytest-3 -q python/test +test_clang: + stage: testing + script: + - CC="clang -fsized-deallocation" python3 setup.py install --user -f + - pytest-3 -q python/test release: stage: build_tarballs diff --git a/Dockerfile b/Dockerfile index 00751db..d849a4e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,2 +1,2 @@ -FROM debian:testing-slim +FROM debian:buster-slim RUN apt-get update && apt-get install -y 
git python3-pip python3-scipy python3-pytest python3-pybind11 pybind11-dev clang && rm -rf /var/lib/apt/lists/*
-- 
GitLab

From 19ed6ac1a4c2b133cc69264583821494ccb96005 Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Sun, 2 Aug 2020 17:03:20 +0200
Subject: [PATCH 12/21] skip a test if scipy functionality is not available

---
 python/test/test_pointing.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/test/test_pointing.py b/python/test/test_pointing.py
index 5528691..ce5218a 100644
--- a/python/test/test_pointing.py
+++ b/python/test/test_pointing.py
@@ -64,8 +64,11 @@ def testp2():
     assert_((quat2==quat3).all(), "problem")
     quat2 *= np.sign(quat2[:,0]).reshape((-1,1))

-    from scipy.spatial.transform import Rotation as R
-    from scipy.spatial.transform import Slerp
+    try:
+        from scipy.spatial.transform import Rotation as R
+        from scipy.spatial.transform import Slerp
+    except:
+        pytest.skip()
     times1 = t01 + 1./f1*np.arange(size1)
     r1 = R.from_quat(quat1)
     rrquat = R.from_quat(rquat)
-- 
GitLab

From 7be44996ec3c97e83c809c00369fc48e64c46c23 Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Sun, 2 Aug 2020 18:09:44 +0200
Subject: [PATCH 13/21] one more opportunity for skipping

---
 python/test/test_wgridder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/test/test_wgridder.py b/python/test/test_wgridder.py
index 6e05d53..1078e47 100644
--- a/python/test/test_wgridder.py
+++ b/python/test/test_wgridder.py
@@ -108,7 +108,7 @@ def test_adjointness_ms2dirty(nxdirty, nydirty, ofactor, nrow, nchan, epsilon,
 @pmp("fov", (1., 20.))
 def test_ms2dirty_against_wdft2(nxdirty, nydirty, ofactor, nrow, nchan, epsilon,
                                 singleprec, wstacking, use_wgt, fov, nthreads):
     if singleprec and epsilon < 5e-5:
-        return
+        pytest.skip()
     rng = np.random.default_rng(42)
     pixsizex = fov*np.pi/180/nxdirty
     pixsizey = fov*np.pi/180/nydirty*1.1
-- 
GitLab
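Patches 12 and 13 apply the same principle: a test that cannot run in the current environment should be reported as skipped, not silently passed. pytest.skip() keeps the gap visible in the test summary; the bare `return` it replaces did not. On Debian buster, whose packaged SciPy predates scipy.spatial.transform, the previously unguarded import would simply have made the pointing test error out.

From 6a8bc8c2a6d1a58f91189b1fce82f7ed0682f5e0 Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Sun, 2 Aug 2020 20:56:41 +0200
Subject: [PATCH 14/21] ...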
and one more --- python/gridder_cxx.h | 174 +++++++++++++------------------------------ 1 file changed, 50 insertions(+), 124 deletions(-) diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h index 628b098..6fed9e7 100644 --- a/python/gridder_cxx.h +++ b/python/gridder_cxx.h @@ -707,11 +707,12 @@ template MsServ makeMsServ { return MsServ(baselines, idx, ms, wgt); } -template void x2grid_c_helper +template [[gnu::hot]] void x2grid_c_helper (const GridderConfig &gconf, Serv &srv, mav,2> &grid, double w0=-1, double dw=-1) { - size_t supp = gconf.Supp(); + constexpr size_t vlen=native_simd::size(); + constexpr size_t NVEC((SUPP+vlen-1)/vlen); size_t nthreads = gconf.Nthreads(); bool do_w_gridding = dw>0; vector locks(gconf.Nu()); @@ -733,7 +734,7 @@ template void x2grid_c_helper auto v(srv.getVis(ipart)); if (do_w_gridding) v*=hlp.Wfac(); if (flip) v=conj(v); - for (size_t cu=0; cu tmp(v*ku[cu]); for (size_t cv=0; cv void x2grid_c_helper }); } -template void x2grid_c_helper_general - (const GridderConfig &gconf, Serv &srv, mav,2> &grid, - double w0=-1, double dw=-1) - { - size_t supp = gconf.Supp(); - size_t nthreads = gconf.Nthreads(); - bool do_w_gridding = dw>0; - vector locks(gconf.Nu()); - size_t np = srv.Nvis(); - execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) - { - Helper hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; - - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart tmp(v*ku[cu]); - for (size_t cv=0; cv::loadu(ptrr+cv*hlp.vlen); - tr += tmp.real()*kv[cv]; - tr.storeu(ptrr+cv*hlp.vlen); - auto ti = native_simd::loadu(ptri+cv*hlp.vlen); - ti += tmp.imag()*kv[cv]; - ti.storeu(ptri+cv*hlp.vlen); - } - ptrr+=jump; - ptri+=jump; - } - } - }); - } - template void x2grid_c (const GridderConfig &gconf, Serv &srv, mav,2> &grid, double w0=-1, double dw=-1) { checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()}); MR_assert(grid.contiguous(), "grid is not contiguous"); - constexpr size_t vlen=native_simd::size(); - size_t nvec((gconf.Supp()+vlen-1)/vlen); - - if (nvec==1) - x2grid_c_helper<1>(gconf, srv, grid, w0, dw); - else if (nvec==2) - x2grid_c_helper<2>(gconf, srv, grid, w0, dw); - else if (nvec==3) - x2grid_c_helper<3>(gconf, srv, grid, w0, dw); - else if (nvec==4) - x2grid_c_helper<4>(gconf, srv, grid, w0, dw); - else - x2grid_c_helper_general(gconf, srv, grid, w0, dw); - } - -template void grid2x_c_helper - (const GridderConfig &gconf, const mav,2> &grid, - Serv &srv, double w0=-1, double dw=-1) - { - size_t supp = gconf.Supp(); - size_t nthreads = gconf.Nthreads(); - bool do_w_gridding = dw>0; - vector locks(gconf.Nu()); - // Loop over sampling points - size_t np = srv.Nvis(); - execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) + switch(gconf.Supp()) { - Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); - int jump = hlp.lineJump(); - const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC; - - while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart rr=0, ri=0; - const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; - const auto * DUCC0_RESTRICT ptri = hlp.p0ri; - for (size_t cu=0; cu tmpr(0), tmpi(0); - for (size_t cv=0; cv::loadu(ptrr+hlp.vlen*cv); - tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); - } - rr += ku[cu]*tmpr; - ri += ku[cu]*tmpi; - ptrr += jump; - ptri += jump; - } - auto r = complex(reduce(rr, std::plus<>()), reduce(ri, std::plus<>())); - if (flip) 
r=conj(r); - if (do_w_gridding) r*=hlp.Wfac(); - srv.addVis(ipart, r); - } - }); + case 1: x2grid_c_helper< 1>(gconf, srv, grid, w0, dw); break; + case 2: x2grid_c_helper< 2>(gconf, srv, grid, w0, dw); break; + case 3: x2grid_c_helper< 3>(gconf, srv, grid, w0, dw); break; + case 4: x2grid_c_helper< 4>(gconf, srv, grid, w0, dw); break; + case 5: x2grid_c_helper< 5>(gconf, srv, grid, w0, dw); break; + case 6: x2grid_c_helper< 6>(gconf, srv, grid, w0, dw); break; + case 7: x2grid_c_helper< 7>(gconf, srv, grid, w0, dw); break; + case 8: x2grid_c_helper< 8>(gconf, srv, grid, w0, dw); break; + case 9: x2grid_c_helper< 9>(gconf, srv, grid, w0, dw); break; + case 10: x2grid_c_helper<10>(gconf, srv, grid, w0, dw); break; + case 11: x2grid_c_helper<11>(gconf, srv, grid, w0, dw); break; + case 12: x2grid_c_helper<12>(gconf, srv, grid, w0, dw); break; + case 13: x2grid_c_helper<13>(gconf, srv, grid, w0, dw); break; + case 14: x2grid_c_helper<14>(gconf, srv, grid, w0, dw); break; + case 15: x2grid_c_helper<15>(gconf, srv, grid, w0, dw); break; + case 16: x2grid_c_helper<16>(gconf, srv, grid, w0, dw); break; + default: MR_fail("must not happen"); + } } -template void grid2x_c_helper_general +template [[gnu::hot]] void grid2x_c_helper (const GridderConfig &gconf, const mav,2> &grid, Serv &srv, double w0=-1, double dw=-1) { - size_t supp = gconf.Supp(); + constexpr size_t vlen=native_simd::size(); + constexpr size_t NVEC((SUPP+vlen-1)/vlen); size_t nthreads = gconf.Nthreads(); bool do_w_gridding = dw>0; vector locks(gconf.Nu()); @@ -881,7 +799,7 @@ template void grid2x_c_helper_general Helper hlp(gconf, grid.data(), nullptr, locks, w0, dw); int jump = hlp.lineJump(); const T * DUCC0_RESTRICT ku = hlp.buf.scalar; - const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec; + const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC; while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart void grid2x_c_helper_general native_simd rr=0, ri=0; const auto * DUCC0_RESTRICT ptrr = hlp.p0rr; const auto * DUCC0_RESTRICT ptri = hlp.p0ri; - for (size_t cu=0; cu tmpr(0), tmpi(0); - for (size_t cv=0; cv::loadu(ptrr+hlp.vlen*cv); tmpi += kv[cv]*native_simd::loadu(ptri+hlp.vlen*cv); @@ -918,19 +836,27 @@ template void grid2x_c { checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()}); MR_assert(grid.contiguous(), "grid is not contiguous"); - constexpr size_t vlen=native_simd::size(); - size_t nvec((gconf.Supp()+vlen-1)/vlen); - - if (nvec==1) - grid2x_c_helper<1>(gconf, grid, srv, w0, dw); - else if (nvec==2) - grid2x_c_helper<2>(gconf, grid, srv, w0, dw); - else if (nvec==3) - grid2x_c_helper<3>(gconf, grid, srv, w0, dw); - else if (nvec==4) - grid2x_c_helper<4>(gconf, grid, srv, w0, dw); - else - grid2x_c_helper_general(gconf, grid, srv, w0, dw); + + switch(gconf.Supp()) + { + case 1: grid2x_c_helper< 1>(gconf, grid, srv, w0, dw); break; + case 2: grid2x_c_helper< 2>(gconf, grid, srv, w0, dw); break; + case 3: grid2x_c_helper< 3>(gconf, grid, srv, w0, dw); break; + case 4: grid2x_c_helper< 4>(gconf, grid, srv, w0, dw); break; + case 5: grid2x_c_helper< 5>(gconf, grid, srv, w0, dw); break; + case 6: grid2x_c_helper< 6>(gconf, grid, srv, w0, dw); break; + case 7: grid2x_c_helper< 7>(gconf, grid, srv, w0, dw); break; + case 8: grid2x_c_helper< 8>(gconf, grid, srv, w0, dw); break; + case 9: grid2x_c_helper< 9>(gconf, grid, srv, w0, dw); break; + case 10: grid2x_c_helper<10>(gconf, grid, srv, w0, dw); break; + case 11: grid2x_c_helper<11>(gconf, grid, srv, w0, dw); break; + case 12: grid2x_c_helper<12>(gconf, grid, srv, w0, dw); 
break;
+    case 13: grid2x_c_helper<13>(gconf, grid, srv, w0, dw); break;
+    case 14: grid2x_c_helper<14>(gconf, grid, srv, w0, dw); break;
+    case 15: grid2x_c_helper<15>(gconf, grid, srv, w0, dw); break;
+    case 16: grid2x_c_helper<16>(gconf, grid, srv, w0, dw); break;
+    default: MR_fail("must not happen");
+    }
   }

 template<typename T> void apply_global_corrections(const GridderConfig<T> &gconf,
-- 
GitLab
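Patch 14 is the key structural change of the series: the kernel support becomes a template parameter, one [[gnu::hot]] helper replaces the four hand-unrolled nvec variants, and a single switch instantiates it for every supported width, so all inner loops get compile-time trip counts that the compiler can fully unroll and vectorize. The dispatch idiom in isolation, with `work<N>` and `dispatch` as made-up names:

#include <cstddef>
#include <stdexcept>

template<size_t SUPP> double work(double x)
  {
  double acc = 0;
  for (size_t i = 0; i < SUPP; ++i)   // trip count known at compile time
    acc += x;                         // stands in for the real kernel loop
  return acc;
  }

double dispatch(size_t supp, double x)
  {
  switch (supp)   // one runtime branch, then fully specialized code
    {
    case 4:  return work<4>(x);
    case 8:  return work<8>(x);
    case 16: return work<16>(x);
    default: throw std::runtime_error("unsupported kernel support");
    }
  }

From ec2491a9d456831bab0f68caf350e66b09eec0cb Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Mon, 3 Aug 2020 11:45:26 +0200
Subject: [PATCH 15/21] try to build from source distribution

---
 .gitlab-ci.yml               | 10 +++++-----
 src/ducc0/infra/misc_utils.h | 53 ++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 4 deletions(-)
 create mode 100644 src/ducc0/infra/misc_utils.h

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2594f95..611fcc9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -19,14 +19,16 @@ build_docker:
 test_gcc:
   stage: testing
   script:
-    - python3 setup.py install --user -f
-    - pytest-3 -q python/test
+    - python3 setup.py sdist
+    - 'tar xvzf sdist/ducc0*.tar.gz'
+    - 'cd ducc0-* && python3 setup.py install --user -f && pytest-3 -q python/test'

 test_clang:
   stage: testing
   script:
-    - CC="clang -fsized-deallocation" python3 setup.py install --user -f
-    - pytest-3 -q python/test
+    - python3 setup.py sdist
+    - 'tar xvzf sdist/ducc0*.tar.gz'
+    - 'cd ducc0-* && CC="clang -fsized-deallocation" python3 setup.py install --user -f && pytest-3 -q python/test'

 release:
   stage: build_tarballs

diff --git a/src/ducc0/infra/misc_utils.h b/src/ducc0/infra/misc_utils.h
new file mode 100644
index 0000000..9c862f6
--- /dev/null
+++ b/src/ducc0/infra/misc_utils.h
@@ -0,0 +1,53 @@
+/*
+ * This file is part of the MR utility library.
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.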
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Copyright (C) 2019-2020 Max-Planck-Society
+   Author: Martin Reinecke */
+
+#ifndef DUCC0_MISC_UTILS_H
+#define DUCC0_MISC_UTILS_H
+
+#include <tuple>
+
+namespace ducc0 {
+
+namespace detail_misc_utils {
+
+using namespace std;
+
+template<typename T> auto calcShare(size_t nshares, size_t myshare,
+  const T &begin, const T &end)
+  {
+  auto nwork = end-begin;
+  auto nbase = nwork/nshares;
+  auto additional = nwork%nshares;
+  auto lo = begin + (myshare*nbase + ((myshare<additional) ? myshare : additional));
+  auto hi = lo+nbase+(myshare<additional);
+  return make_tuple(lo, hi);
+  }
+
+template<typename T> auto calcShare(size_t nshares, size_t myshare, const T &end)
+  { return calcShare(nshares, myshare, T(0), end); }
+
+}
+
+using detail_misc_utils::calcShare;
+
+}
+
+#endif
-- 
GitLab
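misc_utils.h generalizes the calc_share() helper introduced in patch 06: nwork items are split into nshares contiguous blocks whose sizes differ by at most one, with the first nwork%nshares shares receiving the extra item. For 10 items over 4 shares this yields [0,3), [3,6), [6,8), [8,10). The arithmetic, restated as a self-contained sketch with illustrative names:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <tuple>

// Same arithmetic as ducc0's calcShare, restated for illustration.
std::tuple<size_t, size_t> share(size_t nshares, size_t myshare, size_t nwork)
  {
  size_t nbase = nwork/nshares;          // minimum block size
  size_t extra = nwork%nshares;          // first 'extra' shares get one more item
  size_t lo = myshare*nbase + std::min(myshare, extra);
  size_t hi = lo + nbase + (myshare<extra ? 1 : 0);
  return {lo, hi};
  }

int main()
  {
  for (size_t i = 0; i < 4; ++i)
    {
    auto [lo, hi] = share(4, i, 10);
    std::cout << "share " << i << ": [" << lo << "," << hi << ")\n";  // [0,3) [3,6) [6,8) [8,10)
    }
  }

From 84976b7ab0fb74f481617431b9b644efe8279806 Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Mon, 3 Aug 2020 11:50:58 +0200
Subject: [PATCH 16/21] fix

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 611fcc9..0ff1732 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,14 +20,14 @@ test_gcc:
   stage: testing
   script:
     - python3 setup.py sdist
-    - 'tar xvzf sdist/ducc0*.tar.gz'
+    - 'tar xvzf dist/ducc0*.tar.gz'
     - 'cd ducc0-* && python3 setup.py install --user -f && pytest-3 -q python/test'

 test_clang:
   stage: testing
   script:
     - python3 setup.py sdist
-    - 'tar xvzf sdist/ducc0*.tar.gz'
+    - 'tar xvzf dist/ducc0*.tar.gz'
     - 'cd ducc0-* && CC="clang -fsized-deallocation" python3 setup.py install --user -f && pytest-3 -q python/test'

 release:
-- 
GitLab

From 9b8d7eee8e0deb0c3f436e4a39f24d733c833d50 Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Mon, 3 Aug 2020 11:53:37 +0200
Subject: [PATCH 17/21] factor out common code

---
 MANIFEST.in          |  1 +
 python/gridder_cxx.h | 27 +++++----------------------
 src/ducc0/math/fft.h |  6 ++----
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 8197390..cac1ab8 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,6 +5,7 @@ include LICENSE
 include src/ducc0/infra/aligned_array.h
 include src/ducc0/infra/error_handling.h
 include src/ducc0/infra/mav.h
+include src/ducc0/infra/misc_utils.h
 include src/ducc0/infra/simd.h
 include src/ducc0/infra/string_utils.cc
 include src/ducc0/infra/string_utils.h

diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h
index 6fed9e7..b15e30a 100644
--- a/python/gridder_cxx.h
+++ b/python/gridder_cxx.h
@@ -33,6 +33,7 @@
 #include "ducc0/infra/error_handling.h"
 #include "ducc0/math/fft.h"
 #include "ducc0/infra/threading.h"
+#include "ducc0/infra/misc_utils.h"
 #include "ducc0/infra/useful_macros.h"
 #include "ducc0/infra/mav.h"
 #include "ducc0/infra/simd.h"
@@ -916,15 +917,6 @@ template<typename T> void apply_global_corrections(const GridderConfig<T> &gconf
     });
   }

-auto calc_share(size_t nshares, size_t myshare, size_t nwork)
-  {
-  size_t nbase = nwork/nshares;
-  size_t additional = nwork%nshares;
-  size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
-  size_t hi = lo+nbase+(myshare<additional);
-  return make_tuple(lo, hi);
-  }
-
 template<typename Serv> class WgridHelper
   {
   private:
@@ -987,7 +979,7 @@ template<typename Serv> class WgridHelper
         res.resize((v.size()+add.size())-del.size());
         execParallel(nthreads, [&](Scheduler &sched) {
           auto tid = sched.thread_num();
-          auto [lo, hi] = calc_share(nthreads, tid, v.size());
+          auto [lo, hi] = calcShare(nthreads, tid, v.size());
           if (lo==hi) return; // if interval is empty, do nothing
           auto iin=v.begin()+lo, ein=v.begin()+hi;
          auto iadd = (iin==v.begin()) ?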
add.begin() : lower_bound(add.begin(), add.end(), *iin);
@@ -1179,16 +1171,6 @@ template<typename T> void dirty2x(
     }
   }

-void calc_share(size_t nshares, size_t myshare, size_t nwork, size_t &lo,
-  size_t &hi)
-  {
-  size_t nbase = nwork/nshares;
-  size_t additional = nwork%nshares;
-  lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
-  hi = lo+nbase+(myshare<additional);
-  }
-
 template<typename T> vector<idx_t> getWgtIndices(const Baselines &baselines,
   const GridderConfig<T> &gconf, const mav<T,2> &wgt,
   const mav<complex<T>,2> &ms)
@@ -1207,10 +1189,11 @@ template<typename T> vector<idx_t> getWgtIndices(const Baselines &baselines,
   mav<idx_t,2> acc({nthreads, (nbu*nbv+16)}); // the 16 is safety distance to avoid false sharing
   vector<idx_t> tmp(nrow*nchan);

-  execStatic(nrow, nthreads, 0, [&](Scheduler &sched)
+  execParallel(nthreads, [&](Scheduler &sched)
     {
     idx_t tid = sched.thread_num();
-    while (auto rng=sched.getNext()) for(auto irow=idx_t(rng.lo); irow<idx_t(rng.hi); ++irow)
+    auto [lo, hi] = calcShare(nthreads, tid, nrow);
+    for(auto irow=idx_t(lo); irow<idx_t(hi); ++irow)

diff --git a/src/ducc0/math/fft.h b/src/ducc0/math/fft.h
index 0f2fe6d..18b6337 100644
--- a/src/ducc0/math/fft.h
+++ b/src/ducc0/math/fft.h
@@ -55,6 +55,7 @@
 #include
 #include
 #include "ducc0/infra/threading.h"
+#include "ducc0/infra/misc_utils.h"
 #include "ducc0/infra/simd.h"
 #include "ducc0/infra/mav.h"
 #ifndef DUCC0_NO_THREADING
@@ -437,10 +438,7 @@ template<typename T> class multi_iter
       if (nshares==1) return;
       if (nshares==0) throw std::runtime_error("can't run with zero threads");
       if (myshare>=nshares) throw std::runtime_error("impossible share requested");
-      size_t nbase = rem/nshares;
-      size_t additional = rem%nshares;
-      size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
-      size_t hi = lo+nbase+(myshare<additional);
+      auto [lo, hi] = calcShare(nshares, myshare, rem);
-- 
GitLab
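The next patch retires the experimental get_pool2() from patch 01: get_pool() itself now owns the pool through a unique_ptr, creates it lazily with the default thread count, and rebuilds it whenever a caller requests a different size. set_pool_size() is a thin wrapper around get_pool(new_size), and the Python layer exposes it as ducc0.misc.set_thread_pool_size(). From C++ the knob looks like this (a sketch assuming only the declarations added to threading.h in this patch):

#include "ducc0/infra/threading.h"

int main()
  {
  ducc0::set_pool_size(4);  // shrink the global worker pool to four threads
  // ... FFTs and gridding launched after this point draw from four workers
  }

From: Martin Reinecke
Date: Mon, 3 Aug 2020 13:52:17 +0200
Subject: [PATCH 18/21] prepare for more flexible multithreading

---
 python/misc.cc               |  4 ++++
 src/ducc0/infra/threading.cc | 42 +++++++++---------------------
 src/ducc0/infra/threading.h  |  5 +++--
 src/ducc0/math/fft.h         |  2 +-
 4 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/python/misc.cc b/python/misc.cc
index 7ef2c97..74791c0 100644
--- a/python/misc.cc
+++ b/python/misc.cc
@@ -203,6 +203,8 @@ py::array py_transpose(const py::array &in, py::array &out)
   MR_fail("unsupported datatype");
   }

+void py_set_thread_pool_size(size_t new_pool_size)
+  { set_pool_size(new_pool_size); }

 const char *misc_DS = R"""(
 Various unsorted utilities
@@ -225,6 +227,8 @@ void add_misc(py::module &msup)
   m.def("ascontiguousarray",&py_ascontiguousarray, "in"_a);

   m.def("transpose",&py_transpose, "in"_a, "out"_a);
+
+  m.def("set_thread_pool_size",&py_set_thread_pool_size, "new_pool_size"_a);
   }

 }

diff --git a/src/ducc0/infra/threading.cc b/src/ducc0/infra/threading.cc
index 04994f3..39d4b20 100644
--- a/src/ducc0/infra/threading.cc
+++ b/src/ducc0/infra/threading.cc
@@ -41,9 +41,7 @@ namespace detail_threading {

 #ifndef DUCC0_NO_THREADING

-static const size_t max_threads_ = std::max<size_t>(1, std::thread::hardware_concurrency());
-
-std::atomic<size_t> default_nthreads_(max_threads_);
+std::atomic<size_t> default_nthreads_(std::max<size_t>(1, std::thread::hardware_concurrency()));

 size_t get_default_nthreads()
   { return default_nthreads_; }
@@ -51,8 +49,6 @@ size_t get_default_nthreads()
 void set_default_nthreads(size_t new_default_nthreads)
   { default_nthreads_ = std::max<size_t>(1, new_default_nthreads); }

-size_t max_threads() { return max_threads_; }
-
 class latch
   {
   std::atomic<size_t> num_left_;
@@ -156,8 +152,6 @@ class thread_pool
       threads_(nthreads)
     { create_threads(); }

-  thread_pool(): thread_pool(max_threads_) {}
-
   ~thread_pool() { shutdown(); }

   size_t size() const { return threads_.size(); }
@@ -182,9 +176,12 @@ class thread_pool
     }
   };

-inline thread_pool &get_pool()
+thread_pool &get_pool(size_t nthreads=0)
   {
-  static thread_pool pool;
+  static std::unique_ptr<thread_pool> pool;
+  if ((!pool) && (nthreads==0)) nthreads=default_nthreads_;
+  if ((!pool) || ((nthreads!=0) && (nthreads!=pool->size()))) // resize
+    pool =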
std::make_unique(nthreads); #if __has_include() static std::once_flag f; call_once(f, @@ -197,30 +194,12 @@ inline thread_pool &get_pool() }); #endif - return pool; - } -inline thread_pool &get_pool2(size_t nthreads=0) - { - static std::unique_ptr pool(std::make_unique(1)); - if ((!pool) || ((nthreads!=0) && (nthreads!=pool->size()))) // resize - { - pool = std::make_unique(nthreads); - } -#if __has_include() - static std::once_flag f; - call_once(f, - []{ - pthread_atfork( - +[]{ get_pool2().shutdown(); }, // prepare - +[]{ get_pool2().restart(); }, // parent - +[]{ get_pool2().restart(); } // child - ); - }); -#endif - return *pool; } +void set_pool_size(size_t new_pool_size) + { get_pool(new_pool_size); } + class Distribution { private: @@ -341,8 +320,7 @@ void Distribution::thread_map(std::function f) return; } - auto & pool = get_pool2(nthreads_); -// auto & pool = get_pool(); + auto &pool = get_pool(); latch counter(nthreads_); std::exception_ptr ex; std::mutex ex_mut; diff --git a/src/ducc0/infra/threading.h b/src/ducc0/infra/threading.h index 043de51..a4b4545 100644 --- a/src/ducc0/infra/threading.h +++ b/src/ducc0/infra/threading.h @@ -47,10 +47,11 @@ class Scheduler virtual Range getNext() = 0; }; -size_t max_threads(); void set_default_nthreads(size_t new_default_nthreads); size_t get_default_nthreads(); +void set_pool_size(size_t new_pool_size); + void execSingle(size_t nwork, std::function func); void execStatic(size_t nwork, size_t nthreads, size_t chunksize, @@ -63,7 +64,7 @@ void execParallel(size_t nthreads, std::function func); } // end of namespace detail_threading -using detail_threading::max_threads; +using detail_threading::set_pool_size; using detail_threading::get_default_nthreads; using detail_threading::set_default_nthreads; using detail_threading::Scheduler; diff --git a/src/ducc0/math/fft.h b/src/ducc0/math/fft.h index 0f2fe6d..18b6337 100644 --- a/src/ducc0/math/fft.h +++ b/src/ducc0/math/fft.h @@ -118,7 +118,7 @@ struct util // hack to avoid duplicate symbols size_t parallel = size / (info.shape(axis) * vlen); if (info.shape(axis) < 1000) parallel /= 4; - size_t max_threads = (nthreads==0) ? ducc0::max_threads() : nthreads; + size_t max_threads = (nthreads==0) ? 
ducc0::get_default_nthreads() : nthreads; return std::max(size_t(1), std::min(parallel, max_threads)); } #endif -- GitLab From b957f020f1eb1c96a63acbbc3d8bf21fad1237bf Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 3 Aug 2020 14:11:12 +0200 Subject: [PATCH 19/21] tweak single-value kernel evaluation --- src/ducc0/math/gridding_kernel.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/ducc0/math/gridding_kernel.h b/src/ducc0/math/gridding_kernel.h index 883b453..e44a2c4 100644 --- a/src/ducc0/math/gridding_kernel.h +++ b/src/ducc0/math/gridding_kernel.h @@ -161,6 +161,8 @@ template class HornerKernel: public GriddingKernel size_t W, D, nvec; vector coeff; + const T *scoeff; + size_t sstride; void (HornerKernel::* evalfunc) (T, native_simd *) const; T (HornerKernel::* evalsinglefunc) (T) const; @@ -182,11 +184,10 @@ template class HornerKernel: public GriddingKernel { auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5)))); x = (x+1)*W-2*nth-1; - auto i = nth/vlen; - auto imod = nth%vlen; - auto tval = coeff[i][imod]; + auto ptr = scoeff+nth; + auto tval = *ptr; for (size_t j=1; j<=DEG; ++j) - tval = tval*x + coeff[j*nvec+i][imod]; + tval = tval*x + ptr[j*sstride]; return tval; } @@ -206,11 +207,10 @@ template class HornerKernel: public GriddingKernel { auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5)))); x = (x+1)*W-2*nth-1; - auto i = nth/vlen; - auto imod = nth%vlen; - auto tval = coeff[i][imod]; + auto ptr = scoeff+nth; + auto tval = *ptr; for (size_t j=1; j<=D; ++j) - tval = tval*x + coeff[j*nvec+i][imod]; + tval = tval*x + ptr[j*sstride]; return tval; } @@ -274,7 +274,8 @@ template class HornerKernel: public GriddingKernel HornerKernel(size_t W_, size_t D_, const function &func, const KernelCorrection &corr_) : W(W_), D(D_), nvec((W+vlen-1)/vlen), - coeff(makeCoeff(W_, D_, func)), corr(corr_) + coeff(makeCoeff(W_, D_, func)), scoeff(reinterpret_cast(&coeff[0])), + sstride(vlen*nvec), corr(corr_) { wire_eval(); } virtual size_t support() const { return W; } -- GitLab From f6108f09178802c0fbbb1d1aaae359fb912666f6 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 3 Aug 2020 14:48:30 +0200 Subject: [PATCH 20/21] update ChangeLog --- ChangeLog | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ChangeLog b/ChangeLog index fb7dc3e..bb795a9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +0.3.0: +- general: + - The package should now be installable from PyPI via pip even on MacOS. + However, MacOS >= 10.15 is required. + +- wgridder: + - very substantial performance and scaling improvements + + 0.2.0: - wgridder: -- GitLab From 44592ac66487c3a1de13e76800330bd5b2433654 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 3 Aug 2020 15:38:22 +0200 Subject: [PATCH 21/21] update ChangeLog --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index bb795a9..5738603 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,7 @@ 0.3.0: - general: - The package should now be installable from PyPI via pip even on MacOS. - However, MacOS >= 10.15 is required. + However, MacOS >= 10.14 is required. - wgridder: - very substantial performance and scaling improvements -- GitLab