diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8abc3de77c916165572b74003474fb2032a516c6..0ff1732dc580acfe22d0831308851efc7005fce7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -19,14 +19,16 @@ build_docker:
 test_gcc:
   stage: testing
   script:
-    - python3 setup.py install --user -f
-    - pytest-3 -q python/test
+    - python3 setup.py sdist
+    - 'tar xvzf dist/ducc0*.tar.gz'
+    - 'cd ducc0-* && python3 setup.py install --user -f && pytest-3 -q python/test'
 
-#test_clang:
-#  stage: testing
-#  script:
-#    - CC="clang -fsized-deallocation" python3 setup.py install --user -f
-#    - pytest-3 -q python/test
+test_clang:
+  stage: testing
+  script:
+    - python3 setup.py sdist
+    - 'tar xvzf dist/ducc0*.tar.gz'
+    - 'cd ducc0-* && CC="clang -fsized-deallocation" python3 setup.py install --user -f && pytest-3 -q python/test'
 
 release:
   stage: build_tarballs
diff --git a/ChangeLog b/ChangeLog
index fb7dc3e5d84ad8fb30a266aa09413f3ddb0824b7..57386037e1effcd8ad9c072934b04c05a2252b0c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+0.3.0:
+- general:
+  - The package should now be installable from PyPI via pip even on MacOS.
+    However, MacOS >= 10.14 is required.
+
+- wgridder:
+  - very substantial performance and scaling improvements
+
+
 0.2.0:
 
 - wgridder:
diff --git a/Dockerfile b/Dockerfile
index 00751db6b7026c4feac1d074a5bb0b391405293b..d849a4e7897f8b64c7d23af677cb4b4ddaca4bd7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,2 +1,2 @@
-FROM debian:testing-slim
+FROM debian:buster-slim
 RUN apt-get update && apt-get install -y git python3-pip python3-scipy python3-pytest python3-pybind11 pybind11-dev clang && rm -rf /var/lib/apt/lists/*
diff --git a/MANIFEST.in b/MANIFEST.in
index 8197390d15a28b7b6f88d2c037370b3803bef24e..cac1ab8ab188086c9a2d159f37b37e01b5e90b47 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,6 +5,7 @@ include LICENSE
 include src/ducc0/infra/aligned_array.h
 include src/ducc0/infra/error_handling.h
 include src/ducc0/infra/mav.h
+include src/ducc0/infra/misc_utils.h
 include src/ducc0/infra/simd.h
 include src/ducc0/infra/string_utils.cc
 include src/ducc0/infra/string_utils.h
diff --git a/python/gridder_cxx.h b/python/gridder_cxx.h
index 9c48bc31ac25a0ea7b005dbaba65af59dc90a004..b15e30a6bf44937a57dab32d79f9c42dec2f0595 100644
--- a/python/gridder_cxx.h
+++ b/python/gridder_cxx.h
@@ -33,6 +33,7 @@
 #include "ducc0/infra/error_handling.h"
 #include "ducc0/math/fft.h"
 #include "ducc0/infra/threading.h"
+#include "ducc0/infra/misc_utils.h"
 #include "ducc0/infra/useful_macros.h"
 #include "ducc0/infra/mav.h"
 #include "ducc0/infra/simd.h"
@@ -237,15 +238,15 @@ template<typename T> class GridderConfig
     size_t vlim;
     bool uv_side_fast;
 
-    complex<T> wscreen(T x, T y, T w, bool adjoint) const
+    T phase (T x, T y, T w, bool adjoint) const
       {
       constexpr T pi = T(3.141592653589793238462643383279502884197);
       T tmp = 1-x-y;
       if (tmp<=0) return 1; // no phase factor beyond the horizon
       T nm1 = (-x-y)/(sqrt(tmp)+1); // more accurate form of sqrt(1-x-y)-1
-      T phase = 2*pi*w*nm1;
-      if (adjoint) phase *= -1;
-      return complex<T>(cos(phase), sin(phase));
+      T phs = 2*pi*w*nm1;
+      if (adjoint) phs *= -1;
+      return phs;
       }
 
   public:
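
Note on the wscreen -> phase change above: returning the bare phase angle instead of a complex phasor lets the callers below collect all angles of an image row first and evaluate sin/cos afterwards in one tight loop, which compilers can vectorize. A minimal standalone sketch of that pattern (plain C++, not the ducc0 API; `phase_of` is a hypothetical callable standing in for GridderConfig::phase):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    template<typename T, typename F> void row_phasors
      (F phase_of, size_t n, std::vector<T> &cp, std::vector<T> &sp)
      {
      std::vector<T> ph(n);
      for (size_t j=0; j<n; ++j)
        ph[j] = phase_of(j);            // cheap phase computation first
      cp.resize(n); sp.resize(n);
      for (size_t j=0; j<n; ++j)        // hot loop contains only sin/cos
        { cp[j] = std::cos(ph[j]); sp[j] = std::sin(ph[j]); }
      }
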
@@ -307,8 +308,6 @@ template<typename T> class GridderConfig
           if (i2>=nu) i2-=nu;
           size_t j2 = nv-ny_dirty/2+j;
           if (j2>=nv) j2-=nv;
-          // FIXME: for some reason g++ warns about double-to-float conversion
-          // here, even though there is an explicit cast...
           dirty.v(i,j) = tmav(i2,j2)*T(cfu[icfu]*cfv[icfv]);
           }
         }
@@ -322,33 +321,45 @@ template<typename T> class GridderConfig
              y0 = -0.5*ny_dirty*psy;
       execStatic(nx_dirty/2+1, nthreads, 0, [&](Scheduler &sched)
         {
+        using vtype = native_simd<T>;
+        constexpr size_t vlen=vtype::size();
+        size_t nvec = (ny_dirty/2+1+(vlen-1))/vlen;
+        vector<vtype> ph(nvec), sp(nvec), cp(nvec);
         while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
           {
           T fx = T(x0+i*psx);
           fx *= fx;
+          size_t ix = nu-nx_dirty/2+i;
+          if (ix>=nu) ix-=nu;
+          size_t i2 = nx_dirty-i;
+          size_t ix2 = nu-nx_dirty/2+i2;
+          if (ix2>=nu) ix2-=nu;
           for (size_t j=0; j<=ny_dirty/2; ++j)
             {
             T fy = T(y0+j*psy);
-            auto ws = wscreen(fx, fy*fy, w, true);
-            size_t ix = nu-nx_dirty/2+i;
-            if (ix>=nu) ix-=nu;
-            size_t jx = nv-ny_dirty/2+j;
-            if (jx>=nv) jx-=nv;
-            dirty.v(i,j) += (tmav(ix,jx)*ws).real(); // lower left
-            size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
-            size_t ix2 = nu-nx_dirty/2+i2;
-            if (ix2>=nu) ix2-=nu;
-            size_t jx2 = nv-ny_dirty/2+j2;
-            if (jx2>=nv) jx2-=nv;
-            if ((i>0)&&(i<nx_dirty/2))
+            ph[j/vlen][j%vlen] = phase(fx, fy*fy, w, true);
+            }
+          for (size_t j=0; j<nvec; ++j)
+            for (size_t k=0; k<vlen; ++k)
+              { cp[j][k] = cos(ph[j][k]); sp[j][k] = sin(ph[j][k]); }
+          if ((i>0)&&(i<nx_dirty/2))
+            for (size_t j=0, jx=nv-ny_dirty/2; j<ny_dirty; ++j, jx=(jx+1>=nv)? jx+1-nv : jx+1)
               {
-              dirty.v(i2,j) += (tmav(ix2,jx)*ws).real(); // lower right
-              if ((j>0)&&(j<ny_dirty/2))
-                dirty.v(i2,j2) += (tmav(ix2,jx2)*ws).real(); // upper right
+              size_t j2 = min(j, ny_dirty-j);
+              T re = cp[j2/vlen][j2%vlen], im = sp[j2/vlen][j2%vlen];
+              dirty.v(i,j) += tmav(ix,jx).real()*re - tmav(ix,jx).imag()*im; // lower left
+              dirty.v(i2,j) += tmav(ix2,jx).real()*re - tmav(ix2,jx).imag()*im; // lower right
               }
-            if ((j>0)&&(j<ny_dirty/2))
-              dirty.v(i,j2) += (tmav(ix,jx2)*ws).real(); // upper left
-            }
+          else
+            for (size_t j=0, jx=nv-ny_dirty/2; j<ny_dirty; ++j, jx=(jx+1>=nv)? jx+1-nv : jx+1)
+              {
+              size_t j2 = min(j, ny_dirty-j);
+              T re = cp[j2/vlen][j2%vlen], im = sp[j2/vlen][j2%vlen];
+              dirty.v(i,j) += tmav(ix,jx).real()*re - tmav(ix,jx).imag()*im; // lower left
+              }
           }
         });
       }
@@ ... @@ template<typename T> class GridderConfig
              y0 = -0.5*ny_dirty*psy;
       execStatic(nx_dirty/2+1, nthreads, 0, [&](Scheduler &sched)
         {
+        using vtype = native_simd<T>;
+        constexpr size_t vlen=vtype::size();
+        size_t nvec = (ny_dirty/2+1+(vlen-1))/vlen;
+        vector<vtype> ph(nvec), sp(nvec), cp(nvec);
         while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
           {
           T fx = T(x0+i*psx);
           fx *= fx;
+          size_t ix = nu-nx_dirty/2+i;
+          if (ix>=nu) ix-=nu;
+          size_t i2 = nx_dirty-i;
+          size_t ix2 = nu-nx_dirty/2+i2;
+          if (ix2>=nu) ix2-=nu;
           for (size_t j=0; j<=ny_dirty/2; ++j)
             {
             T fy = T(y0+j*psy);
-            auto ws = wscreen(fx, fy*fy, w, false);
-            size_t ix = nu-nx_dirty/2+i;
-            if (ix>=nu) ix-=nu;
-            size_t jx = nv-ny_dirty/2+j;
-            if (jx>=nv) jx-=nv;
-            grid.v(ix,jx) = dirty(i,j)*ws; // lower left
-            size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
-            size_t ix2 = nu-nx_dirty/2+i2;
-            if (ix2>=nu) ix2-=nu;
-            size_t jx2 = nv-ny_dirty/2+j2;
-            if (jx2>=nv) jx2-=nv;
-            if ((i>0)&&(i<nx_dirty/2))
+            ph[j/vlen][j%vlen] = phase(fx, fy*fy, w, false);
+            }
+          for (size_t j=0; j<nvec; ++j)
+            for (size_t k=0; k<vlen; ++k)
+              { cp[j][k] = cos(ph[j][k]); sp[j][k] = sin(ph[j][k]); }
+          if ((i>0)&&(i<nx_dirty/2))
+            for (size_t j=0, jx=nv-ny_dirty/2; j<ny_dirty; ++j, jx=(jx+1>=nv)? jx+1-nv : jx+1)
               {
+              size_t j2 = min(j, ny_dirty-j);
+              auto ws = complex<T>(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]);
+              grid.v(ix,jx) = dirty(i,j)*ws; // lower left
               grid.v(ix2,jx) = dirty(i2,j)*ws; // lower right
-              if ((j>0)&&(j<ny_dirty/2))
-                grid.v(ix2,jx2) = dirty(i2,j2)*ws; // upper right
               }
-            if ((j>0)&&(j<ny_dirty/2))
-              grid.v(ix,jx2) = dirty(i,j2)*ws; // upper left
-            }
+          else
+            for (size_t j=0, jx=nv-ny_dirty/2; j<ny_dirty; ++j, jx=(jx+1>=nv)? jx+1-nv : jx+1)
+              {
+              size_t j2 = min(j, ny_dirty-j);
+              auto ws = complex<T>(cp[j2/vlen][j2%vlen],sp[j2/vlen][j2%vlen]);
+              grid.v(ix,jx) = dirty(i,j)*ws; // lower left
+              }
           }
         });
       }
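
Note: the `min(j, ny_dirty-j)` fold in the two rewritten loops above is valid because `phase()` only receives `fy*fy`, so rows `j` and `ny_dirty-j` get the same w-screen value; one (cos, sin) table of length `ny_dirty/2+1` therefore serves the whole row. The fold in isolation (standalone illustration, assuming `j` runs over `[0, ny_dirty)`):

    #include <cstddef>

    // index of the precomputed table entry used for row j
    inline size_t fold(size_t j, size_t ny_dirty)
      { return j <= ny_dirty-j ? j : ny_dirty-j; }
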
@@ -485,56 +508,24 @@ template<typename T> class GridderConfig
       v=fmod1(v_in*psy)*nv;
       iv0 = min(int(v+vshift)-int(nv), maxiv0);
       }
-
-    void apply_wscreen(const mav<complex<T>,2> &dirty,
-      mav<complex<T>,2> &dirty2, double w, bool adjoint) const
-      {
-      checkShape(dirty.shape(), {nx_dirty, ny_dirty});
-      checkShape(dirty2.shape(), {nx_dirty, ny_dirty});
-
-      double x0 = -0.5*nx_dirty*psx,
-             y0 = -0.5*ny_dirty*psy;
-      execStatic(nx_dirty/2+1, nthreads, 0, [&](Scheduler &sched)
-        {
-        while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
-          {
-          T fx = T(x0+i*psx);
-          fx *= fx;
-          for (size_t j=0; j<=ny_dirty/2; ++j)
-            {
-            T fy = T(y0+j*psy);
-            auto ws = wscreen(fx, fy*fy, T(w), adjoint);
-            dirty2.v(i,j) = dirty(i,j)*ws; // lower left
-            size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
-            if ((i>0)&&(i<i2))
-              {
-              dirty2.v(i2,j) = dirty(i2,j)*ws; // lower right
-              if ((j>0)&&(j<j2))
-                dirty2.v(i2,j2) = dirty(i2,j2)*ws; // upper right
-              }
-            if ((j>0)&&(j<j2))
-              dirty2.v(i,j2) = dirty(i,j2)*ws; // upper left
-            }
-          }
-        });
-      }
   };
 
-template<typename T, typename T2=complex<T>> class Helper
+template<typename T> class Helper
   {
   private:
+    using T2=complex<T>;
     const GridderConfig<T> &gconf;
     int nu, nv, nsafe, supp;
     const T2 *grid_r;
     T2 *grid_w;
-    int su, sv;
+    int su, sv, svvec;
     int iu0, iv0; // start index of the current visibility
     int bu0, bv0; // start index of the current buffer
 
     T wfac;
 
-    vector<T2> rbuf, wbuf;
+    mav<T,2> rbufr, rbufi, wbufr, wbufi;
     bool do_w_gridding;
     double w0, xdw;
     vector<std::mutex> &locks;
@@ -552,7 +543,7 @@ template<typename T, typename T2=complex<T>> class Helper
         std::lock_guard<std::mutex> lock(locks[idxu]);
         for (int iv=0; iv<sv; ++iv)
           {
-          grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv];
+          grid_w[idxu*nv + idxv] += T2(wbufr(iu,iv), wbufi(iu,iv));
           if (++idxv>=nv) idxv=0;
           }
         }
@@ -569,7 +560,8 @@ template<typename T, typename T2=complex<T>> class Helper
       int idxv = idxv0;
       for (int iv=0; iv<sv; ++iv)
         {
-        rbuf[iu*sv + iv] = grid_r[idxu*nv + idxv];
+        rbufr.v(iu,iv) = grid_r[idxu*nv + idxv].real();
+        rbufi.v(iu,iv) = grid_r[idxu*nv + idxv].imag();
         if (++idxv>=nv) idxv=0;
         }
       if (++idxu>=nu) idxu=0;
@@ -578,8 +570,8 @@ template<typename T, typename T2=complex<T>> class Helper
 
   public:
     size_t nvec;
-    const T2 *p0r;
-    T2 *p0w;
+    const T *p0rr, *p0ri;
+    T *p0wr, *p0wi;
     static constexpr size_t vlen=native_simd<T>::size();
     union kbuf {
       T scalar[64];
@@ -591,10 +583,15 @@ template<typename T, typename T2=complex<T>> class Helper
            vector<std::mutex> &locks_, double w0_=-1, double dw_=-1)
       : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
         supp(gconf.Supp()), grid_r(grid_r_),
-        grid_w(grid_w_), su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
-        rbuf(su*sv*(grid_r!=nullptr),T(0)),
-        wbuf(su*sv*(grid_w!=nullptr),T(0)),
+        grid_w(grid_w_), su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
+        svvec(((sv+vlen-1)/vlen)*vlen),
+        rbufr({size_t(su),size_t(svvec)}), rbufi({size_t(su),size_t(svvec)}),
+        wbufr({size_t(su),size_t(svvec)}), wbufi({size_t(su),size_t(svvec)}),
         do_w_gridding(dw_>0),
         w0(w0_), xdw(T(1)/dw_),
@@ -605,7 +602,7 @@ template<typename T, typename T2=complex<T>> class Helper
       }
     ~Helper() { if (grid_w) dump(); }
 
-    int lineJump() const { return sv; }
+    int lineJump() const { return svvec; }
     T Wfac() const { return wfac; }
     void prep(const UVW &in)
       {
@@ -621,13 +618,20 @@ template<typename T, typename T2=complex<T>> class Helper
       if (do_w_gridding)
         wfac = krn.eval_single(T(xdw*xsupp*abs(w0-in.w)));
       if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv))
         {
-        if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); }
+        if (grid_w)
+          {
+          dump();
+          wbufr.apply([](T &v){v=0;});
+          wbufi.apply([](T &v){v=0;});
+          }
         bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe;
         bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe;
         if (grid_r) load();
         }
-      p0r = grid_r ? rbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
-      p0w = grid_w ? wbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
+      auto ofs = svvec*(iu0-bu0) + iv0-bv0;
+      p0rr = grid_r ? &rbufr(0,0) + ofs : nullptr;
+      p0ri = grid_r ? &rbufi(0,0) + ofs : nullptr;
+      p0wr = grid_w ? &wbufr(0,0) + ofs : nullptr;
+      p0wi = grid_w ? &wbufi(0,0) + ofs : nullptr;
       }
@@ ... @@
 template<class T, class T2> MsServ<T, T2> makeMsServ
   (const Baselines &baselines,
    const mav<idx_t,1> &idx, T2 &ms, const mav<T,2> &wgt)
   { return MsServ<T, T2>(baselines, idx, ms, wgt); }
 
-template<typename T, typename Serv> void x2grid_c
+template<size_t SUPP, typename T, typename Serv> [[gnu::hot]] void x2grid_c_helper
   (const GridderConfig<T> &gconf, Serv &srv, mav<complex<T>,2> &grid,
    double w0=-1, double dw=-1)
   {
-  checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()});
-  MR_assert(grid.contiguous(), "grid is not contiguous");
-  size_t supp = gconf.Supp();
+  constexpr size_t vlen=native_simd<T>::size();
+  constexpr size_t NVEC((SUPP+vlen-1)/vlen);
   size_t nthreads = gconf.Nthreads();
   bool do_w_gridding = dw>0;
   vector<std::mutex> locks(gconf.Nu());
-
   size_t np = srv.Nvis();
   execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched)
     {
     Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw);
     int jump = hlp.lineJump();
     const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
-    const T * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec;
+    const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC;
 
     while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
       {
       UVW coord = srv.getCoord(ipart);
       auto flip = coord.FixW();
       hlp.prep(coord);
-      auto * DUCC0_RESTRICT ptr = hlp.p0w;
       auto v(srv.getVis(ipart));
       if (do_w_gridding) v*=hlp.Wfac();
       if (flip) v=conj(v);
-      for (size_t cu=0; cu<supp; ++cu)
+      auto * DUCC0_RESTRICT ptrr = hlp.p0wr;
+      auto * DUCC0_RESTRICT ptri = hlp.p0wi;
+      for (size_t cu=0; cu<SUPP; ++cu)
         {
         complex<T> tmp(v*ku[cu]);
-        size_t cv=0;
-        for (; cv+3<supp; cv+=4)
-          {
-          ptr[cv  ] += tmp*kv[cv  ];
-          ptr[cv+1] += tmp*kv[cv+1];
-          ptr[cv+2] += tmp*kv[cv+2];
-          ptr[cv+3] += tmp*kv[cv+3];
-          }
+        for (size_t cv=0; cv<NVEC; ++cv)
+          {
+          auto tr = native_simd<T>::loadu(ptrr+cv*hlp.vlen);
+          tr += tmp.real()*kv[cv];
+          tr.storeu(ptrr+cv*hlp.vlen);
+          auto ti = native_simd<T>::loadu(ptri+cv*hlp.vlen);
+          ti += tmp.imag()*kv[cv];
+          ti.storeu(ptri+cv*hlp.vlen);
+          }
-        for (; cv<supp; ++cv)
-          ptr[cv] += tmp*kv[cv];
-        ptr += jump;
+        ptrr += jump;
+        ptri += jump;
         }
       }
     });
   }
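
Note: the Helper rewrite above splits the complex accumulation buffers into separate real and imaginary planes (`rbufr`/`rbufi`, `wbufr`/`wbufi`), so the gridding loops can use plain SIMD loads and stores on contiguous scalars. Reduced to its essence (sketch only, not the ducc0 types):

    #include <complex>
    #include <cstddef>
    #include <vector>

    template<typename T> struct SplitBuffer   // structure-of-arrays layout
      {
      std::vector<T> re, im;                  // was: vector<complex<T>>
      explicit SplitBuffer(size_t n): re(n), im(n) {}
      void add(size_t i, std::complex<T> v)   // each lane touches contiguous T
        { re[i] += v.real(); im[i] += v.imag(); }
      };
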
 
-template<typename T, typename Serv> void grid2x_c
-  (const GridderConfig<T> &gconf, const mav<complex<T>,2> &grid,
-   Serv &srv, double w0=-1, double dw=-1)
+template<typename T, typename Serv> void x2grid_c
+  (const GridderConfig<T> &gconf, Serv &srv, mav<complex<T>,2> &grid,
+   double w0=-1, double dw=-1)
   {
   checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()});
   MR_assert(grid.contiguous(), "grid is not contiguous");
-  size_t supp = gconf.Supp();
+
+  switch(gconf.Supp())
+    {
+    case  1: x2grid_c_helper< 1>(gconf, srv, grid, w0, dw); break;
+    case  2: x2grid_c_helper< 2>(gconf, srv, grid, w0, dw); break;
+    case  3: x2grid_c_helper< 3>(gconf, srv, grid, w0, dw); break;
+    case  4: x2grid_c_helper< 4>(gconf, srv, grid, w0, dw); break;
+    case  5: x2grid_c_helper< 5>(gconf, srv, grid, w0, dw); break;
+    case  6: x2grid_c_helper< 6>(gconf, srv, grid, w0, dw); break;
+    case  7: x2grid_c_helper< 7>(gconf, srv, grid, w0, dw); break;
+    case  8: x2grid_c_helper< 8>(gconf, srv, grid, w0, dw); break;
+    case  9: x2grid_c_helper< 9>(gconf, srv, grid, w0, dw); break;
+    case 10: x2grid_c_helper<10>(gconf, srv, grid, w0, dw); break;
+    case 11: x2grid_c_helper<11>(gconf, srv, grid, w0, dw); break;
+    case 12: x2grid_c_helper<12>(gconf, srv, grid, w0, dw); break;
+    case 13: x2grid_c_helper<13>(gconf, srv, grid, w0, dw); break;
+    case 14: x2grid_c_helper<14>(gconf, srv, grid, w0, dw); break;
+    case 15: x2grid_c_helper<15>(gconf, srv, grid, w0, dw); break;
+    case 16: x2grid_c_helper<16>(gconf, srv, grid, w0, dw); break;
+    default: MR_fail("must not happen");
+    }
+  }
+
+template<size_t SUPP, typename T, typename Serv> [[gnu::hot]] void grid2x_c_helper
+  (const GridderConfig<T> &gconf, const mav<complex<T>,2> &grid,
+   Serv &srv, double w0=-1, double dw=-1)
+  {
+  constexpr size_t vlen=native_simd<T>::size();
+  constexpr size_t NVEC((SUPP+vlen-1)/vlen);
   size_t nthreads = gconf.Nthreads();
   bool do_w_gridding = dw>0;
   vector<std::mutex> locks(gconf.Nu());
@@ -768,29 +800,30 @@ template<typename T, typename Serv> void grid2x_c
     Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
     int jump = hlp.lineJump();
     const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
-    const T * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec;
+    const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC;
 
     while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
       {
       UVW coord = srv.getCoord(ipart);
       auto flip = coord.FixW();
       hlp.prep(coord);
-      complex<T> r = 0;
-      const auto * DUCC0_RESTRICT ptr = hlp.p0r;
-      for (size_t cu=0; cu<supp; ++cu)
+      native_simd<T> rr=0, ri=0;
+      const auto * DUCC0_RESTRICT ptrr = hlp.p0rr;
+      const auto * DUCC0_RESTRICT ptri = hlp.p0ri;
+      for (size_t cu=0; cu<SUPP; ++cu)
         {
-        complex<T> tmp(0);
-        size_t cv=0;
-        for (; cv+3<supp; cv+=4)
-          tmp += ptr[cv  ]*kv[cv  ] + ptr[cv+1]*kv[cv+1]
-               + ptr[cv+2]*kv[cv+2] + ptr[cv+3]*kv[cv+3];
-        for (; cv<supp; ++cv)
-          tmp += ptr[cv]*kv[cv];
-        r += tmp*ku[cu];
-        ptr += jump;
+        native_simd<T> tmpr(0), tmpi(0);
+        for (size_t cv=0; cv<NVEC; ++cv)
+          {
+          tmpr += kv[cv]*native_simd<T>::loadu(ptrr+hlp.vlen*cv);
+          tmpi += kv[cv]*native_simd<T>::loadu(ptri+hlp.vlen*cv);
+          }
+        rr += ku[cu]*tmpr;
+        ri += ku[cu]*tmpi;
+        ptrr += jump;
+        ptri += jump;
         }
+      auto r = complex<T>(reduce(rr, std::plus<>()), reduce(ri, std::plus<>()));
       if (flip) r=conj(r);
       if (do_w_gridding) r*=hlp.Wfac();
       srv.addVis(ipart, r);
@@ -798,6 +831,35 @@ template<typename T, typename Serv> void grid2x_c
     });
   }
 
+template<typename T, typename Serv> void grid2x_c
+  (const GridderConfig<T> &gconf, const mav<complex<T>,2> &grid,
+   Serv &srv, double w0=-1, double dw=-1)
+  {
+  checkShape(grid.shape(), {gconf.Nu(), gconf.Nv()});
+  MR_assert(grid.contiguous(), "grid is not contiguous");
+
+  switch(gconf.Supp())
+    {
+    case  1: grid2x_c_helper< 1>(gconf, grid, srv, w0, dw); break;
+    case  2: grid2x_c_helper< 2>(gconf, grid, srv, w0, dw); break;
+    case  3: grid2x_c_helper< 3>(gconf, grid, srv, w0, dw); break;
+    case  4: grid2x_c_helper< 4>(gconf, grid, srv, w0, dw); break;
+    case  5: grid2x_c_helper< 5>(gconf, grid, srv, w0, dw); break;
+    case  6: grid2x_c_helper< 6>(gconf, grid, srv, w0, dw); break;
+    case  7: grid2x_c_helper< 7>(gconf, grid, srv, w0, dw); break;
+    case  8: grid2x_c_helper< 8>(gconf, grid, srv, w0, dw); break;
+    case  9: grid2x_c_helper< 9>(gconf, grid, srv, w0, dw); break;
+    case 10: grid2x_c_helper<10>(gconf, grid, srv, w0, dw); break;
+    case 11: grid2x_c_helper<11>(gconf, grid, srv, w0, dw); break;
+    case 12: grid2x_c_helper<12>(gconf, grid, srv, w0, dw); break;
+    case 13: grid2x_c_helper<13>(gconf, grid, srv, w0, dw); break;
+    case 14: grid2x_c_helper<14>(gconf, grid, srv, w0, dw); break;
+    case 15: grid2x_c_helper<15>(gconf, grid, srv, w0, dw); break;
+    case 16: grid2x_c_helper<16>(gconf, grid, srv, w0, dw); break;
+    default: MR_fail("must not happen");
+    }
+  }
+
 template<typename T> void apply_global_corrections(const GridderConfig<T> &gconf,
   mav<T,2> &dirty, double dw, bool divide_by_n)
   {
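
Note: the two switch blocks above move the runtime support size into the template parameter SUPP, so every inner loop gets a compile-time trip count (and a [[gnu::hot]] instantiation) per support size. The pattern in isolation (sketch, hypothetical names):

    #include <cstddef>

    template<size_t SUPP> double accum_fixed(const double *k)
      {
      double s = 0;
      for (size_t i=0; i<SUPP; ++i)   // bound known at compile time -> unrolled
        s += k[i];
      return s;
      }

    double accum(const double *k, size_t supp)  // runtime-to-compile-time bridge
      {
      switch(supp)
        {
        case 4: return accum_fixed<4>(k);
        case 8: return accum_fixed<8>(k);
        default: return 0;  // the real code fails hard here instead
        }
      }
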
@@ -860,7 +922,7 @@ template<typename Serv> class WgridHelper
   private:
     Serv &srv;
     double wmin, dw;
-    size_t nplanes, supp;
+    size_t nplanes, supp, nthreads;
     vector<vector<idx_t>> minplane;
     size_t verbosity;
 
@@ -883,11 +945,12 @@ template<typename Serv> class WgridHelper
       }
 
     template<typename T> static void update_idx(vector<T> &v, const vector<T> &add,
-      const vector<T> &del)
+      const vector<T> &del, size_t nthreads)
       {
       MR_assert(v.size()>=del.size(), "must not happen");
       vector<T> res;
       res.reserve((v.size()+add.size())-del.size());
+#if 0
       auto iin=v.begin(), ein=v.end();
       auto iadd=add.begin(), eadd=add.end();
       auto irem=del.begin(), erem=del.end();
@@ -904,16 +967,51 @@ template<typename Serv> class WgridHelper
       MR_assert(irem==erem, "must not happen");
       while(iadd!=eadd)
         res.push_back(*(iadd++));
+#else
+      if (v.empty()) //special case
+        {
+        MR_assert(del.empty(), "must not happen");
+        for (auto x: add)
+          res.push_back(x);
+        }
+      else
+        {
+        res.resize((v.size()+add.size())-del.size());
+        execParallel(nthreads, [&](Scheduler &sched)
+          {
+          auto tid = sched.thread_num();
+          auto [lo, hi] = calcShare(nthreads, tid, v.size());
+          if (lo==hi) return; // if interval is empty, do nothing
+          auto iin=v.begin()+lo, ein=v.begin()+hi;
+          auto iadd = (iin==v.begin()) ? add.begin() : lower_bound(add.begin(), add.end(), *iin);
+          auto eadd = (ein==v.end()) ? add.end() : lower_bound(add.begin(), add.end(), *ein);
+          auto irem = (iin==v.begin()) ? del.begin() : lower_bound(del.begin(), del.end(), *iin);
+          auto erem = (ein==v.end()) ? del.end() : lower_bound(del.begin(), del.end(), *ein);
+          auto iout = res.begin()+lo-(irem-del.begin())+(iadd-add.begin());
+          while(iin!=ein)
+            {
+            if ((irem!=erem) && (*iin==*irem))
+              { ++irem; ++iin; } // skip removed entry
+            else if ((iadd!=eadd) && (*iadd<*iin))
+              *(iout++) = *(iadd++); // add new entry
+            else
+              *(iout++) = *(iin++);
+            }
+          MR_assert(irem==erem, "must not happen");
+          while(iadd!=eadd)
+            *(iout++) = *(iadd++);
+          });
+        }
+#endif
       MR_assert(res.size()==(v.size()+add.size())-del.size(), "must not happen");
       v.swap(res);
       }
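
Note: the parallelized update_idx above works because the output position of any input slice is computable up front: element `lo` of `v` lands at `lo` minus the deletions preceding it plus the additions preceding it, both counted via lower_bound on the sorted del/add vectors — exactly the `iout` initialization above. The invariant in isolation (sketch, assuming sorted int vectors):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    inline size_t out_pos(const std::vector<int> &v, const std::vector<int> &add,
      const std::vector<int> &del, size_t lo)
      {
      auto nadd = std::lower_bound(add.begin(), add.end(), v[lo]) - add.begin();
      auto ndel = std::lower_bound(del.begin(), del.end(), v[lo]) - del.begin();
      return lo - ndel + nadd;  // matches res.begin()+lo-(irem-...)+(iadd-...)
      }
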
 
   public:
     WgridHelper(const GridderConfig<T> &gconf, Serv &srv_, size_t verbosity_)
-      : srv(srv_), supp(gconf.Supp()), verbosity(verbosity_), curplane(-1)
+      : srv(srv_), supp(gconf.Supp()), nthreads(gconf.Nthreads()),
+        verbosity(verbosity_), curplane(-1)
       {
       size_t nvis = srv.Nvis();
-      size_t nthreads = gconf.Nthreads();
       double wmax;
 
       wminmax(srv, wmin, wmax);
@@ -942,22 +1040,37 @@ template<typename Serv> class WgridHelper
         }
 #else
       // more efficient: precalculate final vector sizes and avoid reallocations
-      vector<size_t> cnt(nplanes,0);
-      for(size_t ipart=0; ipart<nvis; ++ipart)
+      vector<idx_t> p0(nvis);
+      mav<idx_t,2> cnt({nthreads, nplanes+16}); // safety distance against false sharing
+      execStatic(nvis, nthreads, 0, [&](Scheduler &sched)
         {
-        int plane0 = max(0,int(1+(abs(srv.getCoord(ipart).w)-(0.5*supp*dw)-wmin)/dw));
-        ++cnt[plane0];
+        auto tid=sched.thread_num();
+        while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
+          {
+          p0[i] = idx_t(max(0,int(1+(abs(srv.getCoord(i).w)-(0.5*supp*dw)-wmin)/dw)));
+          ++cnt.v(tid, p0[i]);
+          }
+        });
+      // turn per-thread counts into starting offsets inside each plane
+      for (size_t p=0; p<nplanes; ++p)
+        {
+        idx_t ofs=0;
+        for (size_t t=0; t<nthreads; ++t)
+          {
+          auto tmp = cnt(t,p);
+          cnt.v(t,p) = ofs;
+          ofs += tmp;
+          }
+        minplane[p].resize(ofs);
         }
-      for (size_t j=0; j<nplanes; ++j)
-        minplane[j].resize(cnt[j]);
-
-      vector<size_t> ofs(nplanes, 0);
-      for (size_t ipart=0; ipart<nvis; ++ipart)
-        {
-        int plane0 = max(0,int(1+(abs(srv.getCoord(ipart).w)-(0.5*supp*dw)-wmin)/dw));
-        minplane[plane0][ofs[plane0]++]=idx_t(ipart);
-        }
+      execStatic(nvis, nthreads, 0, [&](Scheduler &sched)
+        {
+        auto tid=sched.thread_num();
+        while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
+          minplane[p0[i]][cnt.v(tid,p0[i])++] = idx_t(i);
+        });
@@ ... @@ template<typename Serv> class WgridHelper
     bool advance()
       {
       if (++curplane>=int(nplanes)) return false;
-      update_idx(subidx, minplane[curplane], curplane>=int(supp) ? minplane[curplane-supp] : vector<idx_t>());
+      update_idx(subidx, minplane[curplane], curplane>=int(supp) ? minplane[curplane-supp] : vector<idx_t>(), nthreads);
       if (verbosity>1)
         cout << "Working on plane " << curplane << " containing " << subidx.size()
              << " visibilities" << endl;
@@ -1058,23 +1171,14 @@ template<typename T, typename Serv> void dirty2x(
     }
   }
 
-void calc_share(size_t nshares, size_t myshare, size_t nwork, size_t &lo,
-  size_t &hi)
-  {
-  size_t nbase = nwork/nshares;
-  size_t additional = nwork%nshares;
-  lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
-  hi = lo+nbase+(myshare<additional);
-  }
-
 template<typename T> vector<idx_t> getWgtIndices(const Baselines &baselines,
   const GridderConfig<T> &gconf, const mav<T,2> &wgt,
   const mav<complex<T>,2> &ms)
   {
   size_t nrow=baselines.Nrows(),
          nchan=baselines.Nchannels(),
-         nsafe=gconf.Nsafe();
+         nsafe=gconf.Nsafe(),
+         nthreads=gconf.Nthreads();
   bool have_wgt=wgt.size()!=0;
   if (have_wgt) checkShape(wgt.shape(),{nrow,nchan});
   bool have_ms=ms.size()!=0;
@@ -1082,13 +1186,18 @@ template<typename T> vector<idx_t> getWgtIndices(const Baselines &baselines,
   constexpr int side=1<<logsquare;
   size_t nbu = (gconf.Nu()+1+side-1) >> logsquare,
          nbv = (gconf.Nv()+1+side-1) >> logsquare;
-  vector<idx_t> acc(nbu*nbv+1,0);
+  mav<idx_t,2> acc({nthreads, (nbu*nbv+16)}); // the 16 is safety distance to avoid false sharing
   vector<idx_t> tmp(nrow*nchan);
-  for (idx_t irow=0, idx=0; irow<nrow; ++irow)
-    for (idx_t ichan=0; ichan<nchan; ++ichan, ++idx)
-      if (!have_ms || (norm(ms(irow,ichan))!=0))
+  execStatic(nrow, nthreads, 0, [&](Scheduler &sched)
+    {
+    idx_t tid = sched.thread_num();
+    while (auto rng=sched.getNext()) for(auto irow=idx_t(rng.lo); irow<idx_t(rng.hi); ++irow)
+      for (idx_t ichan=0, idx=irow*nchan; ichan<nchan; ++ichan, ++idx)
+        if (!have_ms || (norm(ms(irow,ichan))!=0))
         {
         auto uvw = baselines.effectiveCoord(RowChan{irow,idx_t(ichan)});
         if (uvw.w<0) uvw.Flip();
         double u, v;
         int iu0, iv0;
         gconf.getpix(uvw.u, uvw.v, u, v, iu0, iv0);
         iu0 = (iu0+nsafe)>>logsquare;
         iv0 = (iv0+nsafe)>>logsquare;
-        ++acc[nbv*iu0 + iv0 + 1];
         tmp[idx] = nbv*iu0 + iv0;
+        ++acc.v(tid, tmp[idx]);
         }
       else
         tmp[idx] = ~idx_t(0);
+      }
+    });
 
-  for (size_t i=1; i<acc.size(); ++i)
-    acc[i] += acc[i-1];
-
-  vector<idx_t> res(acc.back());
-  for (size_t irow=0, idx=0; irow<nrow; ++irow)
-    for (size_t ichan=0; ichan<nchan; ++ichan, ++idx)
-      if (tmp[idx]!=(~idx_t(0)))
-        res[acc[tmp[idx]]++] = idx;
+  // prefix sum over per-bucket, per-thread counts
+  idx_t offset=0;
+  for (idx_t ibucket=0; ibucket<idx_t(nbu*nbv); ++ibucket)
+    for (idx_t tid=0; tid<idx_t(nthreads); ++tid)
+      {
+      auto tmp2 = acc(tid, ibucket);
+      acc.v(tid, ibucket) = offset;
+      offset += tmp2;
+      }
+
+  vector<idx_t> res(offset);
+  execStatic(nrow, nthreads, 0, [&](Scheduler &sched)
+    {
+    idx_t tid = sched.thread_num();
+    while (auto rng=sched.getNext()) for(auto irow=idx_t(rng.lo); irow<idx_t(rng.hi); ++irow)
+      for (idx_t ichan=0, idx=irow*nchan; ichan<nchan; ++ichan, ++idx)
+        if (tmp[idx]!=(~idx_t(0)))
+          res[acc.v(tid, tmp[idx])++] = idx;
+    });
   return res;
   }
diff --git a/src/ducc0/infra/misc_utils.h b/src/ducc0/infra/misc_utils.h
new file mode 100644
--- /dev/null
+++ b/src/ducc0/infra/misc_utils.h
@@ ... @@
+#ifndef DUCC0_MISC_UTILS_H
+#define DUCC0_MISC_UTILS_H
+
+#include <cstddef>
+#include <utility>
+
+namespace ducc0 {
+
+namespace detail_misc_utils {
+
+using namespace std;
+
+template<typename T> auto calcShare(size_t nshares, size_t myshare,
+  const T &begin, const T &end)
+  {
+  auto nwork = end-begin;
+  auto nbase = nwork/nshares;
+  auto additional = nwork%nshares;
+  auto lo = begin + (myshare*nbase + ((myshare<additional) ? myshare : additional));
+  auto hi = lo+nbase+(myshare<additional);
+  return make_pair(lo, hi);
+  }
+
+template<typename T> auto calcShare(size_t nshares, size_t myshare, const T &end)
+  { return calcShare(nshares, myshare, T(0), end); }
+
+}
+
+using detail_misc_utils::calcShare;
+
+}
+
+#endif
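
Note: calcShare replaces the calc_share helper removed from gridder_cxx.h above and returns the half-open work interval of one share; the first nwork%nshares shares get one extra item. For example, 10 items over 3 shares yield [0,4), [4,7), [7,10). Usage sketch (assuming the new header is on the include path):

    #include <cstddef>
    #include "ducc0/infra/misc_utils.h"

    void demo()
      {
      for (size_t t=0; t<3; ++t)
        {
        auto [lo, hi] = ducc0::calcShare(3, t, size_t(10));
        // t=0 -> [0,4), t=1 -> [4,7), t=2 -> [7,10)
        (void)lo; (void)hi;
        }
      }
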
diff --git a/src/ducc0/infra/threading.cc b/src/ducc0/infra/threading.cc
index 8948fb3ebafaa3dc68275a1eafd821e41eabbd62..39d4b20265d380b0dd716f3bd365f5985b651320 100644
--- a/src/ducc0/infra/threading.cc
+++ b/src/ducc0/infra/threading.cc
@@ -29,6 +29,7 @@
 #include <queue>
 #include <atomic>
 #include <vector>
+#include <memory>
 #if __has_include(<pthread.h>)
 #include <pthread.h>
 #endif
@@ -40,9 +41,7 @@ namespace detail_threading {
 
 #ifndef DUCC0_NO_THREADING
 
-static const size_t max_threads_ = std::max<size_t>(1, std::thread::hardware_concurrency());
-
-std::atomic<size_t> default_nthreads_(max_threads_);
+std::atomic<size_t> default_nthreads_(std::max<size_t>(1, std::thread::hardware_concurrency()));
 
 size_t get_default_nthreads()
   { return default_nthreads_; }
@@ -50,8 +49,6 @@ size_t get_default_nthreads()
 void set_default_nthreads(size_t new_default_nthreads)
   { default_nthreads_ = std::max<size_t>(1, new_default_nthreads); }
 
-size_t max_threads() { return max_threads_; }
-
 class latch
   {
     std::atomic<size_t> num_left_;
@@ -155,10 +152,10 @@ class thread_pool
       threads_(nthreads)
       { create_threads(); }
 
-    thread_pool(): thread_pool(max_threads_) {}
-
     ~thread_pool() { shutdown(); }
 
+    size_t size() const { return threads_.size(); }
+
     void submit(std::function<void()> work)
       {
       work_queue_.push(move(work));
@@ -179,9 +176,12 @@ class thread_pool
       }
   };
 
-inline thread_pool &get_pool()
+thread_pool &get_pool(size_t nthreads=0)
   {
-  static thread_pool pool;
+  static std::unique_ptr<thread_pool> pool;
+  if ((!pool) && (nthreads==0)) nthreads=default_nthreads_;
+  if ((!pool) || ((nthreads!=0) && (nthreads!=pool->size()))) // resize
+    pool = std::make_unique<thread_pool>(nthreads);
 #if __has_include(<pthread.h>)
   static std::once_flag f;
   call_once(f,
@@ -194,9 +194,12 @@ inline thread_pool &get_pool()
       });
 #endif
 
-  return pool;
+  return *pool;
   }
 
+void set_pool_size(size_t new_pool_size)
+  { get_pool(new_pool_size); }
+
 class Distribution
   {
   private:
@@ -317,7 +320,7 @@ void Distribution::thread_map(std::function<void(Scheduler &)> f)
     return;
     }
 
-  auto & pool = get_pool();
+  auto &pool = get_pool();
   latch counter(nthreads_);
   std::exception_ptr ex;
   std::mutex ex_mut;
diff --git a/src/ducc0/infra/threading.h b/src/ducc0/infra/threading.h
index 043de513a8056a626796da981e1acb376ecdcbda..a4b454577e3d1b9c697be7bfdcffbb1199d53a1b 100644
--- a/src/ducc0/infra/threading.h
+++ b/src/ducc0/infra/threading.h
@@ -47,10 +47,11 @@ class Scheduler
     virtual Range getNext() = 0;
   };
 
-size_t max_threads();
 void set_default_nthreads(size_t new_default_nthreads);
 size_t get_default_nthreads();
 
+void set_pool_size(size_t new_pool_size);
+
 void execSingle(size_t nwork,
   std::function<void(Scheduler &)> func);
 void execStatic(size_t nwork, size_t nthreads, size_t chunksize,
@@ -63,7 +64,7 @@ void execParallel(size_t nthreads, std::function<void(Scheduler &)> func);
 
 } // end of namespace detail_threading
 
-using detail_threading::max_threads;
+using detail_threading::set_pool_size;
 using detail_threading::get_default_nthreads;
 using detail_threading::set_default_nthreads;
 using detail_threading::Scheduler;
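
Note: with max_threads() gone, the pool is created lazily at first use and can now be rebuilt with a different worker count; set_pool_size(n) simply forces that path through get_pool(n). Usage sketch (assuming the ducc0 headers are on the include path):

    #include "ducc0/infra/threading.h"

    int main()
      {
      ducc0::set_pool_size(4);         // recreate the pool with 4 workers
      ducc0::set_default_nthreads(4);  // and make 4 the default for exec*()
      }
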
diff --git a/src/ducc0/math/fft.h b/src/ducc0/math/fft.h
index fbfb97e017834abd565962516fa91e53c2449510..18b6337de421953538f562daa3083a3d5979483b 100644
--- a/src/ducc0/math/fft.h
+++ b/src/ducc0/math/fft.h
@@ -49,6 +49,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 #include <complex>
 #include "ducc0/infra/threading.h"
+#include "ducc0/infra/misc_utils.h"
 #include "ducc0/infra/simd.h"
 #include "ducc0/infra/mav.h"
 #ifndef DUCC0_NO_THREADING
@@ -117,7 +118,7 @@ struct util // hack to avoid duplicate symbols
     size_t parallel = size / (info.shape(axis) * vlen);
     if (info.shape(axis) < 1000)
       parallel /= 4;
-    size_t max_threads = (nthreads==0) ? ducc0::max_threads() : nthreads;
+    size_t max_threads = (nthreads==0) ? ducc0::get_default_nthreads() : nthreads;
     return std::max(size_t(1), std::min(parallel, max_threads));
     }
 #endif
@@ -437,10 +438,7 @@ template<size_t N> class multi_iter
       if (nshares==1) return;
       if (nshares==0) throw std::runtime_error("can't run with zero threads");
       if (myshare>=nshares) throw std::runtime_error("impossible share requested");
-      size_t nbase = rem/nshares;
-      size_t additional = rem%nshares;
-      size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
-      size_t hi = lo+nbase+(myshare<additional);
+      auto [lo, hi] = calcShare(nshares, myshare, rem);
diff --git a/src/ducc0/math/gridding_kernel.h b/src/ducc0/math/gridding_kernel.h
--- a/src/ducc0/math/gridding_kernel.h
+++ b/src/ducc0/math/gridding_kernel.h
@@ ... @@ template<typename T> class HornerKernel: public GriddingKernel<T>
     size_t W, D, nvec;
     vector<native_simd<T>> coeff;
+    const T *scoeff;
+    size_t sstride;
     void (HornerKernel::* evalfunc) (T, native_simd<T> *) const;
     T (HornerKernel::* evalsinglefunc) (T) const;
@@ -182,11 +184,10 @@ template<typename T> class HornerKernel: public GriddingKernel<T>
       {
       auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5))));
       x = (x+1)*W-2*nth-1;
-      auto i = nth/vlen;
-      auto imod = nth%vlen;
-      auto tval = coeff[i][imod];
+      auto ptr = scoeff+nth;
+      auto tval = *ptr;
       for (size_t j=1; j<=DEG; ++j)
-        tval = tval*x + coeff[j*nvec+i][imod];
+        tval = tval*x + ptr[j*sstride];
       return tval;
       }
 
@@ -206,11 +207,10 @@ template<typename T> class HornerKernel: public GriddingKernel<T>
       {
       auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5))));
       x = (x+1)*W-2*nth-1;
-      auto i = nth/vlen;
-      auto imod = nth%vlen;
-      auto tval = coeff[i][imod];
+      auto ptr = scoeff+nth;
+      auto tval = *ptr;
       for (size_t j=1; j<=D; ++j)
-        tval = tval*x + coeff[j*nvec+i][imod];
+        tval = tval*x + ptr[j*sstride];
       return tval;
       }
 
@@ -274,7 +274,8 @@ template<typename T> class HornerKernel: public GriddingKernel<T>
     HornerKernel(size_t W_, size_t D_, const function<double(double)> &func,
       const KernelCorrection &corr_)
       : W(W_), D(D_), nvec((W+vlen-1)/vlen),
-        coeff(makeCoeff(W_, D_, func)), corr(corr_)
+        coeff(makeCoeff(W_, D_, func)), scoeff(reinterpret_cast<const T *>(&coeff[0])),
+        sstride(vlen*nvec), corr(corr_)
       { wire_eval(); }
 
     virtual size_t support() const { return W; }
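
Note on the HornerKernel change above: coeff stores nvec SIMD vectors per polynomial degree contiguously, so viewed as scalars the degree-j coefficient of subinterval nth lives at scoeff[nth + j*sstride] with sstride = vlen*nvec; eval_single can then walk a flat array instead of doing the div/mod lane addressing. The access pattern with plain arrays (sketch, not the ducc0 types):

    #include <cstddef>

    template<typename T> T horner_single
      (const T *scoeff, size_t sstride, size_t nth, size_t D, T x)
      {
      const T *ptr = scoeff + nth;       // column for this subinterval
      T tval = *ptr;
      for (size_t j=1; j<=D; ++j)
        tval = tval*x + ptr[j*sstride];  // same stride walk as eval_single
      return tval;
      }
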