From f78699ac21170da40dbbf05735134e0cc59199f2 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 30 Oct 2019 14:06:29 +0100 Subject: [PATCH] try to make index calculation parallel --- gridder_cxx.h | 190 ++++++++++++++++++++++++++++++++++++++++++++++- nifty_gridder.cc | 15 ++++ 2 files changed, 201 insertions(+), 4 deletions(-) diff --git a/gridder_cxx.h b/gridder_cxx.h index 14a013a..06f1f15 100644 --- a/gridder_cxx.h +++ b/gridder_cxx.h @@ -1618,6 +1618,20 @@ void fillIdx(const Baselines &baselines, res[acc[tmp[idx++]]++] = baselines.getIdx(irow, ichan); } } +class SimpleTimer + { + private: + using clock = std::chrono::steady_clock; + clock::time_point starttime; + + public: + SimpleTimer() + : starttime(clock::now()) {} + double operator()() const + { + return std::chrono::duration(clock::now() - starttime).count(); + } + }; template vector getWgtIndices(const Baselines &baselines, const GridderConfig &gconf, const const_mav &wgt) @@ -1631,9 +1645,19 @@ template vector getWgtIndices(const Baselines &baselines, size_t nbu = (gconf.Nu()+1+side-1) >> logsquare, nbv = (gconf.Nv()+1+side-1) >> logsquare; vector acc(nbu*nbv+1, 0); - vector tmp(nrow*nchan); + vector tmp(nrow*nchan,~idx_t(0)); - for (idx_t irow=0, idx=0; irow lacc(nbu*nbv+1, 0); + + for (idx_t irow=lo, idx=lo*nchan; irow vector getWgtIndices(const Baselines &baselines, gconf.getpix(uvw.u, uvw.v, u, v, iu0, iv0); iu0 = (iu0+nsafe)>>logsquare; iv0 = (iv0+nsafe)>>logsquare; - ++acc[nbv*iu0 + iv0 + 1]; + ++lacc[nbv*iu0 + iv0 + 1]; tmp[idx++] = nbv*iu0 + iv0; } - +#pragma omp barrier +#pragma omp critical(xyz) + for (size_t i=0; i res(acc.back()); for (size_t irow=0, idx=0; irow void dirty2ms(const const_mav &uvw, dirty2x(gconf, dirty, makeMsServ(baselines,idx2,ms,wgt), do_wstacking, verbosity); } +static const uint16_t utab[] = { +#define Z(a) 0x##a##0, 0x##a##1, 0x##a##4, 0x##a##5 +#define Y(a) Z(a##0), Z(a##1), Z(a##4), Z(a##5) +#define X(a) Y(a##0), Y(a##1), Y(a##4), Y(a##5) +X(0),X(1),X(4),X(5) +#undef X +#undef Y +#undef Z +}; + +uint32_t coord2morton2D_32 (uint32_t x, uint32_t y) + { + typedef uint32_t I; + return (I)(utab[x&0xff]) | ((I)(utab[(x>>8)&0xff])<<16) + | ((I)(utab[y&0xff])<<1) | ((I)(utab[(y>>8)&0xff])<<17); + } + +static const uint8_t m2p2D_1[4][4] = { +{ 4, 1, 11, 2},{0,15, 5, 6},{10,9,3,12},{14,7,13,8}}; +static uint8_t m2p2D_3[4][64]; +static const uint8_t p2m2D_1[4][4] = { +{ 4, 1, 3, 10},{0,6,7,13},{15,9,8,2},{11,14,12,5}}; +static uint8_t p2m2D_3[4][64]; +static int peano2d_done=0; + +static void init_peano2d (void) + { + peano2d_done=1; + + for (int d=0; d<4; ++d) + for (uint32_t p=0; p<64; ++p) + { + unsigned rot = d; + uint32_t v = p<<26; + uint32_t res = 0; + for (int i=0; i<3; ++i) + { + unsigned tab=m2p2D_1[rot][v>>30]; + v<<=2; + res = (res<<2) | (tab&0x3); + rot = tab>>2; + } + m2p2D_3[d][p]=res|(rot<<6); + } + for (int d=0; d<4; ++d) + for (uint32_t p=0; p<64; ++p) + { + unsigned rot = d; + uint32_t v = p<<26; + uint32_t res = 0; + for (int i=0; i<3; ++i) + { + unsigned tab=p2m2D_1[rot][v>>30]; + v<<=2; + res = (res<<2) | (tab&0x3); + rot = tab>>2; + } + p2m2D_3[d][p]=res|(rot<<6); + } + } + +uint32_t morton2peano2D_32(uint32_t v, int bits) + { + if (!peano2d_done) init_peano2d(); + unsigned rot = 0; + uint32_t res = 0; + v<<=32-(bits<<1); + int i=0; + for (; i>26]; + v<<=6; + res = (res<<6) | (tab&0x3fu); + rot = tab>>6; + } + for (; i>30]; + v<<=2; + res = (res<<2) | (tab&0x3); + rot = tab>>2; + } + return res; + } + +template class IdxComp__ + { + private: + It begin; + Comp comp; + public: + IdxComp__ (It begin_, Comp comp_): begin(begin_), comp(comp_) {} + bool operator() (std::size_t a, std::size_t b) const + { return comp(*(begin+a),*(begin+b)); } + }; + +/*! Performs an indirect sort on the supplied iterator range and returns in + \a idx a \a vector containing the indices of the smallest, second smallest, + third smallest, etc. element, according to \a comp. */ +template + inline void buildIndex (It begin, It end, std::vector &idx, Comp comp) + { + using namespace std; + T2 num=end-begin; + idx.resize(num); + for (T2 i=0; i(begin,comp)); + } + +/*! Performs an indirect sort on the supplied iterator range and returns in + \a idx a \a vector containing the indices of the smallest, second smallest, + third smallest, etc. element. */ +template inline void buildIndex (It begin, It end, + std::vector &idx) + { + using namespace std; + typedef typename iterator_traits::value_type T; + buildIndex(begin,end,idx,less()); + } + +void getRowOrdering(const const_mav &uvw, const mav &idx) + { + myassert(uvw.shape(1)==3, "second dimension of uvw array must be 3"); + size_t nrow = uvw.shape(0); + myassert(nrow>0, "zero-sized array"); + myassert(nrow==idx.shape(0), "dimension mismatch"); + double umin=uvw(0,0), umax=uvw(0,0), vmin=uvw(0,1), vmax=uvw(0,1); + for (size_t i=1; i peano(nrow); + for (size_t i=0; i tidx; + buildIndex(peano.begin(), peano.end(), tidx); + for (size_t i=0; i(uvw_, "uvw"); + auto uvw2 = make_const_mav<2>(uvw); + auto res = makeArray({uvw2.shape(0)}); + auto res2 = make_mav<1>(res); + { + py::gil_scoped_release release; + getRowOrdering(uvw2, res2); + } + return res; + } + } // unnamed namespace PYBIND11_MODULE(nifty_gridder, m) @@ -1117,4 +1130,6 @@ PYBIND11_MODULE(nifty_gridder, m) m.def("dirty2ms", &Pydirty2ms, dirty2ms_DS, "uvw"_a, "freq"_a, "dirty"_a, "wgt"_a=None, "pixsize_x"_a, "pixsize_y"_a, "epsilon"_a, "do_wstacking"_a=false, "nthreads"_a=1, "verbosity"_a=0); + + m.def("getRowOrdering", &PygetRowOrdering, "uvw"_a); } -- GitLab