Commit 42892ca2 authored by Martin Reinecke

Merge branch 'improve_scaling' into 'master'

Improve scaling

See merge request !25
parents 93bc5bec 3533fd55
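
The diff below tackles two scaling bottlenecks: flushing per-thread gridding buffers into the shared uv-grid is now guarded by one lock per grid row in u instead of a single critical section, and the list of visibilities contributing to each w-plane is built in parallel and updated incrementally instead of being rescanned for every plane. As a rough standalone illustration of the per-row locking idea (not part of the commit; the grid size and the deposit loop are invented for the demo, only the Lock wrapper mirrors the class added below):

#include <cstddef>
#include <cstdio>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif

using namespace std;

// Thin wrapper around an OpenMP lock; a no-op when OpenMP is unavailable
// (condensed version of the Lock class introduced in the diff).
class Lock
  {
#ifdef _OPENMP
  private:
    omp_lock_t lck;
  public:
    Lock() { omp_init_lock(&lck); }
    ~Lock() { omp_destroy_lock(&lck); }
    Lock(const Lock &) = delete;
    Lock &operator=(const Lock &) = delete;
    void lock() { omp_set_lock(&lck); }
    void unlock() { omp_unset_lock(&lck); }
#else
  public:
    void lock() {}
    void unlock() {}
#endif
  };

int main()
  {
  const size_t nu=64, nv=64;     // toy grid dimensions, invented
  vector<double> grid(nu*nv, 0.);
  vector<Lock> locks(nu);        // one lock per grid row in u

  // Without OpenMP the pragma is ignored and the block runs once.
#pragma omp parallel
    {
    for (size_t i=0; i<1000; ++i)
      {
      size_t iu = i%nu;          // row touched by this "flush"
      locks[iu].lock();          // contention only within one row
      for (size_t iv=0; iv<nv; ++iv)
        grid[iu*nv+iv] += 1.;
      locks[iu].unlock();
      }
    }

  double sum=0.;
  for (double x : grid) sum += x;
  printf("total deposited weight: %g\n", sum);
  return 0;
  }

In the diff, the same pattern appears in Helper::dump(), which locks locks[idxu] around every grid row it flushes.
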
@@ -29,6 +29,10 @@
#include <vector>
#include <array>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
#include "pocketfft_hdronly.h"
#if defined(__GNUC__)
@@ -720,6 +724,38 @@ class GridderConfig
constexpr int logsquare=4;
+#ifdef _OPENMP
+class Lock
+{
+private:
+omp_lock_t lck;
+Lock(const Lock &) = delete;
+Lock& operator=(const Lock &) = delete;
+public:
+Lock() { omp_init_lock(&lck); }
+~Lock() { omp_destroy_lock(&lck); }
+void lock() { omp_set_lock(&lck); }
+void unlock() { omp_unset_lock(&lck); }
+};
+int my_max_threads() { return omp_get_max_threads(); }
+int my_num_threads() { return omp_get_num_threads(); }
+int my_thread_num() { return omp_get_thread_num(); }
+#else
+class Lock
+{
+public:
+Lock() {}
+~Lock() {}
+void lock() {}
+void unlock() {}
+};
+int my_max_threads() { return 1; }
+int my_num_threads() { return 1; }
+int my_thread_num() { return 0; }
+#endif
template<typename T, typename T2=complex<T>> class Helper
{
private:
@@ -737,6 +773,7 @@ template<typename T, typename T2=complex<T>> class Helper
double w0, xdw;
size_t nexp;
size_t nvecs;
+vector<Lock> &locks;
void dump() const
{
@@ -749,11 +786,13 @@ template<typename T, typename T2=complex<T>> class Helper
for (int iu=0; iu<su; ++iu)
{
int idxv = idxv0;
+locks[idxu].lock();
for (int iv=0; iv<sv; ++iv)
{
grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv];
if (++idxv>=nv) idxv=0;
}
+locks[idxu].unlock();
if (++idxu>=nu) idxu=0;
}
}
@@ -781,7 +820,7 @@ template<typename T, typename T2=complex<T>> class Helper
T kernel[64] ALIGNED(64);
Helper(const GridderConfig &gconf_, const T2 *grid_r_, T2 *grid_w_,
-double w0_=-1, double dw_=-1)
+vector<Lock> &locks_, double w0_=-1, double dw_=-1)
: gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
supp(gconf.Supp()), beta(T(gconf.Beta())), grid_r(grid_r_),
grid_w(grid_w_), su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
@@ -792,7 +831,8 @@ template<typename T, typename T2=complex<T>> class Helper
w0(w0_),
xdw(T(1)/dw_),
nexp(2*supp + do_w_gridding),
-nvecs(VLEN<T>::val*((nexp+VLEN<T>::val-1)/VLEN<T>::val))
+nvecs(VLEN<T>::val*((nexp+VLEN<T>::val-1)/VLEN<T>::val)),
+locks(locks_)
{}
~Helper() { if (grid_w) dump(); }
@@ -945,10 +985,11 @@ template<typename T, typename Serv> void x2grid_c
size_t supp = gconf.Supp();
size_t nthreads = gconf.Nthreads();
bool do_w_gridding = dw>0;
+vector<Lock> locks(gconf.Nu());
#pragma omp parallel num_threads(nthreads)
{
-Helper<T> hlp(gconf, nullptr, grid.data(), w0, dw);
+Helper<T> hlp(gconf, nullptr, grid.data(), locks, w0, dw);
int jump = hlp.lineJump();
const T * RESTRICT ku = hlp.kernel;
const T * RESTRICT kv = hlp.kernel+supp;
@@ -1005,11 +1046,12 @@ template<typename T, typename Serv> void grid2x_c
size_t supp = gconf.Supp();
size_t nthreads = gconf.Nthreads();
bool do_w_gridding = dw>0;
+vector<Lock> locks(gconf.Nu());
// Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
-Helper<T> hlp(gconf, grid.data(), nullptr, w0, dw);
+Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
int jump = hlp.lineJump();
const T * RESTRICT ku = hlp.kernel;
const T * RESTRICT kv = hlp.kernel+supp;
@@ -1068,11 +1110,12 @@ template<typename T> void apply_holo
checkShape(ogrid.shape(), grid.shape());
ogrid.fill(0);
size_t supp = gconf.Supp();
+vector<Lock> locks(gconf.Nu());
// Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
-Helper<T> hlp(gconf, grid.data(), ogrid.data());
+Helper<T> hlp(gconf, grid.data(), ogrid.data(), locks);
int jump = hlp.lineJump();
const T * RESTRICT ku = hlp.kernel;
const T * RESTRICT kv = hlp.kernel+supp;
@@ -1138,6 +1181,7 @@ template<typename T> void get_correlations
myassert(size_t(abs(dv))<supp, "|dv| must be smaller than Supp");
size_t nthreads = gconf.Nthreads();
ogrid.fill(0);
+vector<Lock> locks(gconf.Nu());
size_t u0, u1, v0, v1;
if (du>=0)
@@ -1152,7 +1196,7 @@ template<typename T> void get_correlations
// Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
-Helper<T,T> hlp(gconf, nullptr, ogrid.data());
+Helper<T,T> hlp(gconf, nullptr, ogrid.data(), locks);
int jump = hlp.lineJump();
const T * RESTRICT ku = hlp.kernel;
const T * RESTRICT kv = hlp.kernel+supp;
@@ -1240,24 +1284,55 @@ template<typename Serv> void wminmax(const GridderConfig &gconf,
}
}
+template<typename T> void update_idx(vector<T> &v, vector<T> &vold,
+const vector<T> &add, const vector<T> &del)
+{
+vold.resize(0);
+vold.reserve((v.size()+add.size())-del.size());
+auto iin=v.begin(), ein=v.end();
+auto iadd=add.begin(), eadd=add.end();
+auto irem=del.begin(), erem=del.end();
+while(iin!=ein)
+{
+if (irem!=erem)
+{
+while ((iin!=ein) && (*iin==*irem))
+{ ++irem; ++iin; } // skip the entries to be removed
+if (iin==ein) break;
+}
+if (iadd!=eadd)
+while ((iadd!=eadd) && (*iadd<*iin))
+vold.push_back(*(iadd++));
+vold.push_back(*(iin++));
+}
+while(iadd!=eadd)
+vold.push_back(*(iadd++));
+v.swap(vold);
+}
template<typename Serv> void wstack_common(
const GridderConfig &gconf, const Serv &srv, double &wmin,
double &dw, size_t &nplanes, vector<size_t> &nvis_plane,
-vector<int> &minplane, size_t verbosity)
+vector<vector<uint32_t>> &minplane, size_t verbosity)
{
size_t nvis = srv.Nvis();
size_t nthreads = gconf.Nthreads();
double wmax;
wminmax(gconf, srv, wmin, wmax);
+#ifdef _OPENMP
if (verbosity>0) cout << "Using " << nthreads << " threads" << endl;
+#else
+if (verbosity>0) cout << "Using single-threaded mode" << endl;
+#endif
if (verbosity>0) cout << "W range: " << wmin << " to " << wmax << endl;
double x0 = -0.5*gconf.Nxdirty()*gconf.Pixsize_x(),
y0 = -0.5*gconf.Nydirty()*gconf.Pixsize_y();
double nmin = sqrt(max(1.-x0*x0-y0*y0,0.))-1.;
dw = 0.25/abs(nmin);
nplanes = size_t((wmax-wmin)/dw+2);
-dw = (wmax-wmin)/(nplanes-1);
+dw = (1.+1e-13)*(wmax-wmin)/(nplanes-1);
auto supp = gconf.Supp();
wmin -= (0.5*supp-1)*dw;
@@ -1266,21 +1341,49 @@ template<typename Serv> void wstack_common(
if (verbosity>0) cout << "Kernel support: " << supp << endl;
if (verbosity>0) cout << "nplanes: " << nplanes << endl;
nvis_plane.resize(nplanes,0);
-minplane.resize(nvis);
+minplane.resize(nplanes);
+vector<size_t> tcnt(my_max_threads()*nplanes,0);
#pragma omp parallel num_threads(nthreads)
{
-vector<size_t> nvploc(nplanes,0);
-#pragma omp for
+vector<size_t> mytcnt(nplanes,0);
+vector<size_t> nvp(nplanes,0);
+auto mythread = my_thread_num();
+#pragma omp for schedule(static)
for (size_t ipart=0; ipart<nvis; ++ipart)
{
-int plane0 = int((abs(srv.getCoord(ipart).w)-wmin)/dw+0.5*supp)-int(supp-1);
-minplane[ipart]=plane0;
-for (int i=max<int>(0,plane0); i<min<int>(nplanes,plane0+supp); ++i)
-++nvploc[i];
+int plane0 = max(0,int(1+(abs(srv.getCoord(ipart).w)-(0.5*supp*dw)-wmin)/dw));
+++mytcnt[plane0];
+for (int i=plane0; i<min<int>(nplanes,plane0+supp); ++i)
+++nvp[i];
}
-#pragma omp critical
+#pragma omp critical (wstack_common)
{
for (size_t i=0; i<nplanes; ++i)
-nvis_plane[i]+=nvploc[i];
+nvis_plane[i] += nvp[i];
+for (size_t i=0; i<nplanes; ++i)
+tcnt[mythread*nplanes+i] = mytcnt[i];
}
+#pragma omp barrier
+#pragma omp single
+for (size_t j=0; j<nplanes; ++j)
+{
+size_t l=0;
+for (size_t i=0; i<my_num_threads(); ++i)
+l+=tcnt[i*nplanes+j];
+minplane[j].resize(l);
+}
+#pragma omp barrier
+vector<size_t> myofs(nplanes, 0);
+for (size_t j=0; j<nplanes; ++j)
+for (int i=0; i<mythread; ++i)
+myofs[j]+=tcnt[i*nplanes+j];
+#pragma omp for schedule(static)
+for (size_t ipart=0; ipart<nvis; ++ipart)
+{
+int plane0 = max(0,int(1+(abs(srv.getCoord(ipart).w)-(0.5*supp*dw)-wmin)/dw));
+minplane[plane0][myofs[plane0]++]=uint32_t(ipart);
+}
}
}
@@ -1292,19 +1395,17 @@ template<typename T, typename Serv> void x2dirty(
{
auto nx_dirty=gconf.Nxdirty();
auto ny_dirty=gconf.Nydirty();
-size_t nvis = srv.Nvis();
auto supp = gconf.Supp();
size_t nthreads = gconf.Nthreads();
double wmin;
double dw;
size_t nplanes;
vector<size_t> nvis_plane;
-vector<int> minplane;
+vector<vector<uint32_t>> minplane;
if (verbosity>0) cout << "Gridding using improved w-stacking" << endl;
wstack_common(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane, verbosity);
dirty.fill(0);
-vector<uint32_t> subidx;
+vector<uint32_t> subidx, subidx_old;
tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()});
auto grid=grid_.getMav();
tmpStorage<complex<T>,2> tdirty_(dirty.shape());
@@ -1317,12 +1418,7 @@ template<typename T, typename Serv> void x2dirty(
<< " visibilities" << endl;
double wcur = wmin+iw*dw;
-size_t cnt=0;
-subidx.resize(nvis_plane[iw]);
-// FIXME: this loop becomes a bottleneck when using many threads
-for (size_t ipart=0; ipart<nvis; ++ipart)
-if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp))
-subidx[cnt++] = ipart;
+update_idx(subidx, subidx_old, minplane[iw], iw>=supp ? minplane[iw-supp] : vector<uint32_t>());
auto subidx2 = const_mav<uint32_t, 1>(subidx.data(), {subidx.size()});
auto subsrv = srv.getSubserv(subidx2);
grid.fill(0);
@@ -1371,14 +1467,13 @@ template<typename T, typename Serv> void dirty2x(
{
auto nx_dirty=gconf.Nxdirty();
auto ny_dirty=gconf.Nydirty();
-size_t nvis = srv.Nvis();
auto supp = gconf.Supp();
size_t nthreads = gconf.Nthreads();
double wmin;
double dw;
size_t nplanes;
vector<size_t> nvis_plane;
-vector<int> minplane;
+vector<vector<uint32_t>> minplane;
if (verbosity>0) cout << "Degridding using improved w-stacking" << endl;
wstack_common(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane, verbosity);
@@ -1390,7 +1485,7 @@ template<typename T, typename Serv> void dirty2x(
// correct for w gridding
apply_wcorr(gconf, tdirty, ES_Kernel(supp, nthreads), dw);
-vector<uint32_t> subidx;
+vector<uint32_t> subidx, subidx_old;
tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()});
auto grid=grid_.getMav();
tmpStorage<complex<T>,2> tdirty2_({nx_dirty,ny_dirty});
@@ -1410,12 +1505,7 @@ template<typename T, typename Serv> void dirty2x(
gconf.apply_wscreen(cmav(tdirty2), tdirty2, wcur, false);
gconf.dirty2grid_c(cmav(tdirty2), grid);
-subidx.resize(nvis_plane[iw]);
-size_t cnt=0;
-// FIXME: this loop becomes a bottleneck when using many threads
-for (size_t ipart=0; ipart<nvis; ++ipart)
-if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp))
-subidx[cnt++] = ipart;
+update_idx(subidx, subidx_old, minplane[iw], iw>=supp ? minplane[iw-supp] : vector<uint32_t>());
auto subidx2 = const_mav<uint32_t, 1>(subidx.data(), {subidx.size()});
auto subsrv = srv.getSubserv(subidx2);
grid2x_c(gconf, cmav(grid), subsrv, wcur, dw);
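
For reference, the new update_idx helper keeps the per-plane visibility list sorted and up to date incrementally: when the w-stacking loop advances to plane iw, the indices that first appear on this plane (minplane[iw]) are merged in and the ones that entered supp planes earlier (minplane[iw-supp]) are dropped, which replaces the removed per-plane scan over all visibilities. A standalone sketch of that behaviour; update_idx is copied from the diff, while the toy minplane contents and the supp value are invented:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

using namespace std;

// Copied from the commit: merge the sorted "add" entries into the sorted
// vector v and drop the sorted "del" entries, reusing vold as scratch space.
template<typename T> void update_idx(vector<T> &v, vector<T> &vold,
  const vector<T> &add, const vector<T> &del)
  {
  vold.resize(0);
  vold.reserve((v.size()+add.size())-del.size());
  auto iin=v.begin(), ein=v.end();
  auto iadd=add.begin(), eadd=add.end();
  auto irem=del.begin(), erem=del.end();
  while(iin!=ein)
    {
    if (irem!=erem)
      {
      while ((iin!=ein) && (*iin==*irem))
        { ++irem; ++iin; } // skip the entries to be removed
      if (iin==ein) break;
      }
    if (iadd!=eadd)
      while ((iadd!=eadd) && (*iadd<*iin))
        vold.push_back(*(iadd++));
    vold.push_back(*(iin++));
    }
  while(iadd!=eadd)
    vold.push_back(*(iadd++));
  v.swap(vold);
  }

int main()
  {
  // Toy stand-in for minplane: per plane, the sorted visibility indices
  // whose lowest contributing plane is that plane (invented data).
  const size_t supp = 2;    // invented kernel support for the demo
  vector<vector<uint32_t>> minplane = { {0,3}, {1,4}, {2}, {5,6}, {} };

  vector<uint32_t> subidx, subidx_old;
  for (size_t iw=0; iw<minplane.size(); ++iw)
    {
    // Same call pattern as in x2dirty/dirty2x after this change.
    update_idx(subidx, subidx_old, minplane[iw],
               iw>=supp ? minplane[iw-supp] : vector<uint32_t>());
    printf("plane %zu active:", iw);
    for (auto i : subidx) printf(" %u", unsigned(i));
    printf("\n");
    }
  return 0;
  }

Running it prints the active index set per plane, mirroring how subidx is maintained across planes in x2dirty and dirty2x.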