Commit f21a205b authored by Martin Reinecke
Browse files

more tweaks

parent 4475d484
Pipeline #81409 passed with stages
in 12 minutes and 20 seconds
......@@ -56,8 +56,6 @@ inline complex<float> hsum_cmplx(native_simd<float> vr, native_simd<float> vi)
auto t2 = _mm_hadd_ps(_mm256_extractf128_ps(t1, 0), _mm256_extractf128_ps(t1, 1));
t2 += _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,0,3,2));
return complex<float>(t2[0], t2[1]);
//FIXME perhaps some shuffling?
return complex<float>(t2[0]+t2[2], t2[1]+t2[3]);
}
#endif
......@@ -78,9 +76,11 @@ template<typename T> void complex2hartley
MR_assert(grid.conformable(grid2), "shape mismatch");
size_t nu=grid.shape(0), nv=grid.shape(1);
execStatic(nu, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
while (auto rng=sched.getNext()) for(auto u=rng.lo; u<rng.hi; ++u)
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nu);
for(auto u=lo; u<hi; ++u)
{
size_t xu = (u==0) ? 0 : nu-u;
for (size_t v=0; v<nv; ++v)
......@@ -99,9 +99,11 @@ template<typename T> void hartley2complex
MR_assert(grid.conformable(grid2), "shape mismatch");
size_t nu=grid.shape(0), nv=grid.shape(1);
execStatic(nu, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
while (auto rng=sched.getNext()) for(auto u=rng.lo; u<rng.hi; ++u)
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nu);
for(auto u=lo; u<hi; ++u)
{
size_t xu = (u==0) ? 0 : nu-u;
for (size_t v=0; v<nv; ++v)
......@@ -133,9 +135,12 @@ template<typename T> void hartley2_2D(mav<T,2> &arr, size_t vlim,
}
else
r2r_separable_hartley(farr, farr, {0,1}, T(1), nthreads);
execStatic((nu+1)/2-1, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
while (auto rng=sched.getNext()) for(auto i=rng.lo+1; i<rng.hi+1; ++i)
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, (nu+1)/2-1);
for(auto i=lo+1; i<hi+1; ++i)
for(size_t j=1; j<(nv+1)/2; ++j)
{
T a = arr(i,j);
......@@ -288,9 +293,11 @@ template<typename T> class Params
checkShape(dirty.shape(), {nxdirty, nydirty});
auto cfu = krn->corfunc(nxdirty/2+1, 1./nu, nthreads);
auto cfv = krn->corfunc(nydirty/2+1, 1./nv, nthreads);
execStatic(nxdirty, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nxdirty);
for (auto i=lo; i<hi; ++i)
{
int icfu = abs(int(nxdirty/2)-int(i));
for (size_t j=0; j<nydirty; ++j)
......@@ -311,13 +318,15 @@ template<typename T> class Params
checkShape(dirty.shape(), {nxdirty,nydirty});
double x0 = -0.5*nxdirty*pixsize_x,
y0 = -0.5*nydirty*pixsize_y;
execStatic(nxdirty/2+1, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nxdirty/2+1);
using vtype = native_simd<T>;
constexpr size_t vlen=vtype::size();
size_t nvec = (nydirty/2+1+(vlen-1))/vlen;
vector<vtype> ph(nvec), sp(nvec), cp(nvec);
while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
for (auto i=lo; i<hi; ++i)
{
T fx = T(x0+i*pixsize_x);
fx *= fx;
......@@ -398,10 +407,25 @@ template<typename T> class Params
auto cfu = krn->corfunc(nxdirty/2+1, 1./nu, nthreads);
auto cfv = krn->corfunc(nydirty/2+1, 1./nv, nthreads);
// FIXME: maybe we don't have to fill everything and can save some time
grid.fill(0);
execStatic(nxdirty, nthreads, 0, [&](Scheduler &sched)
// grid.fill(0);
execParallel(nthreads, [&](Scheduler &sched)
{
while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nu);
for (auto i=lo; i<hi; ++i)
{
size_t lo2=0, hi2=nv;
if ((i<nxdirty/2) || (i>=nu-nxdirty/2))
{ lo2=nydirty/2; hi2=nv-nydirty/2+1; }
for (auto j=lo2; j<hi2; ++j)
grid.v(i,j) = 0;
}
});
execParallel(nthreads, [&](Scheduler &sched)
{
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nxdirty);
for (auto i=lo; i<hi; ++i)
{
int icfu = abs(int(nxdirty/2)-int(i));
for (size_t j=0; j<nydirty; ++j)
......@@ -422,17 +446,32 @@ template<typename T> class Params
checkShape(dirty.shape(), {nxdirty, nydirty});
checkShape(grid.shape(), {nu, nv});
// FIXME: maybe we don't have to fill everything and can save some time
grid.fill(0);
// grid.fill(0);
execParallel(nthreads, [&](Scheduler &sched)
{
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nu);
for (auto i=lo; i<hi; ++i)
{
size_t lo2=0, hi2=nv;
if ((i<nxdirty/2) || (i>=nu-nxdirty/2))
{ lo2=nydirty/2; hi2=nv-nydirty/2+1; }
for (auto j=lo2; j<hi2; ++j)
grid.v(i,j) = 0;
}
});
double x0 = -0.5*nxdirty*pixsize_x,
y0 = -0.5*nydirty*pixsize_y;
execStatic(nxdirty/2+1, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nxdirty/2+1);
using vtype = native_simd<T>;
constexpr size_t vlen=vtype::size();
size_t nvec = (nydirty/2+1+(vlen-1))/vlen;
vector<vtype> ph(nvec), sp(nvec), cp(nvec);
while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
for(auto i=lo; i<hi; ++i)
{
T fx = T(x0+i*pixsize_x);
fx *= fx;
......@@ -511,7 +550,7 @@ template<typename T> class Params
iv0 = min(int(v+vshift)-int(nv), maxiv0);
}
void report()
void report()
{
if (verbosity==0) return;
cout << (gridding ? "Gridding" : "Degridding")
......@@ -527,7 +566,7 @@ void report()
<< ", wmax/dw=" << wmax_d/dw << ", nranges=" << ranges.size() << endl;
}
void scanData()
void scanData()
{
timers.push("Initial scan");
size_t nrow=bl.Nrows(),
......@@ -573,7 +612,7 @@ void scanData()
timers.pop();
}
auto getNuNv()
auto getNuNv()
{
timers.push("parameter calculation");
double x0 = -0.5*nxdirty*pixsize_x,
......@@ -620,7 +659,7 @@ auto getNuNv()
return minidx;
}
void countRanges()
void countRanges()
{
timers.push("range count");
size_t nrow=bl.Nrows(),
......@@ -707,16 +746,18 @@ void countRanges()
timers.pop();
}
void apply_global_corrections(mav<T,2> &dirty)
void apply_global_corrections(mav<T,2> &dirty)
{
timers.push("global corrections");
double x0 = -0.5*nxdirty*pixsize_x,
y0 = -0.5*nydirty*pixsize_y;
auto cfu = krn->corfunc(nxdirty/2+1, 1./nu, nthreads);
auto cfv = krn->corfunc(nydirty/2+1, 1./nv, nthreads);
execStatic(nxdirty/2+1, nthreads, 0, [&](Scheduler &sched)
execParallel(nthreads, [&](Scheduler &sched)
{
while (auto rng=sched.getNext()) for(auto i=rng.lo; i<rng.hi; ++i)
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nxdirty/2+1);
for(auto i=lo; i<hi; ++i)
{
auto fx = T(x0+i*pixsize_x);
fx *= fx;
......@@ -759,7 +800,8 @@ void apply_global_corrections(mav<T,2> &dirty)
});
timers.pop();
}
template<size_t supp, bool wgrid> class HelperX2g2
template<size_t supp, bool wgrid> class HelperX2g2
{
public:
static constexpr size_t vlen = native_simd<T>::size();
......@@ -771,7 +813,7 @@ template<size_t supp, bool wgrid> class HelperX2g2
static constexpr int sv = 2*nsafe+(1<<logsquare);
static constexpr int svvec = ((sv+vlen-1)/vlen)*vlen;
static constexpr double xsupp=2./supp;
const Params *parent;
const Params *parent;
TemplateKernel<supp, T> tkrn;
mav<complex<T>,2> &grid;
int iu0, iv0; // start index of the current visibility
......@@ -855,7 +897,7 @@ const Params *parent;
};
template<size_t SUPP, bool wgrid> [[gnu::hot]] void x2grid_c_helper
template<size_t SUPP, bool wgrid> [[gnu::hot]] void x2grid_c_helper
(mav<complex<T>,2> &grid,
size_t p0, double w0)
{
......@@ -922,7 +964,7 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void x2grid_c_helper
});
}
template<bool wgrid> void x2grid_c
template<bool wgrid> void x2grid_c
(mav<complex<T>,2> &grid,
size_t p0, double w0=-1)
{
......@@ -960,7 +1002,7 @@ template<bool wgrid> void x2grid_c
timers.pop();
}
void x2dirty()
void x2dirty()
{
if (do_wgridding)
{
......@@ -973,7 +1015,19 @@ void x2dirty()
{
double w = wmin+pl*dw;
timers.push("zeroing grid");
#if 0
//FIXME: we don't need to zero the entire array here...
execParallel(nthreads, [&](Scheduler &sched)
{
auto tid = sched.thread_num();
auto [lo, hi] = calcShare(nthreads, tid, nu);
for (auto i=lo; i<hi; ++i)
for (size_t j=0; j<nv; ++j)
grid.v(i,j) = 0;
});
#else
grid.fill(0);
#endif
timers.pop();
x2grid_c<true>(grid, pl, w);
grid2dirty_c_overwrite_wscreen_add(grid, dirty_out, T(w));
......@@ -995,7 +1049,7 @@ void x2dirty()
grid2dirty_overwrite(rgrid, dirty_out);
}
}
template<size_t supp, bool wgrid> class HelperG2x2
template<size_t supp, bool wgrid> class HelperG2x2
{
public:
static constexpr size_t vlen = native_simd<T>::size();
......@@ -1007,7 +1061,7 @@ template<size_t supp, bool wgrid> class HelperG2x2
static constexpr int sv = 2*nsafe+(1<<logsquare);
static constexpr int svvec = ((sv+vlen-1)/vlen)*vlen;
static constexpr double xsupp=2./supp;
const Params *parent;
const Params *parent;
TemplateKernel<supp, T> tkrn;
const mav<complex<T>,2> &grid;
......@@ -1083,7 +1137,7 @@ const Params *parent;
}
};
template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
(const mav<complex<T>,2> &grid,
size_t p0, double w0)
{
......@@ -1112,15 +1166,15 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
native_simd<T> rr=0, ri=0;
for (size_t cu=0; cu<SUPP; ++cu)
{
// if constexpr(NVEC==1)
// {
// auto fct = kv[0]*ku[cu];
// const auto * DUCC0_RESTRICT pxr = hlp.p0r + cu*jump;
// const auto * DUCC0_RESTRICT pxi = hlp.p0i + cu*jump;
// rr += native_simd<T>::loadu(pxr)*fct;
// ri += native_simd<T>::loadu(pxi)*fct;
// }
// else
// if constexpr(NVEC==1)
// {
// auto fct = kv[0]*ku[cu];
// const auto * DUCC0_RESTRICT pxr = hlp.p0r + cu*jump;
// const auto * DUCC0_RESTRICT pxi = hlp.p0i + cu*jump;
// rr += native_simd<T>::loadu(pxr)*fct;
// ri += native_simd<T>::loadu(pxi)*fct;
// }
// else
{
native_simd<T> tmpr(0), tmpi(0);
for (size_t cv=0; cv<NVEC; ++cv)
......@@ -1135,7 +1189,7 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
}
}
auto r = hsum_cmplx(rr,ri);
// auto r = complex<T>(reduce(rr, std::plus<>()), reduce(ri, std::plus<>()));
// auto r = complex<T>(reduce(rr, std::plus<>()), reduce(ri, std::plus<>()));
if (flip) r=conj(r);
if (have_wgt) r*=wgt(row, ch);
ms_out.v(row, ch) += r;
......@@ -1145,7 +1199,7 @@ template<size_t SUPP, bool wgrid> [[gnu::hot]] void grid2x_c_helper
});
}
template<bool wgrid> void grid2x_c
template<bool wgrid> void grid2x_c
(const mav<complex<T>,2> &grid,
size_t p0, double w0=-1)
{
......@@ -1183,7 +1237,7 @@ template<bool wgrid> void grid2x_c
timers.pop();
}
void dirty2x()
void dirty2x()
{
if (do_wgridding)
{
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment