Commit 0f626efd authored by Martin Reinecke's avatar Martin Reinecke

not yet beautiful but already guite fast

parent ea7f33cd
Pipeline #79874 passed with stages
in 13 minutes and 26 seconds
...@@ -511,19 +511,20 @@ template<typename T> class GridderConfig ...@@ -511,19 +511,20 @@ template<typename T> class GridderConfig
constexpr int logsquare=4; constexpr int logsquare=4;
template<typename T, typename T2=complex<T>> class Helper template<typename T> class Helper
{ {
private: private:
using T2=complex<T>;
const GridderConfig<T> &gconf; const GridderConfig<T> &gconf;
int nu, nv, nsafe, supp; int nu, nv, nsafe, supp;
const T2 *grid_r; const T2 *grid_r;
T2 *grid_w; T2 *grid_w;
int su, sv; int su, sv, svvec;
int iu0, iv0; // start index of the current visibility int iu0, iv0; // start index of the current visibility
int bu0, bv0; // start index of the current buffer int bu0, bv0; // start index of the current buffer
T wfac; T wfac;
vector<T2> rbuf, wbuf; mav<T,2> rbufr, rbufi, wbufr, wbufi;
bool do_w_gridding; bool do_w_gridding;
double w0, xdw; double w0, xdw;
vector<std::mutex> &locks; vector<std::mutex> &locks;
...@@ -541,7 +542,7 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -541,7 +542,7 @@ template<typename T, typename T2=complex<T>> class Helper
std::lock_guard<std::mutex> lock(locks[idxu]); std::lock_guard<std::mutex> lock(locks[idxu]);
for (int iv=0; iv<sv; ++iv) for (int iv=0; iv<sv; ++iv)
{ {
grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv]; grid_w[idxu*nv + idxv] += complex<T>(wbufr(iu,iv), wbufi(iu,iv));
if (++idxv>=nv) idxv=0; if (++idxv>=nv) idxv=0;
} }
} }
...@@ -558,7 +559,8 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -558,7 +559,8 @@ template<typename T, typename T2=complex<T>> class Helper
int idxv = idxv0; int idxv = idxv0;
for (int iv=0; iv<sv; ++iv) for (int iv=0; iv<sv; ++iv)
{ {
rbuf[iu*sv + iv] = grid_r[idxu*nv + idxv]; rbufr.v(iu,iv) = grid_r[idxu*nv + idxv].real();
rbufi.v(iu,iv) = grid_r[idxu*nv + idxv].imag();
if (++idxv>=nv) idxv=0; if (++idxv>=nv) idxv=0;
} }
if (++idxu>=nu) idxu=0; if (++idxu>=nu) idxu=0;
...@@ -567,8 +569,8 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -567,8 +569,8 @@ template<typename T, typename T2=complex<T>> class Helper
public: public:
size_t nvec; size_t nvec;
const T2 *p0r; const T *p0rr, *p0ri;
T2 *p0w; T *p0wr, *p0wi;
static constexpr size_t vlen=native_simd<T>::size(); static constexpr size_t vlen=native_simd<T>::size();
union kbuf { union kbuf {
T scalar[64]; T scalar[64];
...@@ -580,10 +582,15 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -580,10 +582,15 @@ template<typename T, typename T2=complex<T>> class Helper
vector<std::mutex> &locks_, double w0_=-1, double dw_=-1) vector<std::mutex> &locks_, double w0_=-1, double dw_=-1)
: gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()), : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
supp(gconf.Supp()), grid_r(grid_r_), supp(gconf.Supp()), grid_r(grid_r_),
grid_w(grid_w_), su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)), grid_w(grid_w_),
su(2*nsafe+(1<<logsquare)),
sv(2*nsafe+(1<<logsquare)),
svvec(((sv+vlen-1)/vlen)*vlen),
bu0(-1000000), bv0(-1000000), bu0(-1000000), bv0(-1000000),
rbuf(su*sv*(grid_r!=nullptr),T(0)), rbufr({size_t(su),size_t(svvec)*(grid_r!=nullptr)}),
wbuf(su*sv*(grid_w!=nullptr),T(0)), rbufi({size_t(su),size_t(svvec)*(grid_r!=nullptr)}),
wbufr({size_t(su),size_t(svvec)*(grid_w!=nullptr)}),
wbufi({size_t(su),size_t(svvec)*(grid_w!=nullptr)}),
do_w_gridding(dw_>0), do_w_gridding(dw_>0),
w0(w0_), w0(w0_),
xdw(T(1)/dw_), xdw(T(1)/dw_),
...@@ -594,7 +601,7 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -594,7 +601,7 @@ template<typename T, typename T2=complex<T>> class Helper
} }
~Helper() { if (grid_w) dump(); } ~Helper() { if (grid_w) dump(); }
int lineJump() const { return sv; } int lineJump() const { return svvec; }
T Wfac() const { return wfac; } T Wfac() const { return wfac; }
void prep(const UVW &in) void prep(const UVW &in)
{ {
...@@ -610,13 +617,20 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -610,13 +617,20 @@ template<typename T, typename T2=complex<T>> class Helper
wfac = krn.eval_single(T(xdw*xsupp*abs(w0-in.w))); wfac = krn.eval_single(T(xdw*xsupp*abs(w0-in.w)));
if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv)) if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv))
{ {
if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); } if (grid_w)
{
dump();
wbufr.apply([](T &v){v=0;});
wbufi.apply([](T &v){v=0;});
}
bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe; bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe;
bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe; bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe;
if (grid_r) load(); if (grid_r) load();
} }
p0r = grid_r ? rbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr; p0rr = grid_r ? &rbufr(iu0-bu0, iv0-bv0) : nullptr;
p0w = grid_w ? wbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr; p0ri = grid_r ? &rbufi(iu0-bu0, iv0-bv0) : nullptr;
p0wr = grid_w ? &wbufr.v(iu0-bu0, iv0-bv0) : nullptr;
p0wi = grid_w ? &wbufi.v(iu0-bu0, iv0-bv0) : nullptr;
} }
}; };
...@@ -702,38 +716,181 @@ template<typename T, typename Serv> void x2grid_c ...@@ -702,38 +716,181 @@ template<typename T, typename Serv> void x2grid_c
size_t nthreads = gconf.Nthreads(); size_t nthreads = gconf.Nthreads();
bool do_w_gridding = dw>0; bool do_w_gridding = dw>0;
vector<std::mutex> locks(gconf.Nu()); vector<std::mutex> locks(gconf.Nu());
constexpr size_t vlen=native_simd<T>::size();
size_t nvec((supp+vlen-1)/vlen);
size_t np = srv.Nvis(); size_t np = srv.Nvis();
if (nvec==1)
execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
auto * DUCC0_RESTRICT ptrr = hlp.p0wr;
auto * DUCC0_RESTRICT ptri = hlp.p0wi;
auto v(srv.getVis(ipart));
if (do_w_gridding) v*=hlp.Wfac();
if (flip) v=conj(v);
for (size_t cu=0; cu<supp; ++cu)
{
complex<T> tmp(v*ku[cu]);
for (size_t cv=0; cv<1; ++cv)
{
auto tr = native_simd<T>::loadu(ptrr+cv*hlp.vlen);
tr += tmp.real()*kv[cv];
tr.storeu(ptrr+cv*hlp.vlen);
auto ti = native_simd<T>::loadu(ptri+cv*hlp.vlen);
ti += tmp.imag()*kv[cv];
ti.storeu(ptri+cv*hlp.vlen);
}
ptrr+=jump;
ptri+=jump;
}
}
});
else if (nvec==2)
execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
auto * DUCC0_RESTRICT ptrr = hlp.p0wr;
auto * DUCC0_RESTRICT ptri = hlp.p0wi;
auto v(srv.getVis(ipart));
if (do_w_gridding) v*=hlp.Wfac();
if (flip) v=conj(v);
for (size_t cu=0; cu<supp; ++cu)
{
complex<T> tmp(v*ku[cu]);
for (size_t cv=0; cv<2; ++cv)
{
auto tr = native_simd<T>::loadu(ptrr+cv*hlp.vlen);
tr += tmp.real()*kv[cv];
tr.storeu(ptrr+cv*hlp.vlen);
auto ti = native_simd<T>::loadu(ptri+cv*hlp.vlen);
ti += tmp.imag()*kv[cv];
ti.storeu(ptri+cv*hlp.vlen);
}
ptrr+=jump;
ptri+=jump;
}
}
});
else if (nvec==3)
execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
auto * DUCC0_RESTRICT ptrr = hlp.p0wr;
auto * DUCC0_RESTRICT ptri = hlp.p0wi;
auto v(srv.getVis(ipart));
if (do_w_gridding) v*=hlp.Wfac();
if (flip) v=conj(v);
for (size_t cu=0; cu<supp; ++cu)
{
complex<T> tmp(v*ku[cu]);
for (size_t cv=0; cv<3; ++cv)
{
auto tr = native_simd<T>::loadu(ptrr+cv*hlp.vlen);
tr += tmp.real()*kv[cv];
tr.storeu(ptrr+cv*hlp.vlen);
auto ti = native_simd<T>::loadu(ptri+cv*hlp.vlen);
ti += tmp.imag()*kv[cv];
ti.storeu(ptri+cv*hlp.vlen);
}
ptrr+=jump;
ptri+=jump;
}
}
});
else if (nvec==4)
execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
auto * DUCC0_RESTRICT ptrr = hlp.p0wr;
auto * DUCC0_RESTRICT ptri = hlp.p0wi;
auto v(srv.getVis(ipart));
if (do_w_gridding) v*=hlp.Wfac();
if (flip) v=conj(v);
for (size_t cu=0; cu<supp; ++cu)
{
complex<T> tmp(v*ku[cu]);
for (size_t cv=0; cv<4; ++cv)
{
auto tr = native_simd<T>::loadu(ptrr+cv*hlp.vlen);
tr += tmp.real()*kv[cv];
tr.storeu(ptrr+cv*hlp.vlen);
auto ti = native_simd<T>::loadu(ptri+cv*hlp.vlen);
ti += tmp.imag()*kv[cv];
ti.storeu(ptri+cv*hlp.vlen);
}
ptrr+=jump;
ptri+=jump;
}
}
});
else
execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched) execGuided(np, nthreads, 100, 0.2, [&](Scheduler &sched)
{ {
Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw); Helper<T> hlp(gconf, nullptr, grid.vdata(), locks, w0, dw);
int jump = hlp.lineJump(); int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar; const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const T * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec; const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart) while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{ {
UVW coord = srv.getCoord(ipart); UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW(); auto flip = coord.FixW();
hlp.prep(coord); hlp.prep(coord);
auto * DUCC0_RESTRICT ptr = hlp.p0w; auto * DUCC0_RESTRICT ptrr = hlp.p0wr;
auto * DUCC0_RESTRICT ptri = hlp.p0wi;
auto v(srv.getVis(ipart)); auto v(srv.getVis(ipart));
if (do_w_gridding) v*=hlp.Wfac(); if (do_w_gridding) v*=hlp.Wfac();
if (flip) v=conj(v); if (flip) v=conj(v);
for (size_t cu=0; cu<supp; ++cu) for (size_t cu=0; cu<supp; ++cu)
{ {
complex<T> tmp(v*ku[cu]); complex<T> tmp(v*ku[cu]);
size_t cv=0; for (size_t cv=0; cv<hlp.nvec; ++cv)
for (; cv+3<supp; cv+=4)
{ {
ptr[cv ] += tmp*kv[cv ]; auto tr = native_simd<T>::loadu(ptrr+cv*hlp.vlen);
ptr[cv+1] += tmp*kv[cv+1]; tr += tmp.real()*kv[cv];
ptr[cv+2] += tmp*kv[cv+2]; tr.storeu(ptrr+cv*hlp.vlen);
ptr[cv+3] += tmp*kv[cv+3]; auto ti = native_simd<T>::loadu(ptri+cv*hlp.vlen);
ti += tmp.imag()*kv[cv];
ti.storeu(ptri+cv*hlp.vlen);
} }
for (; cv<supp; ++cv) ptrr+=jump;
ptr[cv] += tmp*kv[cv]; ptri+=jump;
ptr+=jump;
} }
} }
}); });
...@@ -749,15 +906,18 @@ template<typename T, typename Serv> void grid2x_c ...@@ -749,15 +906,18 @@ template<typename T, typename Serv> void grid2x_c
size_t nthreads = gconf.Nthreads(); size_t nthreads = gconf.Nthreads();
bool do_w_gridding = dw>0; bool do_w_gridding = dw>0;
vector<std::mutex> locks(gconf.Nu()); vector<std::mutex> locks(gconf.Nu());
constexpr size_t vlen=native_simd<T>::size();
size_t nvec((supp+vlen-1)/vlen);
// Loop over sampling points // Loop over sampling points
size_t np = srv.Nvis(); size_t np = srv.Nvis();
if (nvec==1)
execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched) execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched)
{ {
Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw); Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
int jump = hlp.lineJump(); int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar; const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const T * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec; const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart) while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{ {
...@@ -765,20 +925,151 @@ template<typename T, typename Serv> void grid2x_c ...@@ -765,20 +925,151 @@ template<typename T, typename Serv> void grid2x_c
auto flip = coord.FixW(); auto flip = coord.FixW();
hlp.prep(coord); hlp.prep(coord);
complex<T> r = 0; complex<T> r = 0;
const auto * DUCC0_RESTRICT ptr = hlp.p0r; const auto * DUCC0_RESTRICT ptrr = hlp.p0rr;
const auto * DUCC0_RESTRICT ptri = hlp.p0ri;
for (size_t cu=0; cu<supp; ++cu) for (size_t cu=0; cu<supp; ++cu)
{ {
complex<T> tmp(0); native_simd<T> tmpr(0), tmpi(0);
size_t cv=0; for (size_t cv=0; cv<1; ++cv)
for (; cv+3<supp; cv+=4) {
tmp += ptr[cv ]*kv[cv ] tmpr += kv[cv]*native_simd<T>::loadu(ptrr+hlp.vlen*cv);
+ ptr[cv+1]*kv[cv+1] tmpi += kv[cv]*native_simd<T>::loadu(ptri+hlp.vlen*cv);
+ ptr[cv+2]*kv[cv+2] }
+ ptr[cv+3]*kv[cv+3]; r += ku[cu]*complex<T>(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>()));
for (; cv<supp; ++cv) ptrr += jump;
tmp += ptr[cv] * kv[cv]; ptri += jump;
r += tmp*ku[cu]; }
ptr += jump; if (flip) r=conj(r);
if (do_w_gridding) r*=hlp.Wfac();
srv.addVis(ipart, r);
}
});
else if (nvec==2)
execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
complex<T> r = 0;
const auto * DUCC0_RESTRICT ptrr = hlp.p0rr;
const auto * DUCC0_RESTRICT ptri = hlp.p0ri;
for (size_t cu=0; cu<supp; ++cu)
{
native_simd<T> tmpr(0), tmpi(0);
for (size_t cv=0; cv<2; ++cv)
{
tmpr += kv[cv]*native_simd<T>::loadu(ptrr+hlp.vlen*cv);
tmpi += kv[cv]*native_simd<T>::loadu(ptri+hlp.vlen*cv);
}
r += ku[cu]*complex<T>(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>()));
ptrr += jump;
ptri += jump;
}
if (flip) r=conj(r);
if (do_w_gridding) r*=hlp.Wfac();
srv.addVis(ipart, r);
}
});
else if (nvec==3)
execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
complex<T> r = 0;
const auto * DUCC0_RESTRICT ptrr = hlp.p0rr;
const auto * DUCC0_RESTRICT ptri = hlp.p0ri;
for (size_t cu=0; cu<supp; ++cu)
{
native_simd<T> tmpr(0), tmpi(0);
for (size_t cv=0; cv<3; ++cv)
{
tmpr += kv[cv]*native_simd<T>::loadu(ptrr+hlp.vlen*cv);
tmpi += kv[cv]*native_simd<T>::loadu(ptri+hlp.vlen*cv);
}
r += ku[cu]*complex<T>(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>()));
ptrr += jump;
ptri += jump;
}
if (flip) r=conj(r);
if (do_w_gridding) r*=hlp.Wfac();
srv.addVis(ipart, r);
}
});
else if (nvec==4)
execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
complex<T> r = 0;
const auto * DUCC0_RESTRICT ptrr = hlp.p0rr;
const auto * DUCC0_RESTRICT ptri = hlp.p0ri;
for (size_t cu=0; cu<supp; ++cu)
{
native_simd<T> tmpr(0), tmpi(0);
for (size_t cv=0; cv<4; ++cv)
{
tmpr += kv[cv]*native_simd<T>::loadu(ptrr+hlp.vlen*cv);
tmpi += kv[cv]*native_simd<T>::loadu(ptri+hlp.vlen*cv);
}
r += ku[cu]*complex<T>(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>()));
ptrr += jump;
ptri += jump;
}
if (flip) r=conj(r);
if (do_w_gridding) r*=hlp.Wfac();
srv.addVis(ipart, r);
}
});
else
execGuided(np, nthreads, 1000, 0.5, [&](Scheduler &sched)
{
Helper<T> hlp(gconf, grid.data(), nullptr, locks, w0, dw);
int jump = hlp.lineJump();
const T * DUCC0_RESTRICT ku = hlp.buf.scalar;
const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
while (auto rng=sched.getNext()) for(auto ipart=rng.lo; ipart<rng.hi; ++ipart)
{
UVW coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
complex<T> r = 0;
const auto * DUCC0_RESTRICT ptrr = hlp.p0rr;
const auto * DUCC0_RESTRICT ptri = hlp.p0ri;
for (size_t cu=0; cu<supp; ++cu)
{
native_simd<T> tmpr(0), tmpi(0);
for (size_t cv=0; cv<hlp.nvec; ++cv)
{
tmpr += kv[cv]*native_simd<T>::loadu(ptrr+hlp.vlen*cv);
tmpi += kv[cv]*native_simd<T>::loadu(ptri+hlp.vlen*cv);
}
r += ku[cu]*complex<T>(reduce(tmpr, std::plus<>()), reduce(tmpi, std::plus<>()));
ptrr += jump;
ptri += jump;
} }
if (flip) r=conj(r); if (flip) r=conj(r);
if (do_w_gridding) r*=hlp.Wfac(); if (do_w_gridding) r*=hlp.Wfac();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment