Commit 9728e8e8 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

speedup kernel application

parent b88f9d5c
......@@ -34,9 +34,11 @@
#if defined(__GNUC__)
#define NOINLINE __attribute__((noinline))
#define ALIGNED(align) __attribute__ ((aligned(align)))
#define RESTRICT __restrict__
#else
#define NOINLINE
#define ALIGNED(align)
#define RESTRICT
#endif
namespace gridder {
......@@ -785,8 +787,8 @@ template<typename T, typename Serv> void x2grid_c
{
Helper<T> hlp(gconf, nullptr, grid.data(), w0, dw);
int jump = hlp.lineJump();
const T * ku = hlp.kernel;
const T * kv = hlp.kernel+supp;
const T * RESTRICT ku = hlp.kernel;
const T * RESTRICT kv = hlp.kernel+supp;
size_t np = srv.Nvis();
// Loop over sampling points
......@@ -796,14 +798,22 @@ template<typename T, typename Serv> void x2grid_c
UVW<T> coord = srv.getCoord(ipart);
auto flip = coord.FixW();
hlp.prep(coord);
auto * ptr = hlp.p0w;
auto * RESTRICT ptr = hlp.p0w;
auto v(srv.getVis(ipart));
if (do_w_gridding) v*=hlp.Wfac();
if (flip) v=conj(v);
for (size_t cu=0; cu<supp; ++cu)
{
complex<T> tmp(v*ku[cu]);
for (size_t cv=0; cv<supp; ++cv)
size_t cv=0;
for (; cv<supp-3; cv+=4)
{
ptr[cv ] += tmp*kv[cv ];
ptr[cv+1] += tmp*kv[cv+1];
ptr[cv+2] += tmp*kv[cv+2];
ptr[cv+3] += tmp*kv[cv+3];
}
for (; cv<supp; ++cv)
ptr[cv] += tmp*kv[cv];
ptr+=jump;
}
......@@ -843,8 +853,8 @@ template<typename T, typename Serv> void grid2x_c
{
Helper<T> hlp(gconf, grid.data(), nullptr, w0, dw);
int jump = hlp.lineJump();
const T * ku = hlp.kernel;
const T * kv = hlp.kernel+supp;
const T * RESTRICT ku = hlp.kernel;
const T * RESTRICT kv = hlp.kernel+supp;
size_t np = srv.Nvis();
#pragma omp for schedule(guided,100)
......@@ -854,11 +864,17 @@ template<typename T, typename Serv> void grid2x_c
auto flip = coord.FixW();
hlp.prep(coord);
complex<T> r = 0;
const auto * ptr = hlp.p0r;
const auto * RESTRICT ptr = hlp.p0r;
for (size_t cu=0; cu<supp; ++cu)
{
complex<T> tmp(0);
for (size_t cv=0; cv<supp; ++cv)
size_t cv=0;
for (; cv<supp-3; cv+=4)
tmp += ptr[cv ]*kv[cv ]
+ ptr[cv+1]*kv[cv+1]
+ ptr[cv+2]*kv[cv+2]
+ ptr[cv+3]*kv[cv+3];
for (; cv<supp; ++cv)
tmp += ptr[cv] * kv[cv];
r += tmp*ku[cu];
ptr += jump;
......@@ -1115,6 +1131,7 @@ template<typename T, typename Serv> void wstack_common(
wmin=T(1e38);
T wmax=T(-1e38);
// FIXME maybe this can be done more intelligently
#pragma omp parallel for num_threads(nthreads) reduction(min:wmin) reduction(max:wmax)
for (size_t ipart=0; ipart<nvis; ++ipart)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment