Commit 3dcf3ba0 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

WIP: better vectorization for kernel evaluations

parent 043a57c1
...@@ -38,6 +38,22 @@ namespace detail { ...@@ -38,6 +38,22 @@ namespace detail {
using namespace std; using namespace std;
template<typename T> struct VLEN { static constexpr size_t val=1; };
#if (defined(__AVX512F__))
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
#elif (defined(__AVX__))
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
#elif (defined(__SSE2__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#elif (defined(__VSX__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#endif
template<typename T, size_t ndim> class mav template<typename T, size_t ndim> class mav
{ {
static_assert((ndim>0) && (ndim<3), "only supports 1D and 2D arrays"); static_assert((ndim>0) && (ndim<3), "only supports 1D and 2D arrays");
...@@ -577,6 +593,7 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -577,6 +593,7 @@ template<typename T, typename T2=complex<T>> class Helper
int bu0, bv0; // start index of the current buffer int bu0, bv0; // start index of the current buffer
vector<T2> rbuf, wbuf; vector<T2> rbuf, wbuf;
size_t nvecs;
void dump() const void dump() const
{ {
...@@ -618,7 +635,7 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -618,7 +635,7 @@ template<typename T, typename T2=complex<T>> class Helper
public: public:
const T2 *p0r; const T2 *p0r;
T2 *p0w; T2 *p0w;
vector<T> kernel; T kernel[64] __attribute__ ((aligned(64)));
Helper(const GridderConfig<T> &gconf_, const T2 *grid_r_, T2 *grid_w_) Helper(const GridderConfig<T> &gconf_, const T2 *grid_r_, T2 *grid_w_)
: gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()), : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
...@@ -627,7 +644,7 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -627,7 +644,7 @@ template<typename T, typename T2=complex<T>> class Helper
bu0(-1000000), bv0(-1000000), bu0(-1000000), bv0(-1000000),
rbuf(su*sv*(grid_r!=nullptr),T(0)), rbuf(su*sv*(grid_r!=nullptr),T(0)),
wbuf(su*sv*(grid_w!=nullptr),T(0)), wbuf(su*sv*(grid_w!=nullptr),T(0)),
kernel(2*supp) nvecs(VLEN<T>::val*((2*supp+VLEN<T>::val-1)/VLEN<T>::val))
{} {}
~Helper() { if (grid_w) dump(); } ~Helper() { if (grid_w) dump(); }
...@@ -645,8 +662,12 @@ template<typename T, typename T2=complex<T>> class Helper ...@@ -645,8 +662,12 @@ template<typename T, typename T2=complex<T>> class Helper
kernel[i ] = x0+i*xsupp; kernel[i ] = x0+i*xsupp;
kernel[i+supp] = y0+i*xsupp; kernel[i+supp] = y0+i*xsupp;
} }
for (auto &k : kernel) for (size_t i=2*supp; i<nvecs; ++i)
k = exp(beta*sqrt(T(1)-k*k)); kernel[i]=0;
for (size_t i=0; i<nvecs; ++i)
kernel[i] = exp(beta*sqrt(T(1)-kernel[i]*kernel[i]));
// for (auto &k : kernel)
// k = exp(beta*sqrt(T(1)-k*k));
if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv)) if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv))
{ {
if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); } if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); }
...@@ -751,8 +772,8 @@ template<typename T, typename Serv> void x2grid_c ...@@ -751,8 +772,8 @@ template<typename T, typename Serv> void x2grid_c
Helper<T> hlp(gconf, nullptr, grid.data()); Helper<T> hlp(gconf, nullptr, grid.data());
T emb = exp(-2*beta); T emb = exp(-2*beta);
int jump = hlp.lineJump(); int jump = hlp.lineJump();
const T * ku = hlp.kernel.data(); const T * ku = hlp.kernel;
const T * kv = hlp.kernel.data()+supp; const T * kv = hlp.kernel+supp;
size_t np = srv.Nvis(); size_t np = srv.Nvis();
// Loop over sampling points // Loop over sampling points
...@@ -808,8 +829,8 @@ template<typename T, typename Serv> void grid2x_c ...@@ -808,8 +829,8 @@ template<typename T, typename Serv> void grid2x_c
Helper<T> hlp(gconf, grid.data(), nullptr); Helper<T> hlp(gconf, grid.data(), nullptr);
T emb = exp(-2*beta); T emb = exp(-2*beta);
int jump = hlp.lineJump(); int jump = hlp.lineJump();
const T * ku = hlp.kernel.data(); const T * ku = hlp.kernel;
const T * kv = hlp.kernel.data()+supp; const T * kv = hlp.kernel+supp;
size_t np = srv.Nvis(); size_t np = srv.Nvis();
#pragma omp for schedule(guided,100) #pragma omp for schedule(guided,100)
...@@ -873,8 +894,8 @@ template<typename T> void apply_holo ...@@ -873,8 +894,8 @@ template<typename T> void apply_holo
Helper<T> hlp(gconf, grid.data(), ogrid.data()); Helper<T> hlp(gconf, grid.data(), ogrid.data());
T emb = exp(-2*beta); T emb = exp(-2*beta);
int jump = hlp.lineJump(); int jump = hlp.lineJump();
const T * ku = hlp.kernel.data(); const T * ku = hlp.kernel;
const T * kv = hlp.kernel.data()+supp; const T * kv = hlp.kernel+supp;
#pragma omp for schedule(guided,100) #pragma omp for schedule(guided,100)
for (size_t ipart=0; ipart<nvis; ++ipart) for (size_t ipart=0; ipart<nvis; ++ipart)
...@@ -944,8 +965,8 @@ template<typename T> void get_correlations ...@@ -944,8 +965,8 @@ template<typename T> void get_correlations
Helper<T,T> hlp(gconf, nullptr, ogrid.data()); Helper<T,T> hlp(gconf, nullptr, ogrid.data());
T emb = exp(-2*beta); T emb = exp(-2*beta);
int jump = hlp.lineJump(); int jump = hlp.lineJump();
const T * ku = hlp.kernel.data(); const T * ku = hlp.kernel;
const T * kv = hlp.kernel.data()+supp; const T * kv = hlp.kernel+supp;
#pragma omp for schedule(guided,100) #pragma omp for schedule(guided,100)
for (size_t ipart=0; ipart<nvis; ++ipart) for (size_t ipart=0; ipart<nvis; ++ipart)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment