Commit 57745eea authored by Martin Reinecke
Browse files

significant speedup

parent 73736903
Pipeline #77274 passed with stages in 13 minutes and 1 second
......@@ -434,7 +434,14 @@ template<typename T> class Interpolator
auto idx = getIdx(ptg);
execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
{
vector<T> wt(supp), wp(supp);
union {
native_simd<T> simd[64/vl];
T scalar[64];
} kdata;
T *wt(kdata.scalar), *wp(kdata.scalar+supp);
size_t nvec = (2*supp+vl-1)/vl;
for (size_t i=0; i<nvec; ++i)
kdata.simd[i] = 0;
vector<T> psiarr(2*kmax+1);
#ifdef SIMD_INTERPOL
vector<native_simd<T>> psiarr2((2*kmax+1+vl-1)/vl);
......@@ -446,11 +453,13 @@ template<typename T> class Interpolator
T f0=T(0.5*supp+ptg(i,0)*xdtheta);
size_t i0 = size_t(f0+T(1));
for (size_t t=0; t<supp; ++t)
wt[t] = kernel((t+i0-f0)*delta - 1);
wt[t] = (t+i0-f0)*delta - 1;
T f1=T(0.5)*supp+ptg(i,1)*xdphi;
size_t i1 = size_t(f1+1.);
for (size_t t=0; t<supp; ++t)
wp[t] = kernel((t+i1-f1)*delta - 1);
wp[t] = (t+i1-f1)*delta - 1;
for (size_t t=0; t<nvec; ++t)
kdata.simd[t] = kernel(kdata.simd[t]);
psiarr[0]=1.;
double psi=ptg(i,2);
double cpsi=cos(psi), spsi=sin(psi);
......@@ -484,25 +493,25 @@ template<typename T> class Interpolator
{
#ifdef SPECIAL_CASING
case 1:
interpol_help0<1,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<1,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 2:
interpol_help0<2,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<2,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 3:
interpol_help0<3,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<3,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 4:
interpol_help0<4,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<4,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 5:
interpol_help0<5,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<5,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 6:
interpol_help0<6,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<6,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 7:
interpol_help0<7,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<7,1>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
#endif
default:
......@@ -549,25 +558,25 @@ template<typename T> class Interpolator
{
#ifdef SPECIAL_CASING
case 1:
interpol_help0<1,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<1,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 2:
interpol_help0<2,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<2,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 3:
interpol_help0<3,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<3,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 4:
interpol_help0<4,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<4,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 5:
interpol_help0<5,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<5,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 6:
interpol_help0<6,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<6,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
case 7:
interpol_help0<7,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), res, i);
interpol_help0<7,3>(wt, wp, p, d0, d1, psiarr2.data(), res, i);
break;
#endif
default:
......@@ -629,7 +638,14 @@ template<typename T> class Interpolator
execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
{
size_t b_theta=99999999999999, b_phi=9999999999999999;
vector<T> wt(supp), wp(supp);
union {
native_simd<T> simd[64/vl];
T scalar[64];
} kdata;
T *wt(kdata.scalar), *wp(kdata.scalar+supp);
size_t nvec = (2*supp+vl-1)/vl;
for (size_t i=0; i<nvec; ++i)
kdata.simd[i] = 0;
vector<T> psiarr(2*kmax+1);
#ifdef SIMD_INTERPOL
vector<native_simd<T>> psiarr2((2*kmax+1+vl-1)/vl);
......@@ -641,11 +657,13 @@ template<typename T> class Interpolator
T f0=T(0.5)*supp+ptg(i,0)*xdtheta;
size_t i0 = size_t(f0+1.);
for (size_t t=0; t<supp; ++t)
wt[t] = kernel((t+i0-f0)*delta - 1);
wt[t] = (t+i0-f0)*delta - 1;
T f1=T(0.5)*supp+ptg(i,1)*xdphi;
size_t i1 = size_t(f1+1.);
for (size_t t=0; t<supp; ++t)
wp[t] = kernel((t+i1-f1)*delta - 1);
wp[t] = (t+i1-f1)*delta - 1;
for (size_t t=0; t<nvec; ++t)
kdata.simd[t] = kernel(kdata.simd[t]);
psiarr[0]=1.;
double psi=ptg(i,2);
double cpsi=cos(psi), spsi=sin(psi);
......@@ -696,25 +714,25 @@ template<typename T> class Interpolator
{
#ifdef SPECIAL_CASING
case 1:
deinterpol_help0<1,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<1,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 2:
deinterpol_help0<2,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<2,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 3:
deinterpol_help0<3,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<3,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 4:
deinterpol_help0<4,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<4,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 5:
deinterpol_help0<5,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<5,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 6:
deinterpol_help0<6,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<6,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 7:
deinterpol_help0<7,1>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<7,1>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
#endif
default:
......@@ -759,25 +777,25 @@ template<typename T> class Interpolator
{
#ifdef SPECIAL_CASING
case 1:
deinterpol_help0<1,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<1,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 2:
deinterpol_help0<2,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<2,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 3:
deinterpol_help0<3,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<3,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 4:
deinterpol_help0<4,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<4,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 5:
deinterpol_help0<5,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<5,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 6:
deinterpol_help0<6,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<6,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
case 7:
deinterpol_help0<7,3>(wt.data(), wp.data(), p, d0, d1, psiarr2.data(), data, i);
deinterpol_help0<7,3>(wt, wp, p, d0, d1, psiarr2.data(), data, i);
break;
#endif
default:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment