Commit 99c2c493 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

tweaks

parent 5a56be08
Pipeline #97610 passed with stages
in 17 minutes and 45 seconds
......@@ -910,8 +910,6 @@ DUCC0_NOINLINE static void calc_alm2map_deriv1(const dcmplx * DUCC0_RESTRICT alm
}
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
template<typename T> DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode,
mav<complex<double>,2> &almtmp,
mav<complex<T>,3> &phase, const vector<ringdata> &rdata,
......@@ -932,19 +930,22 @@ template<typename T> DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode,
}
constexpr size_t nval=nv0*VLEN;
s0data_u d;
array<size_t, nval> idx, midx;
Tbv0 cth;
size_t ith=0;
std::array<size_t,nval> itgt;
while (ith<rdata.size())
{
s0data_u d;
VZERO(d.s.p1r); VZERO(d.s.p1i); VZERO(d.s.p2r); VZERO(d.s.p2i);
size_t nth=0;
while ((nth<nval)&&(ith<rdata.size()))
{
if (rdata[ith].mlim>=gen.m)
{
itgt[nth] = ith;
d.s.csq[nth]=rdata[ith].cth*rdata[ith].cth;
idx[nth] = rdata[ith].idx;
midx[nth] = rdata[ith].midx;
auto lcth = rdata[ith].cth;
cth[nth/VLEN][nth%VLEN] = lcth;
d.s.csq[nth]=lcth*lcth;
d.s.sth[nth]=rdata[ith].sth;
++nth;
}
......@@ -954,25 +955,33 @@ template<typename T> DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode,
}
if (nth>0)
{
size_t i2=((nth+VLEN-1)/VLEN)*VLEN;
size_t nvec = (nth+VLEN-1)/VLEN;
size_t i2 = nvec*VLEN;
for (auto i=nth; i<i2; ++i)
{
d.s.csq[i]=d.s.csq[nth-1];
d.s.sth[i]=d.s.sth[nth-1];
d.s.p1r[i]=d.s.p1i[i]=d.s.p2r[i]=d.s.p2i[i]=0.;
}
for (size_t i=0; i<nvec; ++i)
d.v.p1r[i] = d.v.p1i[i] = d.v.p2r[i] = d.v.p2i[i] = 0;
calc_alm2map (almtmp.cdata(), gen, d.v, nth);
for (size_t i=0; i<nvec; ++i)
{
auto t1r = d.v.p1r[i];
auto t2r = d.v.p2r[i]*cth[i];
auto t1i = d.v.p1i[i];
auto t2i = d.v.p2i[i]*cth[i];
d.v.p1r[i] = t1r+t2r;
d.v.p1i[i] = t1i+t2i;
d.v.p2r[i] = t1r-t2r;
d.v.p2i[i] = t1i-t2i;
}
for (size_t i=0; i<nth; ++i)
{
auto tgt=itgt[i];
//adjust for new algorithm
d.s.p2r[i]*=rdata[tgt].cth;
d.s.p2i[i]*=rdata[tgt].cth;
dcmplx r1(d.s.p1r[i], d.s.p1i[i]),
r2(d.s.p2r[i], d.s.p2i[i]);
phase.v(0, rdata[tgt].idx, mi) = complex<T>(r1+r2);
if (rdata[tgt].idx!=rdata[tgt].midx)
phase.v(0, rdata[tgt].midx, mi) = complex<T>(r1-r2);
phase.v(0, idx[i], mi) = complex<T>(T(d.s.p1r[i]),T(d.s.p1i[i]));
if (idx[i]!=midx[i])
phase.v(0, midx[i], mi) = complex<T>(T(d.s.p2r[i]),T(d.s.p2i[i]));
}
}
}
......@@ -985,19 +994,18 @@ template<typename T> DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode,
almtmp.v(l,i)*=gen.alpha[l];
constexpr size_t nval=nvx*VLEN;
sxdata_u d;
array<size_t, nval> idx, midx;
size_t ith=0;
std::array<size_t,nval> itgt;
while (ith<rdata.size())
{
sxdata_u d;
VZERO(d.s.p1pr); VZERO(d.s.p1pi); VZERO(d.s.p2pr); VZERO(d.s.p2pi);
VZERO(d.s.p1mr); VZERO(d.s.p1mi); VZERO(d.s.p2mr); VZERO(d.s.p2mi);
size_t nth=0;
while ((nth<nval)&&(ith<rdata.size()))
{
if (rdata[ith].mlim>=gen.m)
{
itgt[nth] = ith;
idx[nth] = rdata[ith].idx;
midx[nth] = rdata[ith].midx;
d.s.cth[nth]=rdata[ith].cth; d.s.sth[nth]=rdata[ith].sth;
++nth;
}
......@@ -1010,35 +1018,47 @@ template<typename T> DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode,
}
if (nth>0)
{
size_t i2=((nth+VLEN-1)/VLEN)*VLEN;
size_t nvec = (nth+VLEN-1)/VLEN;
size_t i2 = nvec*VLEN;
for (size_t i=nth; i<i2; ++i)
{
d.s.cth[i]=d.s.cth[nth-1];
d.s.sth[i]=d.s.sth[nth-1];
// FIXME are those two lines needed?
d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.;
d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.;
}
for (size_t i=0; i<nvec; ++i)
d.v.p1pr[i] = d.v.p1pi[i] = d.v.p2pr[i] = d.v.p2pi[i] =
d.v.p1mr[i] = d.v.p1mi[i] = d.v.p2mr[i] = d.v.p2mi[i] = 0;
(mode==ALM2MAP) ?
calc_alm2map_spin (almtmp.cdata(), gen, d.v, nth) :
calc_alm2map_deriv1(almtmp.cdata(), gen, d.v, nth);
double fct = ((gen.mhi-gen.m+gen.s)&1) ? -1.: 1.;
for (size_t i=0; i<nvec; ++i)
{
auto p1pr=d.v.p1pr[i], p1pi=d.v.p1pi[i],
p2pr=d.v.p2pr[i], p2pi=d.v.p2pi[i],
p1mr=d.v.p1mr[i], p1mi=d.v.p1mi[i],
p2mr=d.v.p2mr[i], p2mi=d.v.p2mi[i];
d.v.p1pr[i] = p1pr+p2pr;
d.v.p1pi[i] = p1pi+p2pi;
d.v.p1mr[i] = p1mr+p2mr;
d.v.p1mi[i] = p1mi+p2mi;
d.v.p2pr[i] = fct*(p1pr-p2pr);
d.v.p2pi[i] = fct*(p1pi-p2pi);
d.v.p2mr[i] = fct*(p1mr-p2mr);
d.v.p2mi[i] = fct*(p1mi-p2mi);
}
for (size_t i=0; i<nth; ++i)
{
auto tgt=itgt[i];
dcmplx q1(d.s.p1pr[i], d.s.p1pi[i]),
q2(d.s.p2pr[i], d.s.p2pi[i]),
u1(d.s.p1mr[i], d.s.p1mi[i]),
u2(d.s.p2mr[i], d.s.p2mi[i]);
phase.v(0, rdata[tgt].idx, mi) = complex<T>(q1+q2);
phase.v(1, rdata[tgt].idx, mi) = complex<T>(u1+u2);
if (rdata[tgt].idx!=rdata[tgt].midx)
phase.v(0, idx[i], mi) = complex<T>(T(d.s.p1pr[i]), T(d.s.p1pi[i]));
phase.v(1, idx[i], mi) = complex<T>(T(d.s.p1mr[i]), T(d.s.p1mi[i]));
if (idx[i]!=midx[i])
{
auto *phQ = &(phase.v(0, rdata[tgt].midx, mi)),
*phU = &(phase.v(1, rdata[tgt].midx, mi));
*phQ = complex<T>(q1-q2);
*phU = complex<T>(u1-u2);
if ((gen.mhi-gen.m+gen.s)&1)
{ *phQ=-(*phQ); *phU=-(*phU); }
phase.v(0, midx[i], mi) = complex<T>(T(d.s.p2pr[i]), T(d.s.p2pi[i]));
phase.v(1, midx[i], mi) = complex<T>(T(d.s.p2mr[i]), T(d.s.p2mi[i]));
}
}
}
......@@ -1150,8 +1170,6 @@ template<typename T> DUCC0_NOINLINE static void inner_loop_m2a(
}
}
#undef VZERO
template<typename T> DUCC0_NOINLINE void inner_loop(SHT_mode mode,
mav<complex<double>,2> &almtmp,
mav<complex<T>,3> &phase, const vector<ringdata> &rdata,
......@@ -1266,10 +1284,12 @@ void get_gridweights(const string &type, mav<double,1> &wgt)
/* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */
vector<double> xwgt(nrings);
xwgt[0]=2.;
UnityRoots<double,dcmplx> roots(2*nrings);
for (size_t k=1; k<=(nrings-1)/2; ++k)
{
xwgt[2*k-1]=2./(1.-4.*k*k)*cos((k*pi)/nrings);
xwgt[2*k ]=2./(1.-4.*k*k)*sin((k*pi)/nrings);
auto tmp = roots[k];
xwgt[2*k-1]=2./(1.-4.*k*k)*tmp.real();
xwgt[2*k ]=2./(1.-4.*k*k)*tmp.imag();
}
if ((nrings&1)==0) xwgt[nrings-1]=0.;
pocketfft_r<double> plan(nrings);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment