From dc5fbb0afde98acfa944d940ea1d148e37b26d37 Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Mon, 21 Jan 2019 23:45:44 +0100 Subject: [PATCH] tweak alm2map_deriv1 --- libsharp/sharp_core_inc.c | 84 ++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index 96981f6..b875877 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -770,6 +770,7 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm, int l, int lmax, int nv2) { + int lsave=l; while (l<=lmax) { Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); @@ -779,20 +780,30 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, for (int i=0; i<nv2; ++i) { d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; - d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - Tv lw=d->l2p[i]+d->l2m[i]; - d->p1pr[i] += ar1*lw; - d->p1pi[i] += ai1*lw; - Tv lx=d->l2m[i]-d->l2p[i]; - d->p2mr[i] += ai1*lx; - d->p2mi[i] -= ar1*lx; - lw=d->l1p[i]+d->l1m[i]; - d->p2pr[i] += ar2*lw; - d->p2pi[i] += ai2*lw; - lx=d->l1m[i]-d->l1p[i]; - d->p1mr[i] += ai2*lx; - d->p1mi[i] -= ar2*lx; + d->p1pr[i] += ar1*d->l2p[i]; + d->p1pi[i] += ai1*d->l2p[i]; + + d->p1mr[i] -= ai2*d->l1p[i]; + d->p1mi[i] += ar2*d->l1p[i]; d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; + } + l+=2; + } + l=lsave; + while (l<=lmax) + { + Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b); + Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b); + Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), + ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); + for (int i=0; i<nv2; ++i) + { + d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; + d->p2mr[i] += ai1*d->l2m[i]; + d->p2mi[i] -= ar1*d->l2m[i]; + + d->p2pr[i] += ar2*d->l1m[i]; + d->p2pi[i] += ai2*d->l1m[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; } l+=2; @@ -807,7 +818,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, iter_to_ieee_spin(gen, d, &l, nv2); job->opcnt += (l-gen->mhi) * 7*nth; if (l>lmax) return; - job->opcnt += (lmax+1-l) * 17*nth; + job->opcnt += (lmax+1-l) * 15*nth; const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const dcmplx * restrict alm=job->almtmp; @@ -831,34 +842,32 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, { d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i]; - Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i]; - d->p1pr[i] += ar1*lw; - d->p1pi[i] += ai1*lw; - Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i]; - d->p2mr[i] += ai1*lx; - d->p2mi[i] -= ar1*lx; - lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i]; - d->p2pr[i] += ar2*lw; - d->p2pi[i] += ai2*lw; - lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i]; - d->p1mr[i] += ai2*lx; - d->p1mi[i] -= ar2*lx; + + Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i]; + Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i]; + + d->p1pr[i] += ar1*l2p; + d->p1pi[i] += ai1*l2p; + d->p1mr[i] -= ai2*l1p; + d->p1mi[i] += ar2*l1p; + + d->p2pr[i] += ar2*l1m; + d->p2pi[i] += ai2*l1m; + d->p2mr[i] += ai1*l2m; + d->p2mi[i] -= ar1*l2m; + d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i]; d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i]; if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol))) - { getCorfac(d->scp[i], &d->cfp[i], gen->cf); - full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); - } + full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale))); if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol))) - { getCorfac(d->scm[i], &d->cfm[i], gen->cf); - full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); - } + full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale))); } l+=2; } - if (l>lmax) return; +// if (l>lmax) return; for (int i=0; i<nv2; ++i) { @@ -868,6 +877,15 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, d->l2m[i] *= d->cfm[i]; } alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2); + + for (int i=0; i<nv2; ++i) + { + Tv tmp; + tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp; + tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp; + tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp; + tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp; + } } -- GitLab