diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c index 3cd2b358f0fd04fcaecf6d29607124f940ea76f3..146be769a39999a58333b95358098c242eef0bc4 100644 --- a/libsharp/sharp_core.c +++ b/libsharp/sharp_core.c @@ -190,8 +190,8 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, { if (l+4>gen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; - Tv a1=vload(gen->ab[il ].f[0]), b1=vload(gen->ab[il ].f[1]); - Tv a2=vload(gen->ab[il+1].f[0]), b2=vload(gen->ab[il+1].f[1]); + Tv a1=vload(gen->coef[il ][0]), b1=vload(gen->coef[il ][1]); + Tv a2=vload(gen->coef[il+1][0]), b2=vload(gen->coef[il+1][1]); for (int i=0; i<nv2; ++i) { d->lam1[i] = (a1*d->csq[i] + b1)*d->lam2[i] + d->lam1[i]; @@ -205,7 +205,7 @@ NOINLINE static void iter_to_ieee(const sharp_Ylmgen_C * restrict gen, } NOINLINE static void alm2map_kernel(s0data_v * restrict d, - const sharp_ylmgen_dbl2 * restrict ab, const dcmplx * restrict alm, + const sharp_ylmgen_dbl2 * restrict coef, const dcmplx * restrict alm, int l, int il, int lmax, int nv2) { if (nv2==nv0) @@ -216,8 +216,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); - Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]); - Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]); + Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); + Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); for (int i=0; i<nv0; ++i) { d->p1r[i] += d->lam2[i]*ar1; @@ -241,8 +241,8 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); Tv ar3=vload(creal(alm[l+2])), ai3=vload(cimag(alm[l+2])); Tv ar4=vload(creal(alm[l+3])), ai4=vload(cimag(alm[l+3])); - Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]); - Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]); + Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); + Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); for (int i=0; i<nv2; ++i) { d->p1r[i] += d->lam2[i]*ar1; @@ -262,7 +262,7 @@ NOINLINE static void alm2map_kernel(s0data_v * restrict d, { Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]); + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); for (int i=0; i<nv2; ++i) { d->p1r[i] += d->lam2[i]*ar1; @@ -286,7 +286,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * restrict ab = gen->ab; + const sharp_ylmgen_dbl2 * restrict coef = gen->coef; const dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; i<nv2; ++i) @@ -299,7 +299,7 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job, { Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])); Tv ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); - Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]); + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); full_ieee=1; for (int i=0; i<nv2; ++i) { @@ -323,17 +323,17 @@ NOINLINE static void calc_alm2map (sharp_job * restrict job, d->lam1[i] *= d->corfac[i]; d->lam2[i] *= d->corfac[i]; } - alm2map_kernel(d, ab, alm, l, il, lmax, nv2); + alm2map_kernel(d, coef, alm, l, il, lmax, nv2); } NOINLINE static void map2alm_kernel(s0data_v * restrict d, - const sharp_ylmgen_dbl2 * restrict ab, dcmplx * restrict alm, int l, + const sharp_ylmgen_dbl2 * restrict coef, dcmplx * restrict alm, int l, int il, int lmax, int nv2) { for (; l<=lmax-2; il+=2, l+=4) { - Tv a1=vload(ab[il ].f[0]), b1=vload(ab[il ].f[1]); - Tv a2=vload(ab[il+1].f[0]), b2=vload(ab[il+1].f[1]); + Tv a1=vload(coef[il ][0]), b1=vload(coef[il ][1]); + Tv a2=vload(coef[il+1][0]), b2=vload(coef[il+1][1]); Tv atmp1[4] = {vzero, vzero, vzero, vzero}; Tv atmp2[4] = {vzero, vzero, vzero, vzero}; for (int i=0; i<nv2; ++i) @@ -354,7 +354,7 @@ NOINLINE static void map2alm_kernel(s0data_v * restrict d, } for (; l<=lmax; ++il, l+=2) { - Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]); + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); Tv atmp[4] = {vzero, vzero, vzero, vzero}; for (int i=0; i<nv2; ++i) { @@ -380,7 +380,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 6*nth; - const sharp_ylmgen_dbl2 * restrict ab = gen->ab; + const sharp_ylmgen_dbl2 * restrict coef = gen->coef; dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; i<nv2; ++i) @@ -391,7 +391,7 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job, while((!full_ieee) && (l<=lmax)) { - Tv a=vload(ab[il].f[0]), b=vload(ab[il].f[1]); + Tv a=vload(coef[il][0]), b=vload(coef[il][1]); Tv atmp[4] = {vzero, vzero, vzero, vzero}; full_ieee=1; for (int i=0; i<nv2; ++i) @@ -417,13 +417,13 @@ NOINLINE static void calc_map2alm (sharp_job * restrict job, d->lam1[i] *= d->corfac[i]; d->lam2[i] *= d->corfac[i]; } - map2alm_kernel(d, ab, alm, l, il, lmax, nv2); + map2alm_kernel(d, coef, alm, l, il, lmax, nv2); } NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, sxdata_v * restrict d, int * restrict l_, int nv2) { - const sharp_ylmgen_dbl2 * restrict fx = gen->fx; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; Tv prefac=vload(gen->prefac[gen->m]), prescale=vload(gen->fscale[gen->m]); Tv limscale=vload(sharp_limscale); @@ -474,8 +474,8 @@ NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen_C * restrict gen, { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} below_limit=1; - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); for (int i=0; i<nv2; ++i) { d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i]; @@ -500,8 +500,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, int lsave = l; while (l<=lmax) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), @@ -525,8 +525,8 @@ NOINLINE static void alm2map_spin_kernel(sxdata_v * restrict d, l=lsave; while (l<=lmax) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), @@ -559,7 +559,7 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->fx; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; i<nv2; ++i) @@ -572,8 +572,8 @@ NOINLINE static void calc_alm2map_spin (sharp_job * restrict job, while((!full_ieee) && (l<=lmax)) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv agr1=vload(creal(alm[2*l ])), agi1=vload(cimag(alm[2*l ])), acr1=vload(creal(alm[2*l+1])), aci1=vload(cimag(alm[2*l+1])); Tv agr2=vload(creal(alm[2*l+2])), agi2=vload(cimag(alm[2*l+2])), @@ -636,8 +636,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, int lsave=l; while (l<=lmax) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; for (int i=0; i<nv2; ++i) @@ -660,8 +660,8 @@ NOINLINE static void map2alm_spin_kernel(sxdata_v * restrict d, l=lsave; while (l<=lmax) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; for (int i=0; i<nv2; ++i) @@ -693,7 +693,7 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 23*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->fx; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; i<nv2; ++i) @@ -714,8 +714,8 @@ NOINLINE static void calc_map2alm_spin (sharp_job * restrict job, while((!full_ieee) && (l<=lmax)) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv agr1=vzero, agi1=vzero, acr1=vzero, aci1=vzero; Tv agr2=vzero, agi2=vzero, acr2=vzero, aci2=vzero; full_ieee=1; @@ -766,8 +766,8 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d, { while (l<=lmax) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); for (int i=0; i<nv2; ++i) @@ -803,7 +803,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, if (l>lmax) return; job->opcnt += (lmax+1-l) * 17*nth; - const sharp_ylmgen_dbl2 * restrict fx = gen->fx; + const sharp_ylmgen_dbl2 * restrict fx = gen->coef; const dcmplx * restrict alm=job->almtmp; int full_ieee=1; for (int i=0; i<nv2; ++i) @@ -816,8 +816,8 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job, while((!full_ieee) && (l<=lmax)) { - Tv fx10=vload(fx[l+1].f[0]),fx11=vload(fx[l+1].f[1]); - Tv fx20=vload(fx[l+2].f[0]),fx21=vload(fx[l+2].f[1]); + Tv fx10=vload(fx[l+1][0]),fx11=vload(fx[l+1][1]); + Tv fx20=vload(fx[l+2][0]),fx21=vload(fx[l+2][1]); Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])), ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1])); full_ieee=1; diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c index e3c055b8f0e8c68381fc27869017a8eaad13d955..ffa3e0f0235d89e1e471f81a6513d48331bba3d3 100644 --- a/libsharp/sharp_ylmgen_c.c +++ b/libsharp/sharp_ylmgen_c.c @@ -82,14 +82,14 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin) } gen->eps=RALLOC(double, gen->lmax+4); gen->alpha=RALLOC(double, gen->lmax/2+2); - gen->ab=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+2); + gen->coef=RALLOC(sharp_ylmgen_dbl2, gen->lmax/2+2); } else { gen->m=gen->mlo=gen->mhi=-1234567890; - ALLOC(gen->fx,sharp_ylmgen_dbl2,gen->lmax+3); + ALLOC(gen->coef,sharp_ylmgen_dbl2,gen->lmax+3); for (int m=0; m<gen->lmax+3; ++m) - gen->fx[m].f[0]=gen->fx[m].f[1]=0.; + gen->coef[m][0]=gen->coef[m][1]=0.; ALLOC(gen->alpha,double,gen->lmax+3); ALLOC(gen->inv,double,gen->lmax+2); gen->inv[0]=0; @@ -134,19 +134,17 @@ void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen) { DEALLOC(gen->cf); DEALLOC(gen->powlimit); + DEALLOC(gen->alpha); + DEALLOC(gen->coef); if (gen->s==0) { DEALLOC(gen->mfac); DEALLOC(gen->root); DEALLOC(gen->iroot); DEALLOC(gen->eps); - DEALLOC(gen->alpha); - DEALLOC(gen->ab); } else { - DEALLOC(gen->fx); - DEALLOC(gen->alpha); DEALLOC(gen->prefac); DEALLOC(gen->fscale); DEALLOC(gen->flm1); @@ -174,9 +172,9 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m) /(gen->eps[l+2]*gen->eps[l+3]*gen->alpha[il]); for (int il=0, l=m; l<gen->lmax+2; ++il, l+=2) { - gen->ab[il].f[0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il]; + gen->coef[il][0] = ((il&1) ? -1 : 1)*gen->alpha[il]*gen->alpha[il]; double t1 = gen->eps[l+2], t2 = gen->eps[l+1]; - gen->ab[il].f[1] = -gen->ab[il].f[0]*(t1*t1+t2*t2); + gen->coef[il][1] = -gen->coef[il][0]*(t1*t1+t2*t2); } } else @@ -190,7 +188,7 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m) if (!ms_similar) { gen->alpha[gen->mhi] = 1.; - gen->fx[gen->mhi].f[0] = gen->fx[gen->mhi].f[1] = 0.; + gen->coef[gen->mhi][0] = gen->coef[gen->mhi][1] = 0.; for (int l=gen->mhi; l<gen->lmax+1; ++l) { double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m] @@ -206,8 +204,8 @@ void sharp_Ylmgen_prepare (sharp_Ylmgen_C *gen, int m) gen->alpha[l+1] = gen->alpha[l-1]*flp12; else gen->alpha[l+1] = 1.; - gen->fx[l+1].f[0] = flp10*gen->alpha[l]/gen->alpha[l+1]; - gen->fx[l+1].f[1] = flp11*gen->fx[l+1].f[0]; + gen->coef[l+1][0] = flp10*gen->alpha[l]/gen->alpha[l+1]; + gen->coef[l+1][1] = flp11*gen->coef[l+1][0]; } } diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h index cc9260f6de0f6ca9a088a6464751e326774372c8..b36346afd84c913a974d317e60ffcd17e3789cc5 100644 --- a/libsharp/sharp_ylmgen_c.h +++ b/libsharp/sharp_ylmgen_c.h @@ -41,7 +41,7 @@ static const double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800; static const double sharp_ftol=0x1p-60; static const double sharp_fbighalf=0x1p+400; -typedef struct { double f[2]; } sharp_ylmgen_dbl2; +typedef double sharp_ylmgen_dbl2[2]; typedef struct { @@ -54,16 +54,15 @@ typedef struct int m; double *alpha; + sharp_ylmgen_dbl2 *coef; /* used if s==0 */ double *mfac, *eps; - sharp_ylmgen_dbl2 *ab; /* used if s!=0 */ int sinPow, cosPow, preMinus_p, preMinus_m; double *prefac; int *fscale; - sharp_ylmgen_dbl2 *fx; /* internal usage only */ /* used if s==0 */