Commit e8ec0694 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

sync with latest libsharp

parent e21669e4
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH default
#define GENERIC_ARCH
#include "sharp_core_inc.c"
#undef GENERIC_ARCH
#undef ARCH
typedef void (*t_inner_loop) (sharp_job *job, const int *ispair,
......@@ -18,7 +16,12 @@ static t_veclen veclen_ = NULL;
static t_max_nvec max_nvec_ = NULL;
static t_architecture architecture_ = NULL;
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#ifdef MULTIARCH
#if (defined(___AVX512F__) || defined(__FMA4__) || defined(__FMA__) || \
defined(__AVX2__) || defined(__AVX__))
#error MULTIARCH specified but platform-specific flags detected
#endif
#define DECL(arch) \
static int XCONCATX2(have,arch)(void) \
......@@ -39,27 +42,17 @@ int XCONCATX2(sharp_veclen,arch) (void); \
int XCONCATX2(sharp_max_nvec,arch) (int spin); \
const char *XCONCATX2(sharp_architecture,arch) (void);
#if (!defined(__AVX512F__))
DECL(avx512f)
#endif
#if (!defined(__FMA4__))
DECL(fma4)
#endif
#if (!defined(__FMA__))
DECL(fma)
#endif
#if (!defined(__AVX2__))
DECL(avx2)
#endif
#if (!defined(__AVX__))
DECL(avx)
#endif
#endif
static void assign_funcs(void)
{
#if defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#ifdef MULTIARCH
#define DECL2(arch) \
if (XCONCATX2(have,arch)()) \
{ \
......@@ -69,21 +62,11 @@ static void assign_funcs(void)
architecture_ = XCONCATX2(sharp_architecture,arch); \
return; \
}
#if (!defined(__AVX512F__))
DECL2(avx512f)
#endif
#if (!defined(__FMA4__))
DECL2(fma4)
#endif
#if (!defined(__FMA__))
DECL2(fma)
#endif
#if (!defined(__AVX2__))
DECL2(avx2)
#endif
#if (!defined(__AVX__))
DECL2(avx)
#endif
#endif
inner_loop_ = inner_loop_default;
veclen_ = sharp_veclen_default;
......
#if (!defined(__AVX__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx
#pragma GCC target("avx")
#include "sharp_core_inc.c"
#endif
#if (!defined(__AVX2__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx2
#pragma GCC target("avx2")
#include "sharp_core_inc.c"
#endif
#if (!defined(__AVX512F__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH avx512f
#pragma GCC target("avx512f")
#include "sharp_core_inc.c"
#endif
#if (!defined(__FMA__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH fma
#pragma GCC target("fma")
#include "sharp_core_inc.c"
#endif
#if (!defined(__FMA4__)) && defined(__GNUC__) && defined (__x86_64__) && (__GNUC__>=6)
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#define ARCH fma4
#pragma GCC target("fma4")
#include "sharp_core_inc.c"
#endif
......@@ -29,6 +29,12 @@
* \author Martin Reinecke
*/
#if (defined(MULTIARCH) || defined(GENERIC_ARCH))
#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)
#include <complex.h>
#include <math.h>
#include <string.h>
......@@ -764,6 +770,7 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
const sharp_ylmgen_dbl2 * restrict fx, const dcmplx * restrict alm,
int l, int lmax, int nv2)
{
int lsave=l;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
......@@ -773,20 +780,30 @@ NOINLINE static void alm2map_deriv1_kernel(sxdata_v * restrict d,
for (int i=0; i<nv2; ++i)
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
Tv lw=d->l2p[i]+d->l2m[i];
d->p1pr[i] += ar1*lw;
d->p1pi[i] += ai1*lw;
Tv lx=d->l2m[i]-d->l2p[i];
d->p2mr[i] += ai1*lx;
d->p2mi[i] -= ar1*lx;
lw=d->l1p[i]+d->l1m[i];
d->p2pr[i] += ar2*lw;
d->p2pi[i] += ai2*lw;
lx=d->l1m[i]-d->l1p[i];
d->p1mr[i] += ai2*lx;
d->p1mi[i] -= ar2*lx;
d->p1pr[i] += ar1*d->l2p[i];
d->p1pi[i] += ai1*d->l2p[i];
d->p1mr[i] -= ai2*d->l1p[i];
d->p1mi[i] += ar2*d->l1p[i];
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
}
l+=2;
}
l=lsave;
while (l<=lmax)
{
Tv fx10=vload(fx[l+1].a),fx11=vload(fx[l+1].b);
Tv fx20=vload(fx[l+2].a),fx21=vload(fx[l+2].b);
Tv ar1=vload(creal(alm[l ])), ai1=vload(cimag(alm[l ])),
ar2=vload(creal(alm[l+1])), ai2=vload(cimag(alm[l+1]));
for (int i=0; i<nv2; ++i)
{
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
d->p2mr[i] += ai1*d->l2m[i];
d->p2mi[i] -= ar1*d->l2m[i];
d->p2pr[i] += ar2*d->l1m[i];
d->p2pi[i] += ai2*d->l1m[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
}
l+=2;
......@@ -801,7 +818,7 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
iter_to_ieee_spin(gen, d, &l, nv2);
job->opcnt += (l-gen->mhi) * 7*nth;
if (l>lmax) return;
job->opcnt += (lmax+1-l) * 17*nth;
job->opcnt += (lmax+1-l) * 15*nth;
const sharp_ylmgen_dbl2 * restrict fx = gen->coef;
const dcmplx * restrict alm=job->almtmp;
......@@ -825,34 +842,32 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
{
d->l1p[i] = (d->cth[i]*fx10 - fx11)*d->l2p[i] - d->l1p[i];
d->l1m[i] = (d->cth[i]*fx10 + fx11)*d->l2m[i] - d->l1m[i];
Tv lw=d->l2p[i]*d->cfp[i]+d->l2m[i]*d->cfm[i];
d->p1pr[i] += ar1*lw;
d->p1pi[i] += ai1*lw;
Tv lx=d->l2m[i]*d->cfm[i]-d->l2p[i]*d->cfp[i];
d->p2mr[i] += ai1*lx;
d->p2mi[i] -= ar1*lx;
lw=d->l1p[i]*d->cfp[i]+d->l1m[i]*d->cfm[i];
d->p2pr[i] += ar2*lw;
d->p2pi[i] += ai2*lw;
lx=d->l1m[i]*d->cfm[i]-d->l1p[i]*d->cfp[i];
d->p1mr[i] += ai2*lx;
d->p1mi[i] -= ar2*lx;
Tv l2p=d->l2p[i]*d->cfp[i], l2m=d->l2m[i]*d->cfm[i];
Tv l1m=d->l1m[i]*d->cfm[i], l1p=d->l1p[i]*d->cfp[i];
d->p1pr[i] += ar1*l2p;
d->p1pi[i] += ai1*l2p;
d->p1mr[i] -= ai2*l1p;
d->p1mi[i] += ar2*l1p;
d->p2pr[i] += ar2*l1m;
d->p2pi[i] += ai2*l1m;
d->p2mr[i] += ai1*l2m;
d->p2mi[i] -= ar1*l2m;
d->l2p[i] = (d->cth[i]*fx20 - fx21)*d->l1p[i] - d->l2p[i];
d->l2m[i] = (d->cth[i]*fx20 + fx21)*d->l1m[i] - d->l2m[i];
if (rescale(&d->l1p[i], &d->l2p[i], &d->scp[i], vload(sharp_ftol)))
{
getCorfac(d->scp[i], &d->cfp[i], gen->cf);
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
}
full_ieee &= vallTrue(vge(d->scp[i],vload(sharp_minscale)));
if (rescale(&d->l1m[i], &d->l2m[i], &d->scm[i], vload(sharp_ftol)))
{
getCorfac(d->scm[i], &d->cfm[i], gen->cf);
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
}
full_ieee &= vallTrue(vge(d->scm[i],vload(sharp_minscale)));
}
l+=2;
}
if (l>lmax) return;
// if (l>lmax) return;
for (int i=0; i<nv2; ++i)
{
......@@ -862,6 +877,15 @@ NOINLINE static void calc_alm2map_deriv1(sharp_job * restrict job,
d->l2m[i] *= d->cfm[i];
}
alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2);
for (int i=0; i<nv2; ++i)
{
Tv tmp;
tmp = d->p1pr[i]; d->p1pr[i] -= d->p2mi[i]; d->p2mi[i] += tmp;
tmp = d->p1pi[i]; d->p1pi[i] += d->p2mr[i]; d->p2mr[i] -= tmp;
tmp = d->p1mr[i]; d->p1mr[i] += d->p2pi[i]; d->p2pi[i] -= tmp;
tmp = d->p1mi[i]; d->p1mi[i] -= d->p2pr[i]; d->p2pr[i] += tmp;
}
}
......@@ -1179,3 +1203,5 @@ const char *XARCH(sharp_architecture)(void)
{
return xstr(ARCH);
}
#endif
......@@ -40,8 +40,6 @@
#include "sharp.h"
#include "sharp_ylmgen_c.h"
#define SHARP_MAXTRANS 100
typedef struct
{
sharp_jobtype type;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment