From 18c82762c3a4262d363c4feadeb79fbb693d00b9 Mon Sep 17 00:00:00 2001 From: Martin Reinecke <martin@mpa-garching.mpg.de> Date: Fri, 26 Oct 2018 14:36:25 +0200 Subject: [PATCH] updates --- c_utils/c_utils.c | 6 +- c_utils/c_utils.h | 8 +- libsharp/sharp.c | 195 +++++++++++++++++++++------------ libsharp/sharp_almhelpers.c | 2 +- libsharp/sharp_almhelpers.h | 2 +- libsharp/sharp_complex_hacks.h | 31 +++++- libsharp/sharp_core_inc.c | 93 ++++++++++------ libsharp/sharp_core_inc0.c | 3 +- libsharp/sharp_core_inc2.c | 185 +++++++++++++++++-------------- libsharp/sharp_cxx.h | 70 ++++++++++-- libsharp/sharp_geomhelpers.c | 6 +- libsharp/sharp_testsuite.c | 8 +- libsharp/sharp_vecsupport.h | 14 ++- libsharp/sharp_ylmgen_c.c | 9 +- libsharp/sharp_ylmgen_c.h | 3 +- 15 files changed, 420 insertions(+), 215 deletions(-) diff --git a/c_utils/c_utils.c b/c_utils/c_utils.c index 96bd765..9344a6d 100644 --- a/c_utils/c_utils.c +++ b/c_utils/c_utils.c @@ -25,7 +25,7 @@ /* * Convenience functions * - * Copyright (C) 2008, 2009, 2010, 2011, 2012 Max-Planck-Society + * Copyright (C) 2008-2017 Max-Planck-Society * Author: Martin Reinecke */ @@ -44,7 +44,7 @@ void util_warn_ (const char *file, int line, const char *func, const char *msg) /* This function tries to avoid allocations with a total size close to a high power of two (called the "critical stride" here), by adding a few more bytes - if necssary. This lowers the probability that two arrays differ by a multiple + if necessary. This lowers the probability that two arrays differ by a multiple of the critical stride in their starting address, which in turn lowers the risk of cache line contention. */ static size_t manipsize(size_t sz) @@ -61,7 +61,7 @@ void *util_malloc_ (size_t sz) { void *res; if (sz==0) return NULL; - res = _mm_malloc(manipsize(sz),16); + res = _mm_malloc(manipsize(sz),32); UTIL_ASSERT(res,"_mm_malloc() failed"); return res; } diff --git a/c_utils/c_utils.h b/c_utils/c_utils.h index 0503449..01c64ad 100644 --- a/c_utils/c_utils.h +++ b/c_utils/c_utils.h @@ -25,7 +25,7 @@ /*! \file c_utils.h * Convenience functions * - * Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society + * Copyright (C) 2008-2017 Max-Planck-Society * \author Martin Reinecke * \note This file should only be included from .c files, NOT from .h files. */ @@ -144,4 +144,10 @@ void util_free_ (void *ptr); } #endif +#ifdef __GNUC__ +#define NOINLINE __attribute__((noinline)) +#else +#define NOINLINE +#endif + #endif diff --git a/libsharp/sharp.c b/libsharp/sharp.c index b1b9277..884a644 100644 --- a/libsharp/sharp.c +++ b/libsharp/sharp.c @@ -25,11 +25,12 @@ /*! \file sharp.c * Spherical transform library * - * Copyright (C) 2006-2013 Max-Planck-Society + * Copyright (C) 2006-2016 Max-Planck-Society * \author Martin Reinecke \author Dag Sverre Seljebotn */ #include <math.h> +#include <string.h> #include "pocketfft/pocketfft.h" #include "sharp_ylmgen_c.h" #include "sharp_internal.h" @@ -63,7 +64,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) *nchunks = (ndata+(*chunksize)-1)/(*chunksize); } -int sharp_get_mlim (int lmax, int spin, double sth, double cth) +NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth) { double ofs=lmax*0.01; if (ofs<100.) 
ofs=100.; @@ -83,12 +84,13 @@ typedef struct dcmplx *shiftarr; int s_shift; rfft_plan plan; + int length; int norot; } ringhelper; static void ringhelper_init (ringhelper *self) { - static ringhelper rh_null = { 0, NULL, 0, NULL, 0 }; + static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 }; *self = rh_null; } @@ -99,7 +101,7 @@ static void ringhelper_destroy (ringhelper *self) ringhelper_init(self); } -static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) +NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) { self->norot = (fabs(phi0)<1e-14); if (!(self->norot)) @@ -110,12 +112,15 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) self->phi0_ = phi0; for (int m=0; m<=mmax; ++m) self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); +// double *tmp=(double *) self->shiftarr; +// sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2); } if (!self->plan) self->plan=make_rfft_plan(nph); - if (nph!=(int)rfft_length(self->plan)) + if (nph!=(int)self->length) { destroy_rfft_plan(self->plan); self->plan=make_rfft_plan(nph); + self->length=nph; } } @@ -127,6 +132,7 @@ static int ringinfo_compare (const void *xa, const void *xb) static int ringpair_compare (const void *xa, const void *xb) { const sharp_ringpair *a=xa, *b=xb; +// return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0; if (a->r1.nph==b->r1.nph) return (a->r1.phi0 < b->r1.phi0) ? -1 : ((a->r1.phi0 > b->r1.phi0) ? 1 : @@ -261,6 +267,7 @@ void sharp_destroy_geom_info (sharp_geom_info *geom_info) distribution are permissible. */ static int sharp_get_mmax (int *mval, int nm) { + //FIXME: if gaps are allowed, we have to search the maximum m in the array int *mcheck=RALLOC(int,nm); SET_ARRAY(mcheck,0,nm,0); for (int i=0; i<nm; ++i) @@ -274,7 +281,7 @@ static int sharp_get_mmax (int *mval, int nm) return nm-1; } -static void ringhelper_phase2ring (ringhelper *self, +NOINLINE static void ringhelper_phase2ring (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase, int pstride, int flags) { @@ -288,13 +295,19 @@ static void ringhelper_phase2ring (ringhelper *self, if (nph>=2*mmax+1) { - for (int m=0; m<=mmax; ++m) - { - dcmplx tmp = phase[m*pstride]*wgt; - if(!self->norot) tmp*=self->shiftarr[m]; - data[2*m]=creal(tmp); - data[2*m+1]=cimag(tmp); - } + if (self->norot) + for (int m=0; m<=mmax; ++m) + { + data[2*m]=creal(phase[m*pstride])*wgt; + data[2*m+1]=cimag(phase[m*pstride])*wgt; + } + else + for (int m=0; m<=mmax; ++m) + { + dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; + data[2*m]=creal(tmp)*wgt; + data[2*m+1]=cimag(tmp)*wgt; + } for (int m=2*(mmax+1); m<nph+2; ++m) data[m]=0.; } @@ -326,7 +339,7 @@ static void ringhelper_phase2ring (ringhelper *self, rfft_backward (self->plan, &(data[1]), 1.); } -static void ringhelper_ring2phase (ringhelper *self, +NOINLINE static void ringhelper_ring2phase (ringhelper *self, const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase, int pstride, int flags) { @@ -376,7 +389,7 @@ static void ringhelper_ring2phase (ringhelper *self, phase[m*pstride]=0.; } -static void fill_map (const sharp_geom_info *ginfo, void *map, double value, +NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map, int flags) { if (flags & SHARP_NO_FFT) @@ -386,50 +399,55 @@ static void fill_map (const sharp_geom_info *ginfo, void *map, double value, if (flags&SHARP_DP) { for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) - ((dcmplx 
*)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] - =value; + ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0; for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) - ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] - =value; + ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0; } else { for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) - ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] - =(float)value; + ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0; for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) - ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] - =(float)value; + ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0; } } } else { - for (int j=0;j<ginfo->npairs;++j) + if (flags&SHARP_DP) { - if (flags&SHARP_DP) + for (int j=0;j<ginfo->npairs;++j) { - for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) - ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] - =value; - for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) - ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] - =value; + double *dmap=(double *)map; + if (ginfo->pair[j].r1.stride==1) + memset(&dmap[ginfo->pair[j].r1.ofs],0, + ginfo->pair[j].r1.nph*sizeof(double)); + else + for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) + dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0; + if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1)) + memset(&dmap[ginfo->pair[j].r2.ofs],0, + ginfo->pair[j].r2.nph*sizeof(double)); + else + for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) + dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0; } - else + } + else + { + for (int j=0;j<ginfo->npairs;++j) { for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i) - ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] - =(float)value; + ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0; for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i) - ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] - =(float)value; + ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0; } } } } -static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) +NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm, + int flags) { #define CLEARLOOP(real_t,body) \ { \ @@ -465,7 +483,7 @@ static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags) } } -static void init_output (sharp_job *job) +NOINLINE static void init_output (sharp_job *job) { if (job->flags&SHARP_ADD) return; if (job->type == SHARP_MAP2ALM) @@ -473,21 +491,21 @@ static void init_output (sharp_job *job) clear_alm (job->ainfo,job->alm[i],job->flags); else for (int i=0; i<job->ntrans*job->nmaps; ++i) - fill_map (job->ginfo,job->map[i],0.,job->flags); + clear_map (job->ginfo,job->map[i],job->flags); } -static void alloc_phase (sharp_job *job, int nm, int ntheta) +NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta) { if (job->type==SHARP_MAP2ALM) { - if ((nm&1023)==0) nm+=3; // hack to avoid critical strides job->s_m=2*job->ntrans*job->nmaps; + if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides job->s_th=job->s_m*nm; } else { - if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides job->s_th=2*job->ntrans*job->nmaps; + if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides job->s_m=job->s_th*ntheta; } job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); @@ -502,22 +520,28 @@ static void 
alloc_almtmp (sharp_job *job, int lmax) static void dealloc_almtmp (sharp_job *job) { DEALLOC(job->almtmp); } -static void alm2almtmp (sharp_job *job, int lmax, int mi) +NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi) { -#define COPY_LOOP(real_t, source_t, expr_of_x) \ - for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) \ +#define COPY_LOOP(real_t, source_t, expr_of_x) \ + { \ + for (int l=m; l<lmin; ++l) \ + for (int i=0; i<job->ntrans*job->nalm; ++i) \ + job->almtmp[job->ntrans*job->nalm*l+i] = 0; \ + for (int l=lmin; l<=lmax; ++l) \ for (int i=0; i<job->ntrans*job->nalm; ++i) \ { \ - source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \ - job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \ - } + source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \ + job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \ + } \ + } if (job->type!=SHARP_MAP2ALM) { ptrdiff_t ofs=job->ainfo->mvstart[mi]; int stride=job->ainfo->stride; int m=job->ainfo->mval[mi]; + int lmin=(m<job->spin) ? job->spin : m; /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the coefficients by sqrt_one_half; here we must compensate to avoid scaling m=0 */ @@ -562,17 +586,17 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi) } } else - SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi], - job->ntrans*job->nalm*(lmax+1),0.); + memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0, + job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx)); #undef COPY_LOOP } -static void almtmp2alm (sharp_job *job, int lmax, int mi) +NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi) { #define COPY_LOOP(real_t, target_t, expr_of_x) \ - for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) \ + for (int l=lmin; l<=lmax; ++l) \ for (int i=0; i<job->ntrans*job->nalm; ++i) \ { \ dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i]; \ @@ -583,6 +607,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi) ptrdiff_t ofs=job->ainfo->mvstart[mi]; int stride=job->ainfo->stride; int m=job->ainfo->mval[mi]; + int lmin=(m<job->spin) ? 
job->spin : m; /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the coefficients by sqrt_two; here we must compensate to avoid scaling m=0 */ @@ -629,27 +654,56 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi) #undef COPY_LOOP } -static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, - int rstride) +NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, + const double *ringtmp, int rstride) { - double **dmap = (double **)job->map; - float **fmap = (float **)job->map; - for (int i=0; i<job->ntrans*job->nmaps; ++i) - for (int m=0; m<ri->nph; ++m) - if (job->flags & SHARP_DP) - dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1]; + if (job->flags & SHARP_DP) + { + double **dmap = (double **)job->map; + for (int i=0; i<job->ntrans*job->nmaps; ++i) + { + double *restrict p1=&dmap[i][ri->ofs]; + const double *restrict p2=&ringtmp[i*rstride+1]; + if (ri->stride==1) + { + if (job->flags&SHARP_ADD) + for (int m=0; m<ri->nph; ++m) + p1[m] += p2[m]; + else + memcpy(p1,p2,ri->nph*sizeof(double)); + } else + for (int m=0; m<ri->nph; ++m) + p1[m*ri->stride] += p2[m]; + } + } + else + { + float **fmap = (float **)job->map; + for (int i=0; i<job->ntrans*job->nmaps; ++i) + for (int m=0; m<ri->nph; ++m) fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1]; + } } -static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp, - int rstride) +NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, + double *ringtmp, int rstride) { - for (int i=0; i<job->ntrans*job->nmaps; ++i) - for (int m=0; m<ri->nph; ++m) - ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ? - ((double *)(job->map[i]))[ri->ofs+m*ri->stride] : - ((float *)(job->map[i]))[ri->ofs+m*ri->stride]; + if (job->flags & SHARP_DP) + for (int i=0; i<job->ntrans*job->nmaps; ++i) + { + double *restrict p1=&ringtmp[i*rstride+1], + *restrict p2=&(((double *)(job->map[i]))[ri->ofs]); + if (ri->stride==1) + memcpy(p1,p2,ri->nph*sizeof(double)); + else + for (int m=0; m<ri->nph; ++m) + p1[m] = p2[m*ri->stride]; + } + else + for (int i=0; i<job->ntrans*job->nmaps; ++i) + for (int m=0; m<ri->nph; ++m) + ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride]; } static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, @@ -693,7 +747,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax, } //FIXME: set phase to zero if not SHARP_MAP2ALM? 
-static void map2phase (sharp_job *job, int mmax, int llim, int ulim) +NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim) { if (job->type != SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -738,7 +792,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim) } } -static void phase2map (sharp_job *job, int mmax, int llim, int ulim) +NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim) { if (job->type == SHARP_MAP2ALM) return; int pstride = job->s_m; @@ -783,7 +837,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim) } } -static void sharp_execute_job (sharp_job *job) +NOINLINE static void sharp_execute_job (sharp_job *job) { double timer=wallTime(); job->opcnt=0; @@ -800,6 +854,7 @@ static void sharp_execute_job (sharp_job *job) int nchunks, chunksize; get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks, &chunksize); +//FIXME: needs to be changed to "nm" alloc_phase (job,mmax+1,chunksize); /* chunk loop */ diff --git a/libsharp/sharp_almhelpers.c b/libsharp/sharp_almhelpers.c index 6a98309..12ce600 100644 --- a/libsharp/sharp_almhelpers.c +++ b/libsharp/sharp_almhelpers.c @@ -25,7 +25,7 @@ /*! \file sharp_almhelpers.c * Spherical transform library * - * Copyright (C) 2008-2013 Max-Planck-Society + * Copyright (C) 2008-2016 Max-Planck-Society * \author Martin Reinecke */ diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h index 3bff317..67016d7 100644 --- a/libsharp/sharp_almhelpers.h +++ b/libsharp/sharp_almhelpers.h @@ -25,7 +25,7 @@ /*! \file sharp_almhelpers.h * SHARP helper function for the creation of a_lm data structures * - * Copyright (C) 2008-2011 Max-Planck-Society + * Copyright (C) 2008-2016 Max-Planck-Society * \author Martin Reinecke */ diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h index 9227ca9..86d1153 100644 --- a/libsharp/sharp_complex_hacks.h +++ b/libsharp/sharp_complex_hacks.h @@ -25,7 +25,7 @@ /* \file sharp_complex_hacks.h * support for converting vector types and complex numbers * - * Copyright (C) 2012,2013 Max-Planck-Society + * Copyright (C) 2012-2016 Max-Planck-Society * Author: Martin Reinecke */ @@ -51,6 +51,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, complex double * restrict c1, complex double * restrict c2) { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; } +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, + complex double * restrict cc) + { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; } + #endif #if (VLEN==2) @@ -94,6 +98,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, #endif } +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, + complex double * restrict cc) + { vhsum_cmplx2(a,b,c,d,cc,cc+1); } + #endif #if (VLEN==4) @@ -130,6 +138,23 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, #endif } +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, + complex double * restrict cc) + { + Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); + Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), + tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); + tmp1=vadd(tmp3,tmp4); +#ifdef UNSAFE_CODE + _mm256_storeu_pd((double *)cc, + _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1)); +#else + union {Tv v; complex double c[2]; } u; + u.v=tmp1; + cc[0]+=u.c[0]; cc[1]+=u.c[1]; +#endif + } + #endif #if (VLEN==8) @@ -144,6 +169,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, *c2 += 
_mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d); } +static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, + complex double * restrict cc) + { vhsum_cmplx2(a,b,c,d,cc,cc+1); } + #endif #endif diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c index 747658c..8a36bfe 100644 --- a/libsharp/sharp_core_inc.c +++ b/libsharp/sharp_core_inc.c @@ -25,7 +25,7 @@ /*! \file sharp_core_inc.c * Type-dependent code for the computational core * - * Copyright (C) 2012 Max-Planck-Society + * Copyright (C) 2012-2017 Max-Planck-Society * \author Martin Reinecke */ @@ -73,8 +73,8 @@ static inline void Y(Tbmuleq)(Tb * restrict a, Tb b) static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, double maxval) { - const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval); + const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig); for (int i=0;i<nvec; ++i) { Tm mask = vgt(vabs(val->v[i]),vfmax); @@ -94,35 +94,58 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale, } } -static void Y(mypow) (Tb val, int npow, Tb * restrict resd, - Tb * restrict ress) +NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit, + Tb * restrict resd, Tb * restrict ress) { - Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); - - Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); - - do + Tv vminv=vload(powlimit[npow]); + Tm mask = vlt(vabs(val.v[0]),vminv); + for (int i=1;i<nvec; ++i) + mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv)); + if (!vanyTrue(mask)) // no underflows possible, use quick algorithm + { + Tb res=Y(Tbconst)(1.); + do + { + if (npow&1) + for (int i=0; i<nvec; ++i) + { + vmuleq(res.v[i],val.v[i]); + vmuleq(val.v[i],val.v[i]); + } + else + for (int i=0; i<nvec; ++i) + vmuleq(val.v[i],val.v[i]); + } + while(npow>>=1); + *resd=res; + *ress=Y(Tbconst)(0.); + } + else { - if (npow&1) + Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); + Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); + do { + if (npow&1) + { + for (int i=0; i<nvec; ++i) + { + vmuleq(res.v[i],val.v[i]); + vaddeq(scale.v[i],scaleint.v[i]); + } + Y(Tbnormalize)(&res,&scale,sharp_fbighalf); + } for (int i=0; i<nvec; ++i) { - vmuleq(res.v[i],val.v[i]); - vaddeq(scale.v[i],scaleint.v[i]); + vmuleq(val.v[i],val.v[i]); + vaddeq(scaleint.v[i],scaleint.v[i]); } - Y(Tbnormalize)(&res,&scale,sharp_fbighalf); - } - for (int i=0; i<nvec; ++i) - { - vmuleq(val.v[i],val.v[i]); - vaddeq(scaleint.v[i],scaleint.v[i]); + Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); } - Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf); + while(npow>>=1); + *resd=res; + *ress=scale; } - while(npow>>=1); - - *resd=res; - *ress=scale; } static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2, @@ -179,13 +202,13 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac, *corfac=corf.b; } -static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, +NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_, const sharp_Ylmgen_C * restrict gen) { int l=gen->m; Tb lam_1=Y(Tbconst)(0.), lam_2, scale; - Y(mypow) (sth,l,&lam_2,&scale); + Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale); Y(Tbmuleq1) (&lam_2,(gen->m&1) ?
-gen->mfac[gen->m]:gen->mfac[gen->m]); Y(Tbnormalize)(&lam_2,&scale,sharp_ftol); @@ -193,12 +216,12 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_, while (below_limit) { if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} - Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]); for (int i=0; i<nvec; ++i) - lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); - r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]); + lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i]) + - vload(gen->rf[l].f[1])*lam_1.v[i]; for (int i=0; i<nvec; ++i) - lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); + lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) + - vload(gen->rf[l+1].f[1])*lam_2.v[i]; if (Y(rescale)(&lam_1,&lam_2,&scale)) below_limit = Y(TballLt)(scale,sharp_limscale); l+=2; @@ -213,10 +236,8 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm, Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]); for (int i=0; i<nvec; ++i) { - rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])), - vmul(fx2,rxp->v[i])); - rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])), - vmul(fx2,rxm->v[i])); + rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i]; + rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i]; } } @@ -240,8 +261,10 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_, } Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps; - Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps); - Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps); + Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps); + Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps); + Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps); + Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps); Tb rec2p, rec2m, scalep, scalem; Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.); diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c index 8590d2c..7a34e40 100644 --- a/libsharp/sharp_core_inc0.c +++ b/libsharp/sharp_core_inc0.c @@ -25,7 +25,7 @@ /*! \file sharp_core_inc0.c * Computational core * - * Copyright (C) 2012-2013 Max-Planck-Society + * Copyright (C) 2012-2018 Max-Planck-Society * \author Martin Reinecke */ @@ -34,7 +34,6 @@ #include <string.h> #include "sharp_vecsupport.h" #include "sharp_complex_hacks.h" -#include "sharp_ylmgen_c.h" #include "sharp.h" #include "sharp_core.h" #include "c_utils.h" diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c index 5c9b4ab..9a2e26b 100644 --- a/libsharp/sharp_core_inc2.c +++ b/libsharp/sharp_core_inc2.c @@ -25,11 +25,11 @@ /*! 
\file sharp_core_inc2.c * Type-dependent code for the computational core * - * Copyright (C) 2012-2013 Max-Planck-Society + * Copyright (C) 2012-2017 Max-Planck-Society * \author Martin Reinecke */ -static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, +NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, int l, int lmax NJ1) @@ -77,29 +77,32 @@ if (njobs>1) } while (l<lmax) { - Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]); for (int i=0; i<nvec; ++i) - lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); + lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i]) + - vload(rf[l].f[1])*lam_1.v[i]; for (int j=0; j<njobs; ++j) { Tv ar=vload(creal(alm[njobs*l+j])), ai=vload(cimag(alm[njobs*l+j])); for (int i=0; i<nvec; ++i) { - vfmaeq(p1[j].r.v[i],lam_2.v[i],ar); - vfmaeq(p1[j].i.v[i],lam_2.v[i],ai); + p1[j].r.v[i] += lam_2.v[i]*ar; + p1[j].i.v[i] += lam_2.v[i]*ai; } - ar=vload(creal(alm[njobs*(l+1)+j])); - ai=vload(cimag(alm[njobs*(l+1)+j])); + } + for (int i=0; i<nvec; ++i) + lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) + - vload(rf[l+1].f[1])*lam_2.v[i]; + for (int j=0; j<njobs; ++j) + { + Tv ar=vload(creal(alm[njobs*(l+1)+j])), + ai=vload(cimag(alm[njobs*(l+1)+j])); for (int i=0; i<nvec; ++i) { - vfmaeq(p2[j].r.v[i],lam_1.v[i],ar); - vfmaeq(p2[j].i.v[i],lam_1.v[i],ai); + p2[j].r.v[i] += lam_1.v[i]*ar; + p2[j].i.v[i] += lam_1.v[i]*ai; } } - r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); - for (int i=0; i<nvec; ++i) - lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); l+=2; } if (l==lmax) @@ -109,64 +112,57 @@ if (njobs>1) Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j])); for (int i=0; i<nvec; ++i) { - vfmaeq(p1[j].r.v[i],lam_2.v[i],ar); - vfmaeq(p1[j].i.v[i],lam_2.v[i],ai); + p1[j].r.v[i] += lam_2.v[i]*ar; + p1[j].i.v[i] += lam_2.v[i]*ai; } } } } -static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, - const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax +NOINLINE static void Z(map2alm_kernel) (const Tb cth, + const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, + const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp NJ1) { while (l<lmax) { - Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]); for (int i=0; i<nvec; ++i) - lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); + lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i]) + - vload(rf[l].f[1])*lam_1.v[i]; for (int j=0; j<njobs; ++j) - { - Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero; for (int i=0; i<nvec; ++i) { - vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]); - vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]); + atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i]; + atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i]; } + for (int i=0; i<nvec; ++i) + lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i]) + - vload(rf[l+1].f[1])*lam_2.v[i]; + for (int j=0; j<njobs; ++j) for (int i=0; i<nvec; ++i) { - vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]); - vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]); + atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i]; + atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i]; } - vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]); - } - r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); - for (int i=0; i<nvec; ++i) - lam_2.v[i] = 
vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); l+=2; } if (l==lmax) { for (int j=0; j<njobs; ++j) - { - Tv tre=vzero, tim=vzero; for (int i=0; i<nvec; ++i) { - vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]); - vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]); + atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i]; + atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i]; } - alm[l*njobs+j]+=vhsum_cmplx(tre,tim); - } } } -static void Z(calc_alm2map) (const Tb cth, const Tb sth, +NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2 NJ1) { int l,lmax=gen->lmax; - Tb lam_1,lam_2,scale; + Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; if (l>lmax) return; @@ -219,12 +215,12 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth, Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); } -static void Z(calc_map2alm) (const Tb cth, const Tb sth, +NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, - const Y(Tbri) * restrict p2 NJ1) + const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1) { int lmax=gen->lmax; - Tb lam_1,lam_2,scale; + Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale; int l=gen->m; Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); job->opcnt += (l-gen->m) * 4*VLEN*nvec; @@ -234,40 +230,31 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth, const sharp_ylmgen_dbl2 * restrict rf = gen->rf; Tb corfac; Y(getCorfac)(scale,&corfac,gen->cf); - dcmplx * restrict alm=job->almtmp; int full_ieee = Y(TballGe)(scale,sharp_minscale); while (!full_ieee) { for (int j=0; j<njobs; ++j) - { - Tv tre=vzero, tim=vzero; for (int i=0; i<nvec; ++i) { - Tv tmp=vmul(lam_2.v[i],corfac.v[i]); - vfmaeq(tre,tmp,p1[j].r.v[i]); - vfmaeq(tim,tmp,p1[j].i.v[i]); + Tv tmp=lam_2.v[i]*corfac.v[i]; + atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i]; + atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i]; } - alm[l*njobs+j]+=vhsum_cmplx(tre,tim); - } if (++l>lmax) return; - Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); for (int i=0; i<nvec; ++i) - lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1)); + lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i]) + - vload(rf[l-1].f[1])*lam_1.v[i]; for (int j=0; j<njobs; ++j) - { - Tv tre=vzero, tim=vzero; for (int i=0; i<nvec; ++i) { - Tv tmp=vmul(lam_1.v[i],corfac.v[i]); - vfmaeq(tre,tmp,p2[j].r.v[i]); - vfmaeq(tim,tmp,p2[j].i.v[i]); + Tv tmp=lam_1.v[i]*corfac.v[i]; + atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i]; + atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i]; } - alm[l*njobs+j]+=vhsum_cmplx(tre,tim); - } if (++l>lmax) return; - r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); for (int i=0; i<nvec; ++i) - lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1)); + lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i]) + - vload(rf[l-1].f[1])*lam_2.v[i]; if (Y(rescale)(&lam_1,&lam_2,&scale)) { Y(getCorfac)(scale,&corfac,gen->cf); @@ -276,7 +263,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth, } Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); - Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2); + Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2); } static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, @@ -317,8 +304,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, 
acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1])); for (int i=0; i<nvec; ++i) { - Tv lw1=vadd(r2p.v[i],r2m.v[i]); - Tv lx2=vsub(r1m.v[i],r1p.v[i]); + Tv lw1=r2p.v[i]+r2m.v[i]; + Tv lx2=r1m.v[i]-r1p.v[i]; vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2); vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2); vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2); @@ -326,8 +313,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, } for (int i=0; i<nvec; ++i) { - Tv lx1=vsub(r2m.v[i],r2p.v[i]); - Tv lw2=vadd(r1p.v[i],r1m.v[i]); + Tv lx1=r2m.v[i]-r2p.v[i]; + Tv lw2=r1p.v[i]+r1m.v[i]; vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1); vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1); vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1); @@ -359,11 +346,11 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px, vfmaeq(acr,py[j].qi.v[i],lx); vfmseq(aci,py[j].qr.v[i],lx); } - vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); + vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]); } } -static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, +NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax NJ1) @@ -374,10 +361,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, fx2=vload(fx[l+1].f[2]); for (int i=0; i<nvec; ++i) { - rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])), - vmul(fx2,rec1p.v[i])); - rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])), - vmul(fx2,rec1m.v[i])); + rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i]; + rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i]; } Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], &alm[2*njobs*(l+1)] NJ2); @@ -385,10 +370,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, fx2=vload(fx[l+2].f[2]); for (int i=0; i<nvec; ++i) { - rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])), - vmul(fx2,rec2p.v[i])); - rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])), - vmul(fx2,rec2m.v[i])); + rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i]; + rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i]; } l+=2; } @@ -396,7 +379,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2); } -static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1, +NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax NJ1) @@ -429,7 +412,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1, Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2); } -static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, +NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2 NJ1) { @@ -475,7 +458,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth, lmax NJ2); } -static void Z(calc_map2alm_spin) (Tb cth, Tb sth, +NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth, const sharp_Ylmgen_C * restrict gen, sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1) { @@ -539,7 +522,7 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, } } -static void 
Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, +NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax NJ1) @@ -572,7 +555,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1, Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2); } -static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, +NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2 NJ1) { @@ -621,7 +604,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth, #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) -static void Z(inner_loop) (sharp_job *job, const int *ispair, +NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair, const double *cth_, const double *sth_, int llim, int ulim, sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) { @@ -722,10 +705,30 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, } break; } + default: + { + UTIL_FAIL("must not happen"); + break; + } + } + } + +NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) + { + const int nval=nvec*VLEN; + const int m = job->ainfo->mval[mi]; + sharp_Ylmgen_prepare (gen, m); + + switch (job->type) + { case SHARP_MAP2ALM: { if (job->spin==0) { + Tv atmp[2*njobs*(gen->lmax+1)]; + memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv)); for (int ith=0; ith<ulim-llim; ith+=nval) { Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2); @@ -751,8 +754,15 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, } } if (!skip) - Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2); + Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2); } + { + int istart=m*njobs, istop=(gen->lmax+1)*njobs; + for(; istart<istop-2; istart+=2) + vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart])); + for(; istart<istop; istart++) + job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]); + } } else { @@ -800,4 +810,13 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair, } } +static void Z(inner_loop) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, + sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1) + { + (job->type==SHARP_MAP2ALM) ? + Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) : + Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2); + } + #undef VZERO diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h index f8b2365..2c37505 100644 --- a/libsharp/sharp_cxx.h +++ b/libsharp/sharp_cxx.h @@ -25,13 +25,14 @@ /*! 
\file sharp_cxx.h * Spherical transform library * - * Copyright (C) 2012-2015 Max-Planck-Society + * Copyright (C) 2012-2016 Max-Planck-Society * \author Martin Reinecke */ #ifndef PLANCK_SHARP_CXX_H #define PLANCK_SHARP_CXX_H +#include <complex> #include "sharp_lowlevel.h" #include "sharp_geomhelpers.h" #include "sharp_almhelpers.h" @@ -107,19 +108,30 @@ template<typename T> class sharp_cxxjob: public sharp_base private: static void *conv (T *ptr) { return reinterpret_cast<void *>(ptr); } + static void *conv (std::complex<T> *ptr) + { return reinterpret_cast<void *>(ptr); } static void *conv (const T *ptr) { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); } + static void *conv (const std::complex<T> *ptr) + { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); } public: - void alm2map (const T *alm, T *map, bool add) + void alm2map (const T *alm, T *map, bool add) const { void *aptr=conv(alm), *mptr=conv(map); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1, flags,0,0); } - void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2, - int spin, bool add) + void alm2map (const std::complex<T> *alm, T *map, bool add) const + { + void *aptr=conv(alm), *mptr=conv(map); + int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); + sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1, + flags,0,0); + } + void alm2map_spin (const T *alm1, const T *alm2, + T *map1, T *map2, int spin, bool add) const { void *aptr[2], *mptr[2]; aptr[0]=conv(alm1); aptr[1]=conv(alm2); @@ -127,21 +139,65 @@ template<typename T> class sharp_cxxjob: public sharp_base int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0); } - void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) + void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2, + T *map1, T *map2, int spin, bool add) const + { + void *aptr[2], *mptr[2]; + aptr[0]=conv(alm1); aptr[1]=conv(alm2); + mptr[0]=conv(map1); mptr[1]=conv(map2); + int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); + sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0); + } + void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const + { + void *aptr=conv(alm), *mptr[2]; + mptr[0]=conv(map1); mptr[1]=conv(map2); + int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); + sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0); + } + void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add) + const { void *aptr=conv(alm), *mptr[2]; mptr[0]=conv(map1); mptr[1]=conv(map2); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0); } - void map2alm (const T *map, T *alm, bool add) + void alm2map_adjoint (const T *map, T *alm, bool add) const + { + void *aptr=conv(alm), *mptr=conv(map); + int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); + sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0); + } + void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const + { + void *aptr=conv(alm), *mptr=conv(map); + int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); + sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0); + } + void map2alm (const T *map, T *alm, bool add) const + { + void *aptr=conv(alm), *mptr=conv(map); + int flags=cxxjobhelper__<T>::val | (add ? 
SHARP_ADD : 0); + sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0); + } + void map2alm (const T *map, std::complex<T> *alm, bool add) const { void *aptr=conv(alm), *mptr=conv(map); int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0); } void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2, - int spin, bool add) + int spin, bool add) const + { + void *aptr[2], *mptr[2]; + aptr[0]=conv(alm1); aptr[1]=conv(alm2); + mptr[0]=conv(map1); mptr[1]=conv(map2); + int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0); + sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0); + } + void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1, + std::complex<T> *alm2, int spin, bool add) const { void *aptr[2], *mptr[2]; aptr[0]=conv(alm1); aptr[1]=conv(alm2); diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c index 0aed60d..8efb8a0 100644 --- a/libsharp/sharp_geomhelpers.c +++ b/libsharp/sharp_geomhelpers.c @@ -25,9 +25,8 @@ /*! \file sharp_geomhelpers.c * Spherical transform library * - * Copyright (C) 2006-2012 Max-Planck-Society<br> - * Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl) - * \author Martin Reinecke \author Pavel Holoborodko + * Copyright (C) 2006-2018 Max-Planck-Society<br> + * \author Martin Reinecke */ #include <math.h> @@ -35,7 +34,6 @@ #include "sharp_legendre_roots.h" #include "c_utils.h" #include "pocketfft/pocketfft.h" -#include <stdio.h> void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings, const int *rings, const double *weight, sharp_geom_info **geom_info) diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c index 2b1c7af..f02f9fd 100644 --- a/libsharp/sharp_testsuite.c +++ b/libsharp/sharp_testsuite.c @@ -23,7 +23,7 @@ */ /* \file sharp_testsuite.c - * + * * Copyright (C) 2012-2013 Max-Planck-Society * \author Martin Reinecke */ @@ -50,9 +50,9 @@ typedef complex double dcmplx; int ntasks, mytask; -static double drand (double min, double max, int *state) +static double drand (double min, double max, unsigned *state) { - *state = (((*state) * 1103515245) + 12345) & 0x7fffffff; + *state = (((*state) * 1103515245u) + 12345u) & 0x7fffffffu; return min + (max-min)*(*state)/(0x7fffffff+1.0); } @@ -65,7 +65,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt) for (mi=0;mi<helper->nm; ++mi) { int m=helper->mval[mi]; - int state=1234567*cnt+8912*m; // random seed + unsigned state=1234567u*(unsigned)cnt+8912u*(unsigned)m; // random seed for (int l=m;l<=helper->lmax; ++l) { if ((l<spin)&&(m<spin)) diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h index ee4c5e7..5250948 100644 --- a/libsharp/sharp_vecsupport.h +++ b/libsharp/sharp_vecsupport.h @@ -25,7 +25,7 @@ /* \file sharp_vecsupport.h * Convenience functions for vector arithmetics * - * Copyright (C) 2012,2013 Max-Planck-Society + * Copyright (C) 2012-2016 Max-Planck-Society * Author: Martin Reinecke */ @@ -72,6 +72,7 @@ typedef int Tm; #define vge(a,b) ((a)>=(b)) #define vne(a,b) ((a)!=(b)) #define vand_mask(a,b) ((a)&&(b)) +#define vor_mask(a,b) ((a)||(b)) #define vstoreu(p, a) (*(p)=a) #define vstoreu_s(p, a) (*(p)=a) @@ -138,6 +139,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b) #define vge(a,b) _mm_cmpge_pd(a,b) #define vne(a,b) _mm_cmpneq_pd(a,b) #define vand_mask(a,b) _mm_and_pd(a,b) +#define vor_mask(a,b) _mm_or_pd(a,b) #define vmin(a,b) _mm_min_pd(a,b) #define 
vmax(a,b) _mm_max_pd(a,b); #define vanyTrue(a) (_mm_movemask_pd(a)!=0) @@ -183,6 +185,13 @@ typedef __m256d Tm; #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a)) #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a)) #else +#if (USE_FMA) +#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a) +#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a) +#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a) +#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a)) +#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a)) +#else #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c)) #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c)) @@ -191,6 +200,7 @@ typedef __m256d Tm; #define vfmaseq(a,b,c,d,e) \ a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) #endif +#endif #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) #define vload(a) _mm256_set1_pd(a) #define vload_s(a) _mm256_set1_ps(a) @@ -201,6 +211,7 @@ typedef __m256d Tm; #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ) #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) #define vand_mask(a,b) _mm256_and_pd(a,b) +#define vor_mask(a,b) _mm256_or_pd(a,b) #define vmin(a,b) _mm256_min_pd(a,b) #define vmax(a,b) _mm256_max_pd(a,b) #define vanyTrue(a) (_mm256_movemask_pd(a)!=0) @@ -242,6 +253,7 @@ typedef __mmask8 Tm; #define vge(a,b) _mm512_cmpnlt_pd_mask(a,b) #define vne(a,b) _mm512_cmpneq_pd_mask(a,b) #define vand_mask(a,b) ((a)&(b)) +#define vor_mask(a,b) ((a)|(b)) #define vmin(a,b) _mm512_min_pd(a,b) #define vmax(a,b) _mm512_max_pd(a,b) #define vanyTrue(a) (a!=0) diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c index 6e8cee5..785e063 100644 --- a/libsharp/sharp_ylmgen_c.c +++ b/libsharp/sharp_ylmgen_c.c @@ -25,7 +25,7 @@ /* * Helper code for efficient calculation of Y_lm(theta,phi=0) * - * Copyright (C) 2005-2014 Max-Planck-Society + * Copyright (C) 2005-2016 Max-Planck-Society * Author: Martin Reinecke */ @@ -59,6 +59,12 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin) gen->cf[m]=gen->cf[m+1]*sharp_fsmall; for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m) gen->cf[m]=gen->cf[m-1]*sharp_fbig; + gen->powlimit=RALLOC(double,m_max+spin+1); + gen->powlimit[0]=0.; + const double ln2 = 0.6931471805599453094172321214581766; + const double expo=-400*ln2; + for (int m=1; m<=m_max+spin; ++m) + gen->powlimit[m]=exp(expo/m); gen->m = -1; if (spin==0) @@ -124,6 +130,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin) void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen) { DEALLOC(gen->cf); + DEALLOC(gen->powlimit); if (gen->s==0) { DEALLOC(gen->rf); diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h index 3328f76..63b23cd 100644 --- a/libsharp/sharp_ylmgen_c.h +++ b/libsharp/sharp_ylmgen_c.h @@ -25,7 +25,7 @@ /*! \file sharp_ylmgen_c.h * Code for efficient calculation of Y_lm(phi=0,theta) * - * Copyright (C) 2005-2012 Max-Planck-Society + * Copyright (C) 2005-2016 Max-Planck-Society * \author Martin Reinecke */ @@ -49,6 +49,7 @@ typedef struct /* for public use; immutable during lifetime */ int lmax, mmax, s; double *cf; + double *powlimit; /* for public use; will typically change after call to Ylmgen_prepare() */ int m; -- GitLab
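
The notes below sketch, in plain C, the main techniques this patch introduces; every helper name in them is an illustrative stand-in, not libsharp API.

The patch hardens two places against "critical strides": util_malloc_ keeps padding the allocation size, and alloc_phase now tests the actual byte stride (s_m*16*nm, i.e. elements times sizeof(dcmplx)) rather than the raw element count before padding. The idea in isolation, as a sketch (pad_count and the 4 kB grid are illustrative; the patch uses a 1 kB grid and adds 3 entries at once):

#include <stddef.h>

/* If consecutive rows of a 2-D array are separated by a multiple of a
   large power of two, they compete for the same cache sets.  Padding
   the row length by a few elements moves the byte stride off that
   grid, which is what manipsize()/alloc_phase do in the patch. */
static size_t pad_count (size_t n, size_t elemsize)
  {
  while (((n*elemsize) & 4095) == 0)  /* byte stride on a 4 kB grid? */
    ++n;                              /* append a dummy element */
  return n;
  }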
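ringhelper gains a length field so that updating the cached FFT plan no longer calls rfft_length() on every ring. As the hunk reads, the length check still runs right after the initial make_rfft_plan while self->length is 0, so the very first update plans twice; testing the cached length first avoids that. A reduced sketch (plan_cache is a stand-in struct; make_rfft_plan/destroy_rfft_plan are the pocketfft calls already used in sharp.c):

#include <stddef.h>
#include "pocketfft/pocketfft.h"

typedef struct { rfft_plan plan; int length; } plan_cache; /* stand-in */

static void plan_cache_update (plan_cache *pc, int nph)
  {
  if (pc->plan && pc->length!=nph)   /* cached plan has wrong size */
    { destroy_rfft_plan(pc->plan); pc->plan=NULL; }
  if (!pc->plan)                     /* (re)build and remember size */
    { pc->plan=make_rfft_plan(nph); pc->length=nph; }
  }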
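clear_map, ringtmp2ring and ring2ringtmp now special-case stride-1 rings with memset/memcpy and fall back to an explicitly strided loop otherwise. The dispatch in isolation (copy_ring is an illustrative helper, not a library function):

#include <stddef.h>
#include <string.h>

/* Contiguous rings take the memcpy fast path; strided rings fall back
   to a plain loop, exactly as in the reworked ring copy routines. */
static void copy_ring (double *restrict dst, ptrdiff_t stride,
  const double *restrict src, ptrdiff_t n)
  {
  if (stride==1)
    memcpy(dst, src, (size_t)n*sizeof(double));
  else
    for (ptrdiff_t i=0; i<n; ++i)
      dst[i*stride] = src[i];
  }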
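The new vhsum_cmplx_special differs from vhsum_cmplx2 only in that both results land in adjacent complex numbers, which lets the AVX version finish with a single 32-byte load/add/store (the UNSAFE_CODE branch). A portable reference of what it computes, useful for checking the intrinsics path (vhsum_cmplx_ref and vlen are illustrative):

#include <complex.h>

/* cc[0] accumulates sum(a) + i*sum(b), cc[1] accumulates
   sum(c) + i*sum(d), with a..d holding 'vlen' lanes each. */
static void vhsum_cmplx_ref (const double *a, const double *b,
  const double *c, const double *d, int vlen,
  complex double * restrict cc)
  {
  double sa=0., sb=0., sc=0., sd=0.;
  for (int i=0; i<vlen; ++i)
    { sa+=a[i]; sb+=b[i]; sc+=c[i]; sd+=d[i]; }
  cc[0] += sa + _Complex_I*sb;
  cc[1] += sc + _Complex_I*sd;
  }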
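mypow now takes the powlimit table built in sharp_Ylmgen_init: powlimit[n] = exp(-400*ln2/n), so |x| >= powlimit[n] guarantees |x|^n >= 2^-400 and plain binary powering cannot underflow; only when some SIMD lane is below that threshold does the slow path with explicit scaling run. A scalar sketch of the same two-path scheme (the slow path here uses frexp/ldexp, whereas the library keeps a per-lane block exponent via Tbnormalize):

#include <math.h>

static double mypow_sketch (double x, int n, const double *powlimit)
  {
  if (fabs(x)>=powlimit[n])  /* quick path: no underflow possible */
    {
    double res=1.;
    do
      {
      if (n&1) res*=x;       /* plain exponentiation by squaring */
      x*=x;
      }
    while (n>>=1);
    return res;
    }
  /* slow path: carry the binary exponent separately so intermediate
     products never leave the normal range */
  int e, ex, er=0;
  double res=1.;
  x=frexp(x,&ex);            /* x = mantissa * 2^ex */
  do
    {
    if (n&1)
      { res*=x; er+=ex; res=frexp(res,&e); er+=e; }
    x*=x; ex+=ex;
    x=frexp(x,&e); ex+=e;
    }
  while (n>>=1);
  return ldexp(res,er);      /* only the final result may underflow */
  }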
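The reworked map2alm kernels no longer horizontally reduce SIMD lanes into almtmp for every l; they accumulate lane-wise into an atmp buffer and reduce once per m, after the ring loop (the vhsum_cmplx_special calls at the end of Z(inner_loop_m2a)). The shape of that change, sketched in scalar C with a stand-in lane count VL:

#include <complex.h>

#define VL 4  /* stand-in for the VLEN*nvec lanes of a Tb */

/* hot loop: lane-wise multiply-adds only, no horizontal reduction */
static void accumulate (double atmp[][2][VL], const double lam[VL],
  const double pr[VL], const double pi[VL], int l)
  {
  for (int i=0; i<VL; ++i)
    {
    atmp[l][0][i] += lam[i]*pr[i];  /* real part, per lane */
    atmp[l][1][i] += lam[i]*pi[i];  /* imaginary part, per lane */
    }
  }

/* once per m: collapse the lanes into the complex coefficients */
static void reduce (const double atmp[][2][VL],
  complex double *almtmp, int lmin, int lmax)
  {
  for (int l=lmin; l<=lmax; ++l)
    {
    double sr=0., si=0.;
    for (int i=0; i<VL; ++i)
      { sr+=atmp[l][0][i]; si+=atmp[l][1][i]; }
    almtmp[l] += sr + _Complex_I*si;
    }
  }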
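Finally, the testsuite's drand switches its LCG state from int to unsigned for a reason the diff does not spell out: signed overflow in state*1103515245 is undefined behaviour in C, while unsigned arithmetic wraps modulo 2^32, which is exactly what a linear congruential generator needs. The fixed generator, for reference:

static double drand_sketch (double min, double max, unsigned *state)
  {
  /* unsigned wraparound is well defined; the old int state was not */
  *state = ((*state) * 1103515245u + 12345u) & 0x7fffffffu;
  return min + (max-min)*(*state)/(0x7fffffff+1.0);
  }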