From 18c82762c3a4262d363c4feadeb79fbb693d00b9 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 26 Oct 2018 14:36:25 +0200
Subject: [PATCH] performance updates (NOINLINE, stride-1 fast paths, FMA, mypow) and std::complex overloads

---
 c_utils/c_utils.c              |   6 +-
 c_utils/c_utils.h              |   8 +-
 libsharp/sharp.c               | 195 +++++++++++++++++++++------------
 libsharp/sharp_almhelpers.c    |   2 +-
 libsharp/sharp_almhelpers.h    |   2 +-
 libsharp/sharp_complex_hacks.h |  31 +++++-
 libsharp/sharp_core_inc.c      |  93 ++++++++++------
 libsharp/sharp_core_inc0.c     |   3 +-
 libsharp/sharp_core_inc2.c     | 185 +++++++++++++++++--------------
 libsharp/sharp_cxx.h           |  70 ++++++++++--
 libsharp/sharp_geomhelpers.c   |   6 +-
 libsharp/sharp_testsuite.c     |   8 +-
 libsharp/sharp_vecsupport.h    |  14 ++-
 libsharp/sharp_ylmgen_c.c      |   9 +-
 libsharp/sharp_ylmgen_c.h      |   3 +-
 15 files changed, 420 insertions(+), 215 deletions(-)

diff --git a/c_utils/c_utils.c b/c_utils/c_utils.c
index 96bd765..9344a6d 100644
--- a/c_utils/c_utils.c
+++ b/c_utils/c_utils.c
@@ -25,7 +25,7 @@
 /*
  *  Convenience functions
  *
- *  Copyright (C) 2008, 2009, 2010, 2011, 2012 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -44,7 +44,7 @@ void util_warn_ (const char *file, int line, const char *func, const char *msg)
 
 /* This function tries to avoid allocations with a total size close to a high
    power of two (called the "critical stride" here), by adding a few more bytes
-   if necssary. This lowers the probability that two arrays differ by a multiple
+   if necessary. This lowers the probability that two arrays differ by a multiple
    of the critical stride in their starting address, which in turn lowers the
    risk of cache line contention. */
 static size_t manipsize(size_t sz)
@@ -61,7 +61,7 @@ void *util_malloc_ (size_t sz)
   {
   void *res;
   if (sz==0) return NULL;
-  res = _mm_malloc(manipsize(sz),16);
+  res = _mm_malloc(manipsize(sz),32);
   UTIL_ASSERT(res,"_mm_malloc() failed");
   return res;
   }
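For reference, the padding scheme used by util_malloc_ can be sketched stand-alone; the constants below are illustrative assumptions rather than the library's actual values, and the alignment bump from 16 to 32 bytes matches the 256-bit AVX register width:

#include <stddef.h>

/* pad sizes that are at, or just below, a multiple of the critical
   stride, so consecutive allocations are less likely to start at
   addresses differing by exactly that stride (cache set contention) */
static size_t manipsize_sketch(size_t sz)
  {
  const size_t critical_stride=4096, cacheline=64;
  size_t rem = sz % critical_stride;
  if (rem==0 || rem > critical_stride-2*cacheline)
    sz += 2*cacheline;
  return sz;
  }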
diff --git a/c_utils/c_utils.h b/c_utils/c_utils.h
index 0503449..01c64ad 100644
--- a/c_utils/c_utils.h
+++ b/c_utils/c_utils.h
@@ -25,7 +25,7 @@
 /*! \file c_utils.h
  *  Convenience functions
  *
- *  Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society
+ *  Copyright (C) 2008-2017 Max-Planck-Society
  *  \author Martin Reinecke
  *  \note This file should only be included from .c files, NOT from .h files.
  */
@@ -144,4 +144,10 @@ void util_free_ (void *ptr);
 }
 #endif
 
+#ifdef __GNUC__
+#define NOINLINE __attribute__((noinline))
+#else
+#define NOINLINE
+#endif
+
 #endif
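The new NOINLINE macro is the usual GCC inlining barrier; the patch applies it to the large transform routines so call sites stay compact and profiles attribute time to the right function. A minimal usage sketch (hypothetical function names):

#ifdef __GNUC__
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif

/* the rare path stays a real call, keeping the hot caller's code small */
NOINLINE static void cold_path(double *x)
  { *x=0.; }

static double hot_path(double x)
  {
  if (x!=x) cold_path(&x);   /* NaN: rarely taken */
  return x*x;
  }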
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
index b1b9277..884a644 100644
--- a/libsharp/sharp.c
+++ b/libsharp/sharp.c
@@ -25,11 +25,12 @@
 /*! \file sharp.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2013 Max-Planck-Society
+ *  Copyright (C) 2006-2016 Max-Planck-Society
  *  \author Martin Reinecke \author Dag Sverre Seljebotn
  */
 
 #include <math.h>
+#include <string.h>
 #include "pocketfft/pocketfft.h"
 #include "sharp_ylmgen_c.h"
 #include "sharp_internal.h"
@@ -63,7 +64,7 @@ static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
   *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
   }
 
-int sharp_get_mlim (int lmax, int spin, double sth, double cth)
+NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
   {
   double ofs=lmax*0.01;
   if (ofs<100.) ofs=100.;
@@ -83,12 +84,13 @@ typedef struct
   dcmplx *shiftarr;
   int s_shift;
   rfft_plan plan;
+  int length;
   int norot;
   } ringhelper;
 
 static void ringhelper_init (ringhelper *self)
   {
-  static ringhelper rh_null = { 0, NULL, 0, NULL, 0 };
+  static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
   *self = rh_null;
   }
 
@@ -99,7 +101,7 @@ static void ringhelper_destroy (ringhelper *self)
   ringhelper_init(self);
   }
 
-static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
+NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
   {
   self->norot = (fabs(phi0)<1e-14);
   if (!(self->norot))
@@ -110,12 +112,15 @@ static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
       self->phi0_ = phi0;
       for (int m=0; m<=mmax; ++m)
         self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
+//      double *tmp=(double *) self->shiftarr;
+//      sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
       }
   if (!self->plan) self->plan=make_rfft_plan(nph);
-  if (nph!=(int)rfft_length(self->plan))
+  if (nph!=(int)self->length)
     {
     destroy_rfft_plan(self->plan);
     self->plan=make_rfft_plan(nph);
+    self->length=nph;
     }
   }
 
@@ -127,6 +132,7 @@ static int ringinfo_compare (const void *xa, const void *xb)
 static int ringpair_compare (const void *xa, const void *xb)
   {
   const sharp_ringpair *a=xa, *b=xb;
+//  return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
   if (a->r1.nph==b->r1.nph)
     return (a->r1.phi0 < b->r1.phi0) ? -1 :
       ((a->r1.phi0 > b->r1.phi0) ? 1 :
@@ -261,6 +267,7 @@ void sharp_destroy_geom_info (sharp_geom_info *geom_info)
    distribution are permissible. */
 static int sharp_get_mmax (int *mval, int nm)
   {
+  //FIXME: if gaps are allowed, we have to search the maximum m in the array
   int *mcheck=RALLOC(int,nm);
   SET_ARRAY(mcheck,0,nm,0);
   for (int i=0; i<nm; ++i)
@@ -274,7 +281,7 @@ static int sharp_get_mmax (int *mval, int nm)
   return nm-1;
   }
 
-static void ringhelper_phase2ring (ringhelper *self,
+NOINLINE static void ringhelper_phase2ring (ringhelper *self,
   const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
   int pstride, int flags)
   {
@@ -288,13 +295,19 @@ static void ringhelper_phase2ring (ringhelper *self,
 
   if (nph>=2*mmax+1)
     {
-    for (int m=0; m<=mmax; ++m)
-      {
-      dcmplx tmp = phase[m*pstride]*wgt;
-      if(!self->norot) tmp*=self->shiftarr[m];
-      data[2*m]=creal(tmp);
-      data[2*m+1]=cimag(tmp);
-      }
+    if (self->norot)
+      for (int m=0; m<=mmax; ++m)
+        {
+        data[2*m]=creal(phase[m*pstride])*wgt;
+        data[2*m+1]=cimag(phase[m*pstride])*wgt;
+        }
+    else
+      for (int m=0; m<=mmax; ++m)
+        {
+        dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
+        data[2*m]=creal(tmp)*wgt;
+        data[2*m+1]=cimag(tmp)*wgt;
+        }
     for (int m=2*(mmax+1); m<nph+2; ++m)
       data[m]=0.;
     }
@@ -326,7 +339,7 @@ static void ringhelper_phase2ring (ringhelper *self,
   rfft_backward (self->plan, &(data[1]), 1.);
   }
 
-static void ringhelper_ring2phase (ringhelper *self,
+NOINLINE static void ringhelper_ring2phase (ringhelper *self,
   const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
   int pstride, int flags)
   {
@@ -376,7 +389,7 @@ static void ringhelper_ring2phase (ringhelper *self,
     phase[m*pstride]=0.;
   }
 
-static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
+NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
   int flags)
   {
   if (flags & SHARP_NO_FFT)
@@ -386,50 +399,55 @@ static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
       if (flags&SHARP_DP)
         {
         for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =value;
+          ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
         for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =value;
+          ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
       else
         {
         for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =(float)value;
+          ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
         for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =(float)value;
+          ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
       }
     }
   else
     {
-    for (int j=0;j<ginfo->npairs;++j)
+    if (flags&SHARP_DP)
       {
-      if (flags&SHARP_DP)
+      for (int j=0;j<ginfo->npairs;++j)
         {
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =value;
-        for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =value;
+        double *dmap=(double *)map;
+        if (ginfo->pair[j].r1.stride==1)
+          memset(&dmap[ginfo->pair[j].r1.ofs],0,
+            ginfo->pair[j].r1.nph*sizeof(double));
+        else
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
+            dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
+        if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
+          memset(&dmap[ginfo->pair[j].r2.ofs],0,
+            ginfo->pair[j].r2.nph*sizeof(double));
+        else
+          for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
+            dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
-      else
+      }
+    else
+      {
+      for (int j=0;j<ginfo->npairs;++j)
         {
         for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
-          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]
-            =(float)value;
+          ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
         for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
-          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]
-            =(float)value;
+          ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
         }
       }
     }
   }
 
-static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
+NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
+  int flags)
   {
 #define CLEARLOOP(real_t,body)             \
       {                                    \
@@ -465,7 +483,7 @@ static void clear_alm (const sharp_alm_info *ainfo, void *alm, int flags)
     }
   }
 
-static void init_output (sharp_job *job)
+NOINLINE static void init_output (sharp_job *job)
   {
   if (job->flags&SHARP_ADD) return;
   if (job->type == SHARP_MAP2ALM)
@@ -473,21 +491,21 @@ static void init_output (sharp_job *job)
       clear_alm (job->ainfo,job->alm[i],job->flags);
   else
     for (int i=0; i<job->ntrans*job->nmaps; ++i)
-      fill_map (job->ginfo,job->map[i],0.,job->flags);
+      clear_map (job->ginfo,job->map[i],job->flags);
   }
 
-static void alloc_phase (sharp_job *job, int nm, int ntheta)
+NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
   {
   if (job->type==SHARP_MAP2ALM)
     {
-    if ((nm&1023)==0) nm+=3; // hack to avoid critical strides
     job->s_m=2*job->ntrans*job->nmaps;
+    if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
     job->s_th=job->s_m*nm;
     }
   else
     {
-    if ((ntheta&1023)==0) ntheta+=3; // hack to avoid critical strides
     job->s_th=2*job->ntrans*job->nmaps;
+    if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
     job->s_m=job->s_th*ntheta;
     }
   job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
@@ -502,22 +520,28 @@ static void alloc_almtmp (sharp_job *job, int lmax)
 static void dealloc_almtmp (sharp_job *job)
   { DEALLOC(job->almtmp); }
 
-static void alm2almtmp (sharp_job *job, int lmax, int mi)
+NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
   {
 
-#define COPY_LOOP(real_t, source_t, expr_of_x)                      \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)            \
+#define COPY_LOOP(real_t, source_t, expr_of_x)              \
+  {                                                         \
+  for (int l=m; l<lmin; ++l)                                \
+    for (int i=0; i<job->ntrans*job->nalm; ++i)             \
+      job->almtmp[job->ntrans*job->nalm*l+i] = 0;           \
+  for (int l=lmin; l<=lmax; ++l)                            \
     for (int i=0; i<job->ntrans*job->nalm; ++i)             \
       {                                                     \
-        source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
-        job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
-      }
+      source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
+      job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x;   \
+      }                                                     \
+  }
 
   if (job->type!=SHARP_MAP2ALM)
     {
     ptrdiff_t ofs=job->ainfo->mvstart[mi];
     int stride=job->ainfo->stride;
     int m=job->ainfo->mval[mi];
+    int lmin=(m<job->spin) ? job->spin : m;
     /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
        coefficients by sqrt_one_half; here we must compensate to avoid scaling
        m=0 */
@@ -562,17 +586,17 @@ static void alm2almtmp (sharp_job *job, int lmax, int mi)
       }
     }
   else
-    SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi],
-              job->ntrans*job->nalm*(lmax+1),0.);
+    memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
+      job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
 
 #undef COPY_LOOP
   }
 
-static void almtmp2alm (sharp_job *job, int lmax, int mi)
+NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
   {
 
 #define COPY_LOOP(real_t, target_t, expr_of_x)               \
-  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)             \
+  for (int l=lmin; l<=lmax; ++l)                             \
     for (int i=0; i<job->ntrans*job->nalm; ++i)              \
       {                                                      \
         dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i];   \
@@ -583,6 +607,7 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
   ptrdiff_t ofs=job->ainfo->mvstart[mi];
   int stride=job->ainfo->stride;
   int m=job->ainfo->mval[mi];
+  int lmin=(m<job->spin) ? job->spin : m;
   /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
      coefficients by sqrt_two; here we must compensate to avoid scaling
      m=0 */
@@ -629,27 +654,56 @@ static void almtmp2alm (sharp_job *job, int lmax, int mi)
 #undef COPY_LOOP
   }
 
-static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
-  int rstride)
+NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
+  const double *ringtmp, int rstride)
   {
-  double **dmap = (double **)job->map;
-  float  **fmap = (float  **)job->map;
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
-    for (int m=0; m<ri->nph; ++m)
-      if (job->flags & SHARP_DP)
-        dmap[i][ri->ofs+m*ri->stride] += ringtmp[i*rstride+m+1];
+  if (job->flags & SHARP_DP)
+    {
+    double **dmap = (double **)job->map;
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      {
+      double *restrict p1=&dmap[i][ri->ofs];
+      const double *restrict p2=&ringtmp[i*rstride+1];
+      if (ri->stride==1)
+        {
+        if (job->flags&SHARP_ADD)
+          for (int m=0; m<ri->nph; ++m)
+            p1[m] += p2[m];
+        else
+          memcpy(p1,p2,ri->nph*sizeof(double));
+        }
       else
+        for (int m=0; m<ri->nph; ++m)
+          p1[m*ri->stride] += p2[m];
+      }
+    }
+  else
+    {
+    float  **fmap = (float  **)job->map;
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int m=0; m<ri->nph; ++m)
         fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
+    }
   }
 
-static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri, double *ringtmp,
-  int rstride)
+NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
+  double *ringtmp, int rstride)
   {
-  for (int i=0; i<job->ntrans*job->nmaps; ++i)
-    for (int m=0; m<ri->nph; ++m)
-      ringtmp[i*rstride+m+1] = (job->flags & SHARP_DP) ?
-        ((double *)(job->map[i]))[ri->ofs+m*ri->stride] :
-        ((float  *)(job->map[i]))[ri->ofs+m*ri->stride];
+  if (job->flags & SHARP_DP)
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      {
+      double *restrict p1=&ringtmp[i*rstride+1],
+             *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
+      if (ri->stride==1)
+        memcpy(p1,p2,ri->nph*sizeof(double));
+      else
+        for (int m=0; m<ri->nph; ++m)
+          p1[m] = p2[m*ri->stride];
+      }
+  else
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      for (int m=0; m<ri->nph; ++m)
+        ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
   }
 
 static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
@@ -693,7 +747,7 @@ static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
   }
 
 //FIXME: set phase to zero if not SHARP_MAP2ALM?
-static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
   {
   if (job->type != SHARP_MAP2ALM) return;
   int pstride = job->s_m;
@@ -738,7 +792,7 @@ static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
     }
   }
 
-static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
+NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
   {
   if (job->type == SHARP_MAP2ALM) return;
   int pstride = job->s_m;
@@ -783,7 +837,7 @@ static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
     }
   }
 
-static void sharp_execute_job (sharp_job *job)
+NOINLINE static void sharp_execute_job (sharp_job *job)
   {
   double timer=wallTime();
   job->opcnt=0;
@@ -800,6 +854,7 @@ static void sharp_execute_job (sharp_job *job)
   int nchunks, chunksize;
   get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
     &chunksize);
+//FIXME: needs to be changed to "nm"
   alloc_phase (job,mmax+1,chunksize);
 
 /* chunk loop */
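The stride-1 special cases added to ringtmp2ring and ring2ringtmp share one pattern: test once per ring whether the samples are contiguous, then use memcpy (or a restrict-qualified loop) instead of a strided gather. A reduced stand-alone sketch of that pattern (hypothetical name):

#include <string.h>

/* copy nph ring samples; contiguous rings take the memcpy fast path,
   strided rings fall back to an explicit gather loop */
static void ring_copy_sketch(double *restrict dst,
  const double *restrict src, int nph, int stride)
  {
  if (stride==1)
    memcpy(dst,src,(size_t)nph*sizeof(double));
  else
    for (int m=0; m<nph; ++m)
      dst[m]=src[m*stride];
  }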
diff --git a/libsharp/sharp_almhelpers.c b/libsharp/sharp_almhelpers.c
index 6a98309..12ce600 100644
--- a/libsharp/sharp_almhelpers.c
+++ b/libsharp/sharp_almhelpers.c
@@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2008-2013 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
index 3bff317..67016d7 100644
--- a/libsharp/sharp_almhelpers.h
+++ b/libsharp/sharp_almhelpers.h
@@ -25,7 +25,7 @@
 /*! \file sharp_almhelpers.h
  *  SHARP helper function for the creation of a_lm data structures
  *
- *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  Copyright (C) 2008-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
diff --git a/libsharp/sharp_complex_hacks.h b/libsharp/sharp_complex_hacks.h
index 9227ca9..86d1153 100644
--- a/libsharp/sharp_complex_hacks.h
+++ b/libsharp/sharp_complex_hacks.h
@@ -25,7 +25,7 @@
 /*  \file sharp_complex_hacks.h
  *  support for converting vector types and complex numbers
  *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -51,6 +51,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
   complex double * restrict c1, complex double * restrict c2)
   { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
+
 #endif
 
 #if (VLEN==2)
@@ -94,6 +98,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
 #endif
   }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
+
 #endif
 
 #if (VLEN==4)
@@ -130,6 +138,23 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
 #endif
   }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  {
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
+  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
+     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
+  tmp1=vadd(tmp3,tmp4);
+#ifdef UNSAFE_CODE
+  _mm256_storeu_pd((double *)cc,
+    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
+#else
+  union {Tv v; complex double c[2]; } u;
+  u.v=tmp1;
+  cc[0]+=u.c[0]; cc[1]+=u.c[1];
+#endif
+  }
+
 #endif
 
 #if (VLEN==8)
@@ -144,6 +169,10 @@ static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
   *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
   }
 
+static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict cc)
+  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }
+
 #endif
 
 #endif
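Across all vector widths, vhsum_cmplx_special has the same contract: horizontally sum four vectors and accumulate the results into two adjacent complex numbers. A scalar reference model (the fixed lane count is an assumption, useful mainly for testing the SIMD variants):

#include <complex.h>

#define VLEN_REF 4   /* stand-in for the configured vector length */

/* cc[0] += (sum(a), sum(b));  cc[1] += (sum(c), sum(d)) */
static void vhsum_cmplx_special_ref(const double *a, const double *b,
  const double *c, const double *d, complex double *restrict cc)
  {
  double sa=0., sb=0., sc=0., sd=0.;
  for (int i=0; i<VLEN_REF; ++i)
    { sa+=a[i]; sb+=b[i]; sc+=c[i]; sd+=d[i]; }
  cc[0] += sa + _Complex_I*sb;
  cc[1] += sc + _Complex_I*sd;
  }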
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
index 747658c..8a36bfe 100644
--- a/libsharp/sharp_core_inc.c
+++ b/libsharp/sharp_core_inc.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core_inc.c
  *  Type-dependent code for the computational core
  *
- *  Copyright (C) 2012 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -73,8 +73,8 @@ static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
 static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
   double maxval)
   {
-  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
   const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
+  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
   for (int i=0;i<nvec; ++i)
     {
     Tm mask = vgt(vabs(val->v[i]),vfmax);
@@ -94,35 +94,58 @@ static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
     }
   }
 
-static void Y(mypow) (Tb val, int npow, Tb * restrict resd,
-  Tb * restrict ress)
+NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
+  Tb * restrict resd, Tb * restrict ress)
   {
-  Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
-
-  Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
-
-  do
+  Tv vminv=vload(powlimit[npow]);
+  Tm mask = vlt(vabs(val.v[0]),vminv);
+  for (int i=1;i<nvec; ++i)
+    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
+  if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
+    {
+    Tb res=Y(Tbconst)(1.);
+    do
+      {
+      if (npow&1)
+        for (int i=0; i<nvec; ++i)
+          {
+          vmuleq(res.v[i],val.v[i]);
+          vmuleq(val.v[i],val.v[i]);
+          }
+      else
+        for (int i=0; i<nvec; ++i)
+          vmuleq(val.v[i],val.v[i]);
+      }
+    while(npow>>=1);
+    *resd=res;
+    *ress=Y(Tbconst)(0.);
+    }
+  else
     {
-    if (npow&1)
+    Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
+    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+    do
       {
+      if (npow&1)
+        {
+        for (int i=0; i<nvec; ++i)
+          {
+          vmuleq(res.v[i],val.v[i]);
+          vaddeq(scale.v[i],scaleint.v[i]);
+          }
+        Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
+        }
       for (int i=0; i<nvec; ++i)
         {
-        vmuleq(res.v[i],val.v[i]);
-        vaddeq(scale.v[i],scaleint.v[i]);
+        vmuleq(val.v[i],val.v[i]);
+        vaddeq(scaleint.v[i],scaleint.v[i]);
         }
-      Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
-      }
-    for (int i=0; i<nvec; ++i)
-      {
-      vmuleq(val.v[i],val.v[i]);
-      vaddeq(scaleint.v[i],scaleint.v[i]);
+      Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
       }
-    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
+    while(npow>>=1);
+    *resd=res;
+    *ress=scale;
     }
-  while(npow>>=1);
-
-  *resd=res;
-  *ress=scale;
   }
 
 static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
@@ -179,13 +202,13 @@ static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
   *corfac=corf.b;
   }
 
-static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
+NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
   Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
   const sharp_Ylmgen_C * restrict gen)
   {
   int l=gen->m;
   Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
-  Y(mypow) (sth,l,&lam_2,&scale);
+  Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
   Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]);
   Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);
 
@@ -193,12 +216,12 @@ static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
   while (below_limit)
     {
     if (l+2>gen->lmax) {*l_=gen->lmax+1;return;}
-    Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
-    r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
+      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(gen->rf[l].f[1])*lam_1.v[i];
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
     if (Y(rescale)(&lam_1,&lam_2,&scale))
       below_limit = Y(TballLt)(scale,sharp_limscale);
     l+=2;
@@ -213,10 +236,8 @@ static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
   Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
   for (int i=0; i<nvec; ++i)
     {
-    rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])),
-                vmul(fx2,rxp->v[i]));
-    rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])),
-                vmul(fx2,rxm->v[i]));
+    rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
+    rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
     }
   }
 
@@ -240,8 +261,10 @@ static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
     }
 
   Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
-  Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
-  Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
+  Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
+  Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
+  Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
+  Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);
 
   Tb rec2p, rec2m, scalep, scalem;
   Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
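When the powlimit test shows that no intermediate can underflow, the rewritten mypow skips all rescaling bookkeeping; in scalar form that quick branch is plain square-and-multiply binary exponentiation (sketch):

/* computes val^npow in O(log npow) multiplies; only valid when no
   intermediate underflows, which the powlimit test guarantees */
static double mypow_quick_sketch(double val, int npow)
  {
  double res=1.;
  do
    {
    if (npow&1) res*=val;
    val*=val;
    }
  while (npow>>=1);
  return res;
  }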
diff --git a/libsharp/sharp_core_inc0.c b/libsharp/sharp_core_inc0.c
index 8590d2c..7a34e40 100644
--- a/libsharp/sharp_core_inc0.c
+++ b/libsharp/sharp_core_inc0.c
@@ -25,7 +25,7 @@
 /*! \file sharp_core_inc0.c
  *  Computational core
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -34,7 +34,6 @@
 #include <string.h>
 #include "sharp_vecsupport.h"
 #include "sharp_complex_hacks.h"
-#include "sharp_ylmgen_c.h"
 #include "sharp.h"
 #include "sharp_core.h"
 #include "c_utils.h"
diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
index 5c9b4ab..9a2e26b 100644
--- a/libsharp/sharp_core_inc2.c
+++ b/libsharp/sharp_core_inc2.c
@@ -25,11 +25,11 @@
 /*! \file sharp_core_inc2.c
  *  Type-dependent code for the computational core
  *
- *  Copyright (C) 2012-2013 Max-Planck-Society
+ *  Copyright (C) 2012-2017 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
-static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
+NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
   Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
   const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
   int l, int lmax NJ1)
@@ -77,29 +77,32 @@ if (njobs>1)
   }
   while (l<lmax)
     {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l].f[1])*lam_1.v[i];
     for (int j=0; j<njobs; ++j)
       {
       Tv ar=vload(creal(alm[njobs*l+j])),
          ai=vload(cimag(alm[njobs*l+j]));
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        p1[j].r.v[i] += lam_2.v[i]*ar;
+        p1[j].i.v[i] += lam_2.v[i]*ai;
         }
-      ar=vload(creal(alm[njobs*(l+1)+j]));
-      ai=vload(cimag(alm[njobs*(l+1)+j]));
+      }
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l+1].f[1])*lam_2.v[i];
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*(l+1)+j])),
+         ai=vload(cimag(alm[njobs*(l+1)+j]));
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
-        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
+        p2[j].r.v[i] += lam_1.v[i]*ar;
+        p2[j].i.v[i] += lam_1.v[i]*ai;
         }
       }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
     l+=2;
     }
   if (l==lmax)
@@ -109,64 +112,57 @@ if (njobs>1)
       Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
-        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        p1[j].r.v[i] += lam_2.v[i]*ar;
+        p1[j].i.v[i] += lam_2.v[i]*ai;
         }
       }
     }
   }
 
-static void Z(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
-  const sharp_ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax
+NOINLINE static void Z(map2alm_kernel) (const Tb cth,
+  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
   NJ1)
   {
   while (l<lmax)
     {
-    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l].f[1])*lam_1.v[i];
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
+        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
         }
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l+1].f[1])*lam_2.v[i];
+    for (int j=0; j<njobs; ++j)
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
-        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
+        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
+        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
         }
-      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]);
-      }
-    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
-    for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
     l+=2;
     }
   if (l==lmax)
     {
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
-        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
+        atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
+        atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
         }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
     }
   }
 
-static void Z(calc_alm2map) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
   Y(Tbri) * restrict p2 NJ1)
   {
   int l,lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
   Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
   if (l>lmax) return;
@@ -219,12 +215,12 @@ static void Z(calc_alm2map) (const Tb cth, const Tb sth,
   Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
   }
 
-static void Z(calc_map2alm) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
-  const Y(Tbri) * restrict p2 NJ1)
+  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
   {
   int lmax=gen->lmax;
-  Tb lam_1,lam_2,scale;
+  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
   int l=gen->m;
   Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
   job->opcnt += (l-gen->m) * 4*VLEN*nvec;
@@ -234,40 +230,31 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
   const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
   Tb corfac;
   Y(getCorfac)(scale,&corfac,gen->cf);
-  dcmplx * restrict alm=job->almtmp;
   int full_ieee = Y(TballGe)(scale,sharp_minscale);
   while (!full_ieee)
     {
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p1[j].r.v[i]);
-        vfmaeq(tim,tmp,p1[j].i.v[i]);
+        Tv tmp=lam_2.v[i]*corfac.v[i];
+        atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
+        atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
         }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
     if (++l>lmax) return;
-    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
+                 - vload(rf[l-1].f[1])*lam_1.v[i];
     for (int j=0; j<njobs; ++j)
-      {
-      Tv tre=vzero, tim=vzero;
       for (int i=0; i<nvec; ++i)
         {
-        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
-        vfmaeq(tre,tmp,p2[j].r.v[i]);
-        vfmaeq(tim,tmp,p2[j].i.v[i]);
+        Tv tmp=lam_1.v[i]*corfac.v[i];
+        atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
+        atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
         }
-      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
-      }
     if (++l>lmax) return;
-    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
     for (int i=0; i<nvec; ++i)
-      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
+                 - vload(rf[l-1].f[1])*lam_2.v[i];
     if (Y(rescale)(&lam_1,&lam_2,&scale))
       {
       Y(getCorfac)(scale,&corfac,gen->cf);
@@ -276,7 +263,7 @@ static void Z(calc_map2alm) (const Tb cth, const Tb sth,
     }
 
   Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
-  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
   }
 
 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
@@ -317,8 +304,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
        acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
     for (int i=0; i<nvec; ++i)
       {
-      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
-      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
+      Tv lw1=r2p.v[i]+r2m.v[i];
+      Tv lx2=r1m.v[i]-r1p.v[i];
       vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
       vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
       vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
@@ -326,8 +313,8 @@ static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
       }
     for (int i=0; i<nvec; ++i)
       {
-      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
-      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
+      Tv lx1=r2m.v[i]-r2p.v[i];
+      Tv lw2=r1p.v[i]+r1m.v[i];
       vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
       vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
       vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
@@ -359,11 +346,11 @@ static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
       vfmaeq(acr,py[j].qi.v[i],lx);
       vfmseq(aci,py[j].qr.v[i],lx);
       }
-    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]);
+    vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
     }
   }
 
-static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
   int lmax NJ1)
@@ -374,10 +361,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
        fx2=vload(fx[l+1].f[2]);
     for (int i=0; i<nvec; ++i)
       {
-      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
-                        vmul(fx2,rec1p.v[i]));
-      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
-                        vmul(fx2,rec1m.v[i]));
+      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
+      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
       }
     Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
       &alm[2*njobs*(l+1)] NJ2);
@@ -385,10 +370,8 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     fx2=vload(fx[l+2].f[2]);
     for (int i=0; i<nvec; ++i)
       {
-      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
-                        vmul(fx2,rec2p.v[i]));
-      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
-                        vmul(fx2,rec2m.v[i]));
+      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
+      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
       }
     l+=2;
     }
@@ -396,7 +379,7 @@ static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
   }
 
-static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
+NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
   const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
   NJ1)
@@ -429,7 +412,7 @@ static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
     Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
   }
 
-static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2 NJ1)
   {
@@ -475,7 +458,7 @@ static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
     lmax NJ2);
   }
 
-static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
+NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
   const sharp_Ylmgen_C * restrict gen, sharp_job *job,
   const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
   {
@@ -539,7 +522,7 @@ static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
     }
   }
 
-static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
   const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
   int lmax NJ1)
@@ -572,7 +555,7 @@ static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
     Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
   }
 
-static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
+NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
   const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
   Y(Tbqu) * restrict p2 NJ1)
   {
@@ -621,7 +604,7 @@ static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
 
 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
 
-static void Z(inner_loop) (sharp_job *job, const int *ispair,
+NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
   const double *cth_, const double *sth_, int llim, int ulim,
   sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
   {
@@ -722,10 +705,30 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
         }
       break;
       }
+    default:
+      {
+      UTIL_FAIL("must not happen");
+      break;
+      }
+    }
+  }
+
+NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  {
+  const int nval=nvec*VLEN;
+  const int m = job->ainfo->mval[mi];
+  sharp_Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
     case SHARP_MAP2ALM:
       {
       if (job->spin==0)
         {
+        Tv atmp[2*njobs*(gen->lmax+1)];
+        memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
         for (int ith=0; ith<ulim-llim; ith+=nval)
           {
           Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
@@ -751,8 +754,15 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
               }
             }
           if (!skip)
-            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
+            Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
           }
+        {
+        int istart=m*njobs, istop=(gen->lmax+1)*njobs;
+        for(; istart<istop-2; istart+=2)
+          vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],
+            atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
+        for(; istart<istop; istart++)
+          job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
+        }
         }
       else
         {
@@ -800,4 +810,13 @@ static void Z(inner_loop) (sharp_job *job, const int *ispair,
     }
   }
 
+static void Z(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim,
+  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
+  {
+  (job->type==SHARP_MAP2ALM) ?
+    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
+    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
+  }
+
 #undef VZERO
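The split into inner_loop_a2m/inner_loop_m2a lets the map2alm path accumulate vector-valued partial sums per (l, job) in atmp and defer every horizontal reduction to a single pass after the theta loop, instead of paying one vhsum per ring pair. The shape of that final pass, with scalar arrays standing in for Tv (names hypothetical):

#include <complex.h>

#define NV_REF 4   /* stand-in for the SIMD lane count */

/* collapse per-lane partial sums into the complex a_lm accumulators:
   one horizontal sum per coefficient instead of one per ring pair */
static void flush_atmp_sketch(const double re[][NV_REF],
  const double im[][NV_REF], complex double *restrict almtmp, int n)
  {
  for (int l=0; l<n; ++l)
    {
    double sr=0., si=0.;
    for (int i=0; i<NV_REF; ++i)
      { sr+=re[l][i]; si+=im[l][i]; }
    almtmp[l] += sr + _Complex_I*si;
    }
  }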
diff --git a/libsharp/sharp_cxx.h b/libsharp/sharp_cxx.h
index f8b2365..2c37505 100644
--- a/libsharp/sharp_cxx.h
+++ b/libsharp/sharp_cxx.h
@@ -25,13 +25,14 @@
 /*! \file sharp_cxx.h
  *  Spherical transform library
  *
- *  Copyright (C) 2012-2015 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
 #ifndef PLANCK_SHARP_CXX_H
 #define PLANCK_SHARP_CXX_H
 
+#include <complex>
 #include "sharp_lowlevel.h"
 #include "sharp_geomhelpers.h"
 #include "sharp_almhelpers.h"
@@ -107,19 +108,30 @@ template<typename T> class sharp_cxxjob: public sharp_base
   private:
     static void *conv (T *ptr)
       { return reinterpret_cast<void *>(ptr); }
+    static void *conv (std::complex<T> *ptr)
+      { return reinterpret_cast<void *>(ptr); }
     static void *conv (const T *ptr)
       { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
+    static void *conv (const std::complex<T> *ptr)
+      { return const_cast<void *>(reinterpret_cast<const void *>(ptr)); }
 
   public:
-    void alm2map (const T *alm, T *map, bool add)
+    void alm2map (const T *alm, T *map, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
         flags,0,0);
       }
-    void alm2map_spin (const T *alm1, const T *alm2, T *map1, T *map2,
-      int spin, bool add)
+    void alm2map (const std::complex<T> *alm, T *map, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP, 0, &aptr, &mptr, ginfo, ainfo, 1,
+        flags,0,0);
+      }
+    void alm2map_spin (const T *alm1, const T *alm2,
+      T *map1, T *map2, int spin, bool add) const
       {
       void *aptr[2], *mptr[2];
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
@@ -127,21 +139,65 @@ template<typename T> class sharp_cxxjob: public sharp_base
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
       }
-    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add)
+    void alm2map_spin (const std::complex<T> *alm1, const std::complex<T> *alm2,
+      T *map1, T *map2, int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void alm2map_der1 (const T *alm, T *map1, T *map2, bool add) const
+      {
+      void *aptr=conv(alm), *mptr[2];
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void alm2map_der1 (const std::complex<T> *alm, T *map1, T *map2, bool add)
+      const
       {
       void *aptr=conv(alm), *mptr[2];
       mptr[0]=conv(map1); mptr[1]=conv(map2);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_ALM2MAP_DERIV1,1,&aptr,mptr,ginfo,ainfo,1,flags,0,0);
       }
-    void map2alm (const T *map, T *alm, bool add)
+    void alm2map_adjoint (const T *map, T *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void alm2map_adjoint (const T *map, std::complex<T> *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_Yt,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void map2alm (const T *map, T *alm, bool add) const
+      {
+      void *aptr=conv(alm), *mptr=conv(map);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void map2alm (const T *map, std::complex<T> *alm, bool add) const
       {
       void *aptr=conv(alm), *mptr=conv(map);
       int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
       sharp_execute (SHARP_MAP2ALM,0,&aptr,&mptr,ginfo,ainfo,1,flags,0,0);
       }
     void map2alm_spin (const T *map1, const T *map2, T *alm1, T *alm2,
-      int spin, bool add)
+      int spin, bool add) const
+      {
+      void *aptr[2], *mptr[2];
+      aptr[0]=conv(alm1); aptr[1]=conv(alm2);
+      mptr[0]=conv(map1); mptr[1]=conv(map2);
+      int flags=cxxjobhelper__<T>::val | (add ? SHARP_ADD : 0);
+      sharp_execute (SHARP_MAP2ALM,spin,aptr,mptr,ginfo,ainfo,1,flags,0,0);
+      }
+    void map2alm_spin (const T *map1, const T *map2, std::complex<T> *alm1,
+      std::complex<T> *alm2, int spin, bool add) const
       {
       void *aptr[2], *mptr[2];
       aptr[0]=conv(alm1); aptr[1]=conv(alm2);
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
index 0aed60d..8efb8a0 100644
--- a/libsharp/sharp_geomhelpers.c
+++ b/libsharp/sharp_geomhelpers.c
@@ -25,9 +25,8 @@
 /*! \file sharp_geomhelpers.c
  *  Spherical transform library
  *
- *  Copyright (C) 2006-2012 Max-Planck-Society<br>
- *  Copyright (C) 2007-2008 Pavel Holoborodko (for gauss_legendre_tbl)
- *  \author Martin Reinecke \author Pavel Holoborodko
+ *  Copyright (C) 2006-2018 Max-Planck-Society<br>
+ *  \author Martin Reinecke
  */
 
 #include <math.h>
@@ -35,7 +34,6 @@
 #include "sharp_legendre_roots.h"
 #include "c_utils.h"
 #include "pocketfft/pocketfft.h"
-#include <stdio.h>
 
 void sharp_make_subset_healpix_geom_info (int nside, int stride, int nrings,
   const int *rings, const double *weight, sharp_geom_info **geom_info)
diff --git a/libsharp/sharp_testsuite.c b/libsharp/sharp_testsuite.c
index 2b1c7af..f02f9fd 100644
--- a/libsharp/sharp_testsuite.c
+++ b/libsharp/sharp_testsuite.c
@@ -23,7 +23,7 @@
  */
 
 /*  \file sharp_testsuite.c
- * 
+ *
  *  Copyright (C) 2012-2013 Max-Planck-Society
  *  \author Martin Reinecke
  */
@@ -50,9 +50,9 @@ typedef complex double dcmplx;
 
 int ntasks, mytask;
 
-static double drand (double min, double max, int *state)
+static double drand (double min, double max, unsigned *state)
   {
-  *state = (((*state) * 1103515245) + 12345) & 0x7fffffff;
+  *state = (((*state) * 1103515245u) + 12345u) & 0x7fffffffu;
   return min + (max-min)*(*state)/(0x7fffffff+1.0);
   }
 
@@ -65,7 +65,7 @@ static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin, int cnt)
   for (mi=0;mi<helper->nm; ++mi)
     {
     int m=helper->mval[mi];
-    int state=1234567*cnt+8912*m; // random seed
+    unsigned state=1234567u*(unsigned)cnt+8912u*(unsigned)m; // random seed
     for (int l=m;l<=helper->lmax; ++l)
       {
       if ((l<spin)&&(m<spin))
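Changing the LCG state from int to unsigned is a correctness fix: the multiply overflows, and signed overflow is undefined behaviour in C, while unsigned arithmetic wraps mod 2^32 by definition. For reference, the recurrence together with the per-(cnt,m) seeding gives every coefficient an independently reproducible stream (the ±1 range below is illustrative):

#include <complex.h>

/* ANSI-C style LCG; the unsigned state can never overflow in the UB sense */
static double drand_ref(double min, double max, unsigned *state)
  {
  *state = ((*state)*1103515245u + 12345u) & 0x7fffffffu;
  return min + (max-min)*(*state)/(0x7fffffff+1.0);
  }

static complex double random_coeff_sketch(int cnt, int m)
  {
  unsigned state = 1234567u*(unsigned)cnt + 8912u*(unsigned)m;
  double re=drand_ref(-1.,1.,&state), im=drand_ref(-1.,1.,&state);
  return re + _Complex_I*im;
  }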
diff --git a/libsharp/sharp_vecsupport.h b/libsharp/sharp_vecsupport.h
index ee4c5e7..5250948 100644
--- a/libsharp/sharp_vecsupport.h
+++ b/libsharp/sharp_vecsupport.h
@@ -25,7 +25,7 @@
 /*  \file sharp_vecsupport.h
  *  Convenience functions for vector arithmetics
  *
- *  Copyright (C) 2012,2013 Max-Planck-Society
+ *  Copyright (C) 2012-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -72,6 +72,7 @@ typedef int Tm;
 #define vge(a,b) ((a)>=(b))
 #define vne(a,b) ((a)!=(b))
 #define vand_mask(a,b) ((a)&&(b))
+#define vor_mask(a,b) ((a)||(b))
 #define vstoreu(p, a) (*(p)=a)
 #define vstoreu_s(p, a) (*(p)=a)
 
@@ -138,6 +139,7 @@ static inline Tv vblend__(Tv m, Tv a, Tv b)
 #define vge(a,b) _mm_cmpge_pd(a,b)
 #define vne(a,b) _mm_cmpneq_pd(a,b)
 #define vand_mask(a,b) _mm_and_pd(a,b)
+#define vor_mask(a,b) _mm_or_pd(a,b)
 #define vmin(a,b) _mm_min_pd(a,b)
 #define vmax(a,b) _mm_max_pd(a,b)
 #define vanyTrue(a) (_mm_movemask_pd(a)!=0)
@@ -183,6 +185,13 @@ typedef __m256d Tm;
 #define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
 #define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
 #else
+#if (USE_FMA)
+#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
+#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
+#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
+#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
+#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
+#else
 #define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
 #define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
 #define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
@@ -191,6 +200,7 @@ typedef __m256d Tm;
 #define vfmaseq(a,b,c,d,e) \
   a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
 #endif
+#endif
 #define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
 #define vload(a) _mm256_set1_pd(a)
 #define vload_s(a) _mm256_set1_ps(a)
@@ -201,6 +211,7 @@ typedef __m256d Tm;
 #define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
 #define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
 #define vand_mask(a,b) _mm256_and_pd(a,b)
+#define vor_mask(a,b) _mm256_or_pd(a,b)
 #define vmin(a,b) _mm256_min_pd(a,b)
 #define vmax(a,b) _mm256_max_pd(a,b)
 #define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
@@ -242,6 +253,7 @@ typedef __mmask8 Tm;
 #define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
 #define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
 #define vand_mask(a,b) ((a)&(b))
+#define vor_mask(a,b) ((a)|(b))
 #define vmin(a,b) _mm512_min_pd(a,b)
 #define vmax(a,b) _mm512_max_pd(a,b)
 #define vanyTrue(a) (a!=0)
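The new USE_FMA branch maps the multiply-accumulate macros onto fused instructions instead of separate mul/add pairs; results may differ in the last bit because FMA skips the intermediate rounding. Their scalar contracts, written out as reference macros:

/* scalar semantics of the vector multiply-accumulate macros */
#define vfmaeq_ref(a,b,c)       ((a) += (b)*(c))
#define vfmseq_ref(a,b,c)       ((a) -= (b)*(c))
#define vfmaaeq_ref(a,b,c,d,e)  ((a) += (b)*(c) + (d)*(e))
#define vfmaseq_ref(a,b,c,d,e)  ((a) += (b)*(c) - (d)*(e))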
diff --git a/libsharp/sharp_ylmgen_c.c b/libsharp/sharp_ylmgen_c.c
index 6e8cee5..785e063 100644
--- a/libsharp/sharp_ylmgen_c.c
+++ b/libsharp/sharp_ylmgen_c.c
@@ -25,7 +25,7 @@
 /*
  *  Helper code for efficient calculation of Y_lm(theta,phi=0)
  *
- *  Copyright (C) 2005-2014 Max-Planck-Society
+ *  Copyright (C) 2005-2016 Max-Planck-Society
  *  Author: Martin Reinecke
  */
 
@@ -59,6 +59,12 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
     gen->cf[m]=gen->cf[m+1]*sharp_fsmall;
   for (int m=-sharp_minscale+1; m<(sharp_maxscale-sharp_minscale+1); ++m)
     gen->cf[m]=gen->cf[m-1]*sharp_fbig;
+  gen->powlimit=RALLOC(double,m_max+spin+1);
+  gen->powlimit[0]=0.;
+  const double ln2 = 0.6931471805599453094172321214581766;
+  const double expo=-400*ln2;
+  for (int m=1; m<=m_max+spin; ++m)
+    gen->powlimit[m]=exp(expo/m);
 
   gen->m = -1;
   if (spin==0)
@@ -124,6 +130,7 @@ void sharp_Ylmgen_init (sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
 void sharp_Ylmgen_destroy (sharp_Ylmgen_C *gen)
   {
   DEALLOC(gen->cf);
+  DEALLOC(gen->powlimit);
   if (gen->s==0)
     {
     DEALLOC(gen->rf);
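The new powlimit table caches, for each exponent m, the smallest base whose m-th power still stays above 2^-400 (the scaling threshold): powlimit[m] = exp(-400 ln2 / m) = 2^(-400/m). mypow consults it to pick between the quick and the rescaling algorithm; an equivalent scalar check looks like this (sketch):

#include <math.h>

/* nonzero iff |val|^m cannot drop below 2^-400, i.e. the unscaled
   square-and-multiply loop is safe for this base/exponent pair */
static int pow_is_safe_sketch(double val, int m)
  {
  const double ln2=0.6931471805599453;
  double powlimit = (m==0) ? 0. : exp(-400.*ln2/m);   /* 2^(-400/m) */
  return fabs(val) >= powlimit;
  }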
diff --git a/libsharp/sharp_ylmgen_c.h b/libsharp/sharp_ylmgen_c.h
index 3328f76..63b23cd 100644
--- a/libsharp/sharp_ylmgen_c.h
+++ b/libsharp/sharp_ylmgen_c.h
@@ -25,7 +25,7 @@
 /*! \file sharp_ylmgen_c.h
  *  Code for efficient calculation of Y_lm(phi=0,theta)
  *
- *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  Copyright (C) 2005-2016 Max-Planck-Society
  *  \author Martin Reinecke
  */
 
@@ -49,6 +49,7 @@ typedef struct
 /* for public use; immutable during lifetime */
   int lmax, mmax, s;
   double *cf;
+  double *powlimit;
 
 /* for public use; will typically change after call to Ylmgen_prepare() */
   int m;
-- 
GitLab