From 2fd77f2063f76b4aa88adb2582861b8aeb482021 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Wed, 8 May 2019 21:17:29 +0200
Subject: [PATCH] synchronnize

---
 pocketfft_hdronly.h | 371 +++++++++++++++++++++++---------------------
 pypocketfft.cc      |  12 +-
 2 files changed, 199 insertions(+), 184 deletions(-)

diff --git a/pocketfft_hdronly.h b/pocketfft_hdronly.h
index 1b4af74..03817a0 100644
--- a/pocketfft_hdronly.h
+++ b/pocketfft_hdronly.h
@@ -20,6 +20,9 @@
 #include <stdexcept>
 #include <memory>
 #include <vector>
+#if defined(_WIN32)
+#include <malloc.h>
+#endif
 
 #ifdef __GNUC__
 #define NOINLINE __attribute__((noinline))
@@ -38,6 +41,21 @@ using stride_t = vector<ptrdiff_t>;
 
 namespace detail {
 
+#ifndef POCKETFFT_NO_VECTORS
+#if (defined(__AVX512F__))
+#define HAVE_VECSUPPORT
+constexpr int VBYTELEN=64;
+#elif (defined(__AVX__))
+#define HAVE_VECSUPPORT
+constexpr int VBYTELEN=32;
+#elif (defined(__SSE2__))
+#define HAVE_VECSUPPORT
+constexpr int VBYTELEN=16;
+#else
+#define POCKETFFT_NO_VECTORS
+#endif
+#endif
+
 template<typename T> struct arr
   {
   private:
@@ -51,9 +69,12 @@ template<typename T> struct arr
       void *res = malloc(num*sizeof(T));
       if (!res) throw bad_alloc();
 #else
-#if 0
+#if __cplusplus >= 201703L
       void *res = aligned_alloc(64,num*sizeof(T));
       if (!res) throw bad_alloc();
+#elif defined(_WIN32)
+      void *res = _aligned_malloc(num*sizeof(T), 64);
+      if (!res) throw bad_alloc();
 #else
       void *res(nullptr);
       if (posix_memalign(&res, 64, num*sizeof(T))!=0)
@@ -172,7 +193,7 @@ template<typename T> class sincos_2pibyn
       if (n==0) return;
       res[0]=1.; res[1]=0.;
       if (n==1) return;
-      size_t l1=(size_t)sqrt(n);
+      size_t l1 = size_t(sqrt(n));
       arr<double> tmp(2*l1);
       for (size_t i=1; i<l1; ++i)
         {
@@ -320,60 +341,71 @@ template<typename T> class sincos_2pibyn
       { return reinterpret_cast<const cmplx<T> *>(data.data()); }
   };
 
-NOINLINE size_t largest_prime_factor (size_t n)
+struct util // hack to avoid duplicate symbols
   {
-  size_t res=1;
-  while ((n&1)==0)
-    { res=2; n>>=1; }
-
-  size_t limit=size_t(sqrt(n+0.01));
-  for (size_t x=3; x<=limit; x+=2)
-  while ((n/x)*x==n)
+  static NOINLINE size_t largest_prime_factor (size_t n)
     {
-    res=x;
-    n/=x;
-    limit=size_t(sqrt(n+0.01));
-    }
-  if (n>1) res=n;
+    size_t res=1;
+    while ((n&1)==0)
+      { res=2; n>>=1; }
 
-  return res;
-  }
+    size_t limit=size_t(sqrt(n+0.01));
+    for (size_t x=3; x<=limit; x+=2)
+    while ((n/x)*x==n)
+      {
+      res=x;
+      n/=x;
+      limit=size_t(sqrt(n+0.01));
+      }
+    if (n>1) res=n;
 
-NOINLINE double cost_guess (size_t n)
-  {
-  const double lfp=1.1; // penalty for non-hardcoded larger factors
-  size_t ni=n;
-  double result=0.;
-  while ((n&1)==0)
-    { result+=2; n>>=1; }
-
-  size_t limit=size_t(sqrt(n+0.01));
-  for (size_t x=3; x<=limit; x+=2)
-  while ((n/x)*x==n)
+    return res;
+    }
+
+  static NOINLINE double cost_guess (size_t n)
     {
-    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
-    n/=x;
-    limit=size_t(sqrt(n+0.01));
+    const double lfp=1.1; // penalty for non-hardcoded larger factors
+    size_t ni=n;
+    double result=0.;
+    while ((n&1)==0)
+      { result+=2; n>>=1; }
+
+    size_t limit=size_t(sqrt(n+0.01));
+    for (size_t x=3; x<=limit; x+=2)
+    while ((n/x)*x==n)
+      {
+      result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
+      n/=x;
+      limit=size_t(sqrt(n+0.01));
+      }
+    if (n>1) result+=(n<=5) ? n : lfp*n;
+
+    return result*ni;
     }
-  if (n>1) result+=(n<=5) ? n : lfp*n;
 
-  return result*ni;
-  }
+  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
+  static NOINLINE size_t good_size(size_t n)
+    {
+    if (n<=12) return n;
+
+    size_t bestfac=2*n;
+    for (size_t f2=1; f2<bestfac; f2*=2)
+      for (size_t f23=f2; f23<bestfac; f23*=3)
+        for (size_t f235=f23; f235<bestfac; f235*=5)
+          for (size_t f2357=f235; f2357<bestfac; f2357*=7)
+            for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
+              if (f235711>=n) bestfac=f235711;
+    return bestfac;
+    }
 
-/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
-NOINLINE size_t good_size(size_t n)
-  {
-  if (n<=12) return n;
-
-  size_t bestfac=2*n;
-  for (size_t f2=1; f2<bestfac; f2*=2)
-    for (size_t f23=f2; f23<bestfac; f23*=3)
-      for (size_t f235=f23; f235<bestfac; f235*=5)
-        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
-          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
-            if (f235711>=n) bestfac=f235711;
-  return bestfac;
-  }
+  static size_t prod(const shape_t &shape)
+    {
+    size_t res=1;
+    for (auto sz: shape)
+      res*=sz;
+    return res;
+    }
+  };
 
 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
@@ -397,12 +429,12 @@ template<typename T0> class cfftp
 
     size_t length, nfct;
     arr<cmplx<T0>> mem;
-    fctdata fct[NFCT];
+    fctdata fact[NFCT];
 
     void add_factor(size_t factor)
       {
       if (nfct>=NFCT) throw runtime_error("too many prime factors");
-      fct[nfct++].fct = factor;
+      fact[nfct++].fct = factor;
       }
 
 template<bool bwd, typename T> void pass2 (size_t ido, size_t l1,
@@ -844,33 +876,33 @@ template<bool bwd, typename T> void passg (size_t ido, size_t ip,
 #undef CX2
 #undef CX
 
-template<bool bwd, typename T> void pass_all(T c[], T0 fact)
+template<bool bwd, typename T> void pass_all(T c[], T0 fct)
   {
-  if (length==1) { c[0]*=fact; return; }
+  if (length==1) { c[0]*=fct; return; }
   size_t l1=1;
   arr<T> ch(length);
   T *p1=c, *p2=ch.data();
 
   for(size_t k1=0; k1<nfct; k1++)
     {
-    size_t ip=fct[k1].fct;
+    size_t ip=fact[k1].fct;
     size_t l2=ip*l1;
     size_t ido = length/l2;
     if     (ip==4)
-      pass4<bwd> (ido, l1, p1, p2, fct[k1].tw);
+      pass4<bwd> (ido, l1, p1, p2, fact[k1].tw);
     else if(ip==2)
-      pass2<bwd>(ido, l1, p1, p2, fct[k1].tw);
+      pass2<bwd>(ido, l1, p1, p2, fact[k1].tw);
     else if(ip==3)
-      pass3<bwd> (ido, l1, p1, p2, fct[k1].tw);
+      pass3<bwd> (ido, l1, p1, p2, fact[k1].tw);
     else if(ip==5)
-      pass5<bwd> (ido, l1, p1, p2, fct[k1].tw);
+      pass5<bwd> (ido, l1, p1, p2, fact[k1].tw);
     else if(ip==7)
-      pass7<bwd> (ido, l1, p1, p2, fct[k1].tw);
+      pass7<bwd> (ido, l1, p1, p2, fact[k1].tw);
     else if(ip==11)
-      pass11<bwd> (ido, l1, p1, p2, fct[k1].tw);
+      pass11<bwd> (ido, l1, p1, p2, fact[k1].tw);
     else
       {
-      passg<bwd>(ido, ip, l1, p1, p2, fct[k1].tw, fct[k1].tws);
+      passg<bwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
       swap(p1,p2);
       }
     swap(p1,p2);
@@ -878,16 +910,16 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
     }
   if (p1!=c)
     {
-    if (fact!=1.)
+    if (fct!=1.)
       for (size_t i=0; i<length; ++i)
-        c[i] = ch[i]*fact;
+        c[i] = ch[i]*fct;
     else
       memcpy (c,p1,length*sizeof(T));
     }
   else
-    if (fact!=1.)
+    if (fct!=1.)
       for (size_t i=0; i<length; ++i)
-        c[i] *= fact;
+        c[i] *= fct;
   }
 
 #undef WA
@@ -913,9 +945,9 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
         len>>=1;
         // factor 2 should be at the front of the factor list
         add_factor(2);
-        swap(fct[0].fct, fct[nfct-1].fct);
+        swap(fact[0].fct, fact[nfct-1].fct);
         }
-      size_t maxl=(size_t)(sqrt((double)len))+1;
+      size_t maxl = size_t(sqrt(double(len)))+1;
       for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2)
         if ((len%divisor)==0)
           {
@@ -924,7 +956,7 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
             add_factor(divisor);
             len/=divisor;
             }
-          maxl=size_t(sqrt((double)len))+1;
+          maxl=size_t(sqrt(double(len)))+1;
           }
       if (len>1) add_factor(len);
       }
@@ -934,7 +966,7 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
       size_t twsize=0, l1=1;
       for (size_t k=0; k<nfct; ++k)
         {
-        size_t ip=fct[k].fct, ido= length/(l1*ip);
+        size_t ip=fact[k].fct, ido= length/(l1*ip);
         twsize+=(ip-1)*(ido-1);
         if (ip>11)
           twsize+=ip;
@@ -951,18 +983,18 @@ template<bool bwd, typename T> void pass_all(T c[], T0 fact)
       size_t memofs=0;
       for (size_t k=0; k<nfct; ++k)
         {
-        size_t ip=fct[k].fct, ido=length/(l1*ip);
-        fct[k].tw=mem.data()+memofs;
+        size_t ip=fact[k].fct, ido=length/(l1*ip);
+        fact[k].tw=mem.data()+memofs;
         memofs+=(ip-1)*(ido-1);
         for (size_t j=1; j<ip; ++j)
           for (size_t i=1; i<ido; ++i)
-            fct[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
+            fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
         if (ip>11)
           {
-          fct[k].tws=mem.data()+memofs;
+          fact[k].tws=mem.data()+memofs;
           memofs+=ip;
           for (size_t j=0; j<ip; ++j)
-            fct[k].tws[j] = twiddle[j*l1*ido];
+            fact[k].tws[j] = twiddle[j*l1*ido];
           }
         l1*=ip;
         }
@@ -995,12 +1027,12 @@ template<typename T0> class rfftp
 
     size_t length, nfct;
     arr<T0> mem;
-    fctdata fct[NFCT];
+    fctdata fact[NFCT];
 
     void add_factor(size_t factor)
       {
       if (nfct>=NFCT) throw runtime_error("too many prime factors");
-      fct[nfct++].fct = factor;
+      fact[nfct++].fct = factor;
       }
 
 #define WA(x,i) wa[(i)+(x)*(ido-1)]
@@ -1632,9 +1664,9 @@ template<typename T> void copy_and_norm(T *c, T *p1, size_t n, T0 fct)
 
   public:
 
-template<typename T> void forward(T c[], T0 fact)
+template<typename T> void forward(T c[], T0 fct)
   {
-  if (length==1) { c[0]*=fact; return; }
+  if (length==1) { c[0]*=fct; return; }
   size_t n=length;
   size_t l1=n, nf=nfct;
   arr<T> ch(n);
@@ -1643,30 +1675,30 @@ template<typename T> void forward(T c[], T0 fact)
   for(size_t k1=0; k1<nf;++k1)
     {
     size_t k=nf-k1-1;
-    size_t ip=fct[k].fct;
+    size_t ip=fact[k].fct;
     size_t ido=n / l1;
     l1 /= ip;
     if(ip==4)
-      radf4(ido, l1, p1, p2, fct[k].tw);
+      radf4(ido, l1, p1, p2, fact[k].tw);
     else if(ip==2)
-      radf2(ido, l1, p1, p2, fct[k].tw);
+      radf2(ido, l1, p1, p2, fact[k].tw);
     else if(ip==3)
-      radf3(ido, l1, p1, p2, fct[k].tw);
+      radf3(ido, l1, p1, p2, fact[k].tw);
     else if(ip==5)
-      radf5(ido, l1, p1, p2, fct[k].tw);
+      radf5(ido, l1, p1, p2, fact[k].tw);
     else
       {
-      radfg(ido, ip, l1, p1, p2, fct[k].tw, fct[k].tws);
+      radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
       swap (p1,p2);
       }
     swap (p1,p2);
     }
-  copy_and_norm(c,p1,n,fact);
+  copy_and_norm(c,p1,n,fct);
   }
 
-template<typename T> void backward(T c[], T0 fact)
+template<typename T> void backward(T c[], T0 fct)
   {
-  if (length==1) { c[0]*=fact; return; }
+  if (length==1) { c[0]*=fct; return; }
   size_t n=length;
   size_t l1=1, nf=nfct;
   arr<T> ch(n);
@@ -1674,22 +1706,22 @@ template<typename T> void backward(T c[], T0 fact)
 
   for(size_t k=0; k<nf; k++)
     {
-    size_t ip = fct[k].fct,
+    size_t ip = fact[k].fct,
            ido= n/(ip*l1);
     if(ip==4)
-      radb4(ido, l1, p1, p2, fct[k].tw);
+      radb4(ido, l1, p1, p2, fact[k].tw);
     else if(ip==2)
-      radb2(ido, l1, p1, p2, fct[k].tw);
+      radb2(ido, l1, p1, p2, fact[k].tw);
     else if(ip==3)
-      radb3(ido, l1, p1, p2, fct[k].tw);
+      radb3(ido, l1, p1, p2, fact[k].tw);
     else if(ip==5)
-      radb5(ido, l1, p1, p2, fct[k].tw);
+      radb5(ido, l1, p1, p2, fact[k].tw);
     else
-      radbg(ido, ip, l1, p1, p2, fct[k].tw, fct[k].tws);
+      radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
     swap (p1,p2);
     l1*=ip;
     }
-  copy_and_norm(c,p1,n,fact);
+  copy_and_norm(c,p1,n,fct);
   }
 
   private:
@@ -1705,9 +1737,9 @@ void factorize()
     len>>=1;
     // factor 2 should be at the front of the factor list
     add_factor(2);
-    swap(fct[0].fct, fct[nfct-1].fct);
+    swap(fact[0].fct, fact[nfct-1].fct);
     }
-  size_t maxl=(size_t)(sqrt((double)len))+1;
+  size_t maxl = size_t(sqrt(double(len)))+1;
   for (size_t divisor=3; (len>1)&&(divisor<maxl); divisor+=2)
     if ((len%divisor)==0)
       {
@@ -1716,7 +1748,7 @@ void factorize()
         add_factor(divisor);
         len/=divisor;
         }
-      maxl=(size_t)(sqrt((double)len))+1;
+      maxl=size_t(sqrt(double(len)))+1;
       }
   if (len>1) add_factor(len);
   }
@@ -1726,7 +1758,7 @@ size_t twsize() const
   size_t twsz=0, l1=1;
   for (size_t k=0; k<nfct; ++k)
     {
-    size_t ip=fct[k].fct, ido=length/(l1*ip);
+    size_t ip=fact[k].fct, ido=length/(l1*ip);
     twsz+=(ip-1)*(ido-1);
     if (ip>5) twsz+=2*ip;
     l1*=ip;
@@ -1741,28 +1773,28 @@ void comp_twiddle()
   T0 *ptr=mem.data();
   for (size_t k=0; k<nfct; ++k)
     {
-    size_t ip=fct[k].fct, ido=length/(l1*ip);
+    size_t ip=fact[k].fct, ido=length/(l1*ip);
     if (k<nfct-1) // last factor doesn't need twiddles
       {
-      fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
+      fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
       for (size_t j=1; j<ip; ++j)
         for (size_t i=1; i<=(ido-1)/2; ++i)
           {
-          fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
-          fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
+          fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
+          fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
           }
       }
     if (ip>5) // special factors required by *g functions
       {
-      fct[k].tws=ptr; ptr+=2*ip;
-      fct[k].tws[0] = 1.;
-      fct[k].tws[1] = 0.;
+      fact[k].tws=ptr; ptr+=2*ip;
+      fact[k].tws[0] = 1.;
+      fact[k].tws[1] = 0.;
       for (size_t i=1; i<=(ip>>1); ++i)
         {
-        fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
-        fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
-        fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
-        fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
+        fact[k].tws[2*i  ] = twid[2*i*(length/ip)];
+        fact[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
+        fact[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
+        fact[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
         }
       }
     l1*=ip;
@@ -1822,7 +1854,7 @@ template<typename T0> class fftblue
 
   public:
     NOINLINE fftblue(size_t length)
-      : n(length), n2(good_size(n*2-1)), plan(n2), mem(n+n2),
+      : n(length), n2(util::good_size(n*2-1)), plan(n2), mem(n+n2),
         bk(mem.data()), bkf(mem.data()+n)
       {
       /* initialize b_k */
@@ -1858,7 +1890,8 @@ template<typename T0> class fftblue
       {
       arr<cmplx<T>> tmp(n);
       tmp[0].Set(c[0],c[0]*0);
-      memcpy ((void *)(tmp.data()+1),(void *)(c+1), (n-1)*sizeof(T));
+      memcpy (reinterpret_cast<void *>(tmp.data()+1),
+              reinterpret_cast<void *>(c+1), (n-1)*sizeof(T));
       if ((n&1)==0) tmp[n/2].i=c[0]*0.;
       for (size_t m=1; 2*m<n; ++m)
         tmp[n-m].Set(tmp[m].r, -tmp[m].i);
@@ -1894,13 +1927,13 @@ template<typename T0> class pocketfft_c
       : len(length)
       {
       if (length==0) throw runtime_error("zero-length FFT requested");
-      if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+      if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length)))
         {
         packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
         return;
         }
-      double comp1 = cost_guess(length);
-      double comp2 = 2*cost_guess(good_size(2*length-1));
+      double comp1 = util::cost_guess(length);
+      double comp2 = 2*util::cost_guess(util::good_size(2*length-1));
       comp2*=1.5; /* fudge factor that appears to give good overall performance */
       if (comp2<comp1) // use Bluestein
         blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
@@ -1908,16 +1941,16 @@ template<typename T0> class pocketfft_c
         packplan=unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
       }
 
-    template<typename T> NOINLINE void backward(T c[], T0 fct)
+    template<typename T> NOINLINE void backward(cmplx<T> c[], T0 fct)
       {
-      packplan ? packplan->backward((cmplx<T> *)c,fct)
-               : blueplan->backward((cmplx<T> *)c,fct);
+      packplan ? packplan->backward(c,fct)
+               : blueplan->backward(c,fct);
       }
 
-    template<typename T> NOINLINE void forward(T c[], T0 fct)
+    template<typename T> NOINLINE void forward(cmplx<T> c[], T0 fct)
       {
-      packplan ? packplan->forward((cmplx<T> *)c,fct)
-               : blueplan->forward((cmplx<T> *)c,fct);
+      packplan ? packplan->forward(c,fct)
+               : blueplan->forward(c,fct);
       }
 
     size_t length() const { return len; }
@@ -1939,13 +1972,13 @@ template<typename T0> class pocketfft_r
       : len(length)
       {
       if (length==0) throw runtime_error("zero-length FFT requested");
-      if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+      if ((length<50) || (util::largest_prime_factor(length)<=sqrt(length)))
         {
         packplan=unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
         return;
         }
-      double comp1 = 0.5*cost_guess(length);
-      double comp2 = 2*cost_guess(good_size(2*length-1));
+      double comp1 = 0.5*util::cost_guess(length);
+      double comp2 = 2*util::cost_guess(util::good_size(2*length-1));
       comp2*=1.5; /* fudge factor that appears to give good overall performance */
       if (comp2<comp1) // use Bluestein
         blueplan=unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
@@ -1972,14 +2005,6 @@ template<typename T0> class pocketfft_r
 // multi-D infrastructure
 //
 
-size_t prod(const shape_t &shape)
-  {
-  size_t res=1;
-  for (auto sz: shape)
-    res*=sz;
-  return res;
-  }
-
 template<typename T> class ndarr
   {
   private:
@@ -1996,7 +2021,7 @@ template<typename T> class ndarr
       : d(reinterpret_cast<char *>(data_)),
         cd(reinterpret_cast<const char *>(data_)), shp(shape_), str(stride_) {}
     size_t ndim() const { return shp.size(); }
-    size_t size() const { return prod(shp); }
+    size_t size() const { return util::prod(shp); }
     const shape_t &shape() const { return shp; }
     size_t shape(size_t i) const { return shp[i]; }
     const stride_t &stride() const { return str; }
@@ -2067,20 +2092,6 @@ template<size_t N, typename Ti, typename To> class multi_iter
     bool contiguous_out() const { return str_o==sizeof(To); }
   };
 
-
-#ifndef POCKETFFT_NO_VECTORS
-#if (defined(__AVX512F__))
-#define HAVE_VECSUPPORT
-constexpr int VBYTELEN=64;
-#elif (defined(__AVX__))
-#define HAVE_VECSUPPORT
-constexpr int VBYTELEN=32;
-#elif (defined(__SSE2__))
-#define HAVE_VECSUPPORT
-constexpr int VBYTELEN=16;
-#endif
-#endif
-
 #if defined(HAVE_VECSUPPORT)
 template<typename T> struct VTYPE{};
 template<> struct VTYPE<double>
@@ -2104,14 +2115,14 @@ template<> struct VTYPE<float>
 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
   size_t axsize, size_t elemsize)
   {
-  auto othersize = prod(shape)/axsize;
+  auto othersize = util::prod(shape)/axsize;
   auto tmpsize = axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1);
   return arr<char>(tmpsize*elemsize);
   }
 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
   const shape_t &axes, size_t elemsize)
   {
-  size_t fullsize=prod(shape);
+  size_t fullsize=util::prod(shape);
   size_t tmpsize=0;
   for (size_t i=0; i<axes.size(); ++i)
     {
@@ -2123,7 +2134,7 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
   return arr<char>(tmpsize*elemsize);
   }
 
-template<typename T> NOINLINE void pocketfft_general_c(
+template<typename T> NOINLINE void general_c(
   const ndarr<cmplx<T>> &in, ndarr<cmplx<T>> &out,
   const shape_t &axes, bool forward, T fct)
   {
@@ -2143,15 +2154,15 @@ template<typename T> NOINLINE void pocketfft_general_c(
         {
         using vtype = typename VTYPE<T>::type;
         it.advance(vlen);
-        auto tdatav = (cmplx<vtype> *)storage.data();
+        auto tdatav = reinterpret_cast<cmplx<vtype> *>(storage.data());
         for (size_t i=0; i<len; ++i)
           for (size_t j=0; j<vlen; ++j)
             {
             tdatav[i].r[j] = it.in(j,i).r;
             tdatav[i].i[j] = it.in(j,i).i;
             }
-        forward ? plan->forward ((vtype *)tdatav, fct)
-                : plan->backward((vtype *)tdatav, fct);
+        forward ? plan->forward (tdatav, fct)
+                : plan->backward(tdatav, fct);
         for (size_t i=0; i<len; ++i)
           for (size_t j=0; j<vlen; ++j)
             it.out(j,i).Set(tdatav[i].r[j],tdatav[i].i[j]);
@@ -2160,23 +2171,23 @@ template<typename T> NOINLINE void pocketfft_general_c(
     while (it.remaining()>0)
       {
       it.advance(1);
-      auto tdata = (cmplx<T> *)storage.data();
+      auto tdata = reinterpret_cast<cmplx<T> *>(storage.data());
       if (it.inplace() && it.contiguous_out()) // fully in-place
-        forward ? plan->forward ((T *)(&it.out(0)), fct)
-                : plan->backward((T *)(&it.out(0)), fct);
+        forward ? plan->forward ((&it.out(0)), fct)
+                : plan->backward((&it.out(0)), fct);
       else if (it.contiguous_out()) // compute FFT in output location
         {
         for (size_t i=0; i<len; ++i)
           it.out(i) = it.in(i);
-        forward ? plan->forward ((T *)(&it.out(0)), fct)
-                : plan->backward((T *)(&it.out(0)), fct);
+        forward ? plan->forward ((&it.out(0)), fct)
+                : plan->backward((&it.out(0)), fct);
         }
       else
         {
         for (size_t i=0; i<len; ++i)
           tdata[i] = it.in(i);
-        forward ? plan->forward ((T *)tdata, fct)
-                : plan->backward((T *)tdata, fct);
+        forward ? plan->forward (tdata, fct)
+                : plan->backward(tdata, fct);
         for (size_t i=0; i<len; ++i)
           it.out(i) = tdata[i];
         }
@@ -2185,7 +2196,7 @@ template<typename T> NOINLINE void pocketfft_general_c(
     }
   }
 
-template<typename T> NOINLINE void pocketfft_general_hartley(
+template<typename T> NOINLINE void general_hartley(
   const ndarr<T> &in, ndarr<T> &out, const shape_t &axes, T fct)
   {
   auto storage = alloc_tmp<T>(in.shape(), axes, sizeof(T));
@@ -2204,11 +2215,11 @@ template<typename T> NOINLINE void pocketfft_general_hartley(
         {
         using vtype = typename VTYPE<T>::type;
         it.advance(vlen);
-        auto tdatav = (vtype *)storage.data();
+        auto tdatav = reinterpret_cast<vtype *>(storage.data());
         for (size_t i=0; i<len; ++i)
           for (size_t j=0; j<vlen; ++j)
             tdatav[i][j] = it.in(j,i);
-        plan->forward((vtype *)tdatav, fct);
+        plan->forward(tdatav, fct);
         for (size_t j=0; j<vlen; ++j)
           it.out(j,0) = tdatav[0][j];
         size_t i=1, i1=1, i2=len-1;
@@ -2226,10 +2237,10 @@ template<typename T> NOINLINE void pocketfft_general_hartley(
     while (it.remaining()>0)
       {
       it.advance(1);
-      auto tdata = (T *)storage.data();
+      auto tdata = reinterpret_cast<T *>(storage.data());
       for (size_t i=0; i<len; ++i)
         tdata[i] = it.in(i);
-      plan->forward((T *)tdata, fct);
+      plan->forward(tdata, fct);
       // Hartley order
       it.out(0) = tdata[0];
       size_t i=1, i1=1, i2=len-1;
@@ -2245,7 +2256,7 @@ template<typename T> NOINLINE void pocketfft_general_hartley(
     }
   }
 
-template<typename T> NOINLINE void pocketfft_general_r2c(
+template<typename T> NOINLINE void general_r2c(
   const ndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, T fct)
   {
   auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T));
@@ -2260,11 +2271,11 @@ template<typename T> NOINLINE void pocketfft_general_r2c(
       {
       using vtype = typename VTYPE<T>::type;
       it.advance(vlen);
-      auto tdatav = (vtype *)storage.data();
+      auto tdatav = reinterpret_cast<vtype *>(storage.data());
       for (size_t i=0; i<len; ++i)
         for (size_t j=0; j<vlen; ++j)
           tdatav[i][j] = it.in(j,i);
-      plan.forward((vtype *)tdatav, fct);
+      plan.forward(tdatav, fct);
       for (size_t j=0; j<vlen; ++j)
         it.out(j,0).Set(tdatav[0][j]);
       size_t i=1, ii=1;
@@ -2279,7 +2290,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c(
   while (it.remaining()>0)
     {
     it.advance(1);
-    auto tdata = (T *)storage.data();
+    auto tdata = reinterpret_cast<T *>(storage.data());
     for (size_t i=0; i<len; ++i)
       tdata[i] = it.in(i);
     plan.forward(tdata, fct);
@@ -2291,7 +2302,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c(
       it.out(ii).Set(tdata[i]);
     }
   }
-template<typename T> NOINLINE void pocketfft_general_c2r(
+template<typename T> NOINLINE void general_c2r(
   const ndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, T fct)
   {
   auto storage = alloc_tmp<T>(out.shape(), out.shape(axis), sizeof(T));
@@ -2306,9 +2317,10 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
       {
       using vtype = typename VTYPE<T>::type;
       it.advance(vlen);
-      auto tdatav = (vtype *)storage.data();
+      auto tdatav = reinterpret_cast<vtype *>(storage.data());
       for (size_t j=0; j<vlen; ++j)
         tdatav[0][j]=it.in(j,0).r;
+      {
       size_t i=1, ii=1;
       for (; i<len-1; i+=2, ++ii)
         for (size_t j=0; j<vlen; ++j)
@@ -2319,6 +2331,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
       if (i<len)
         for (size_t j=0; j<vlen; ++j)
           tdatav[i][j] = it.in(j,ii).r;
+      }
       plan.backward(tdatav, fct);
       for (size_t i=0; i<len; ++i)
         for (size_t j=0; j<vlen; ++j)
@@ -2328,8 +2341,9 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
   while (it.remaining()>0)
     {
     it.advance(1);
-    auto tdata = (T *)storage.data();
+    auto tdata = reinterpret_cast<T *>(storage.data());
     tdata[0]=it.in(0).r;
+    {
     size_t i=1, ii=1;
     for (; i<len-1; i+=2, ++ii)
       {
@@ -2338,13 +2352,14 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
       }
     if (i<len)
       tdata[i] = it.in(ii).r;
+    }
     plan.backward(tdata, fct);
     for (size_t i=0; i<len; ++i)
       it.out(i) = tdata[i];
     }
   }
 
-template<typename T> NOINLINE void pocketfft_general_r(
+template<typename T> NOINLINE void general_r(
   const ndarr<T> &in, ndarr<T> &out, size_t axis, bool forward, T fct)
   {
   auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T));
@@ -2359,7 +2374,7 @@ template<typename T> NOINLINE void pocketfft_general_r(
       {
       using vtype = typename VTYPE<T>::type;
       it.advance(vlen);
-      auto tdatav = (vtype *)storage.data();
+      auto tdatav = reinterpret_cast<vtype *>(storage.data());
       for (size_t i=0; i<len; ++i)
         for (size_t j=0; j<vlen; ++j)
           tdatav[i][j] = it.in(j,i);
@@ -2373,7 +2388,7 @@ template<typename T> NOINLINE void pocketfft_general_r(
   while (it.remaining()>0)
     {
     it.advance(1);
-    auto tdata = (T *)storage.data();
+    auto tdata = reinterpret_cast<T *>(storage.data());
     if (it.inplace() && it.contiguous_out()) // fully in-place
       forward ? plan.forward (&it.out(0), fct)
               : plan.backward(&it.out(0), fct);
@@ -2407,7 +2422,7 @@ template<typename T> void c2c(const shape_t &shape,
   using namespace detail;
   ndarr<cmplx<T>> ain(data_in, shape, stride_in);
   ndarr<cmplx<T>> aout(data_out, shape, stride_out);
-  pocketfft_general_c(ain, aout, axes, forward, fct);
+  general_c(ain, aout, axes, forward, fct);
   }
 
 template<typename T> void r2c(const shape_t &shape,
@@ -2417,7 +2432,7 @@ template<typename T> void r2c(const shape_t &shape,
   using namespace detail;
   ndarr<T> ain(data_in, shape, stride_in);
   ndarr<cmplx<T>> aout(data_out, shape, stride_out);
-  pocketfft_general_r2c(ain, aout, axis, fct);
+  general_r2c(ain, aout, axis, fct);
   }
 
 template<typename T> void c2r(const shape_t &shape, size_t new_size,
@@ -2429,7 +2444,7 @@ template<typename T> void c2r(const shape_t &shape, size_t new_size,
   shape_out[axis] = new_size;
   ndarr<cmplx<T>> ain(data_in, shape, stride_in);
   ndarr<T> aout(data_out, shape_out, stride_out);
-  pocketfft_general_c2r(ain, aout, axis, fct);
+  general_c2r(ain, aout, axis, fct);
   }
 
 template<typename T> void r2r_fftpack(const shape_t &shape,
@@ -2439,7 +2454,7 @@ template<typename T> void r2r_fftpack(const shape_t &shape,
   using namespace detail;
   ndarr<T> ain(data_in, shape, stride_in);
   ndarr<T> aout(data_out, shape, stride_out);
-  pocketfft_general_r(ain, aout, axis, forward, fct);
+  general_r(ain, aout, axis, forward, fct);
   }
 
 template<typename T> void r2r_hartley(const shape_t &shape,
@@ -2449,7 +2464,7 @@ template<typename T> void r2r_hartley(const shape_t &shape,
   using namespace detail;
   ndarr<T> ain(data_in, shape, stride_in);
   ndarr<T> aout(data_out, shape, stride_out);
-  pocketfft_general_hartley(ain, aout, axes, fct);
+  general_hartley(ain, aout, axes, fct);
   }
 
 } // namespace pocketfft
diff --git a/pypocketfft.cc b/pypocketfft.cc
index 2e2cd9a..96b2ad6 100644
--- a/pypocketfft.cc
+++ b/pypocketfft.cc
@@ -85,7 +85,7 @@ template<typename T> py::array xfftn_internal(const py::array &in,
   py::array res = inplace ? in : py::array_t<complex<T>>(dims);
   ndarr<cmplx<T>> ain(in.data(), dims, copy_strides(in));
   ndarr<cmplx<T>> aout(res.mutable_data(), dims, copy_strides(res));
-  pocketfft_general_c<T>(ain, aout, axes, fwd, fct);
+  general_c<T>(ain, aout, axes, fwd, fct);
   return res;
   }
 
@@ -110,10 +110,10 @@ template<typename T> py::array rfftn_internal(const py::array &in,
   py::array res = py::array_t<complex<T>>(dims_out);
   ndarr<T> ain(in.data(), dims_in, copy_strides(in));
   ndarr<cmplx<T>> aout(res.mutable_data(), dims_out, copy_strides(res));
-  pocketfft_general_r2c<T>(ain, aout, axes.back(), fct);
+  general_r2c<T>(ain, aout, axes.back(), fct);
   if (axes.size()==1) return res;
   shape_t axes2(axes.begin(), --axes.end());
-  pocketfft_general_c<T>(aout, aout, axes2, true, 1.);
+  general_c<T>(aout, aout, axes2, true, 1.);
   return res;
   }
 py::array rfftn(const py::array &in, py::object axes_, double fct)
@@ -128,7 +128,7 @@ template<typename T> py::array xrfft_scipy(const py::array &in,
   py::array res = inplace ? in : py::array_t<T>(dims);
   ndarr<T> ain(in.data(), dims, copy_strides(in));
   ndarr<T> aout(res.mutable_data(), dims, copy_strides(res));
-  pocketfft_general_r<T>(ain, aout, axis, fwd, fct);
+  general_r<T>(ain, aout, axis, fwd, fct);
   return res;
   }
 py::array rfft_scipy(const py::array &in, size_t axis, double fct, bool inplace)
@@ -160,7 +160,7 @@ template<typename T> py::array irfftn_internal(const py::array &in,
   py::array res = py::array_t<T>(dims_out);
   ndarr<cmplx<T>> ain(inter.data(), copy_shape(inter), copy_strides(inter));
   ndarr<T> aout(res.mutable_data(), dims_out, copy_strides(res));
-  pocketfft_general_c2r<T>(ain, aout, axis, fct);
+  general_c2r<T>(ain, aout, axis, fct);
   return res;
   }
 py::array irfftn(const py::array &in, py::object axes_, size_t lastsize,
@@ -178,7 +178,7 @@ template<typename T> py::array hartley_internal(const py::array &in,
   py::array res = inplace ? in : py::array_t<T>(dims);
   ndarr<T> ain(in.data(), copy_shape(in), copy_strides(in));
   ndarr<T> aout(res.mutable_data(), copy_shape(res), copy_strides(res));
-  pocketfft_general_hartley<T>(ain, aout, makeaxes(in, axes_), fct);
+  general_hartley<T>(ain, aout, makeaxes(in, axes_), fct);
   return res;
   }
 py::array hartley(const py::array &in, py::object axes_, double fct,
-- 
GitLab