From 7e4b35133beda71192ee865fe29a46cae1bc21eb Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Thu, 2 May 2019 11:44:03 +0200
Subject: [PATCH 1/5] smaller memory consumption during twiddle generation

---
 pocketfft.cc | 129 ++++++++++++++++++++++++++-------------------------
 1 file changed, 66 insertions(+), 63 deletions(-)

diff --git a/pocketfft.cc b/pocketfft.cc
index bed7179..9026678 100644
--- a/pocketfft.cc
+++ b/pocketfft.cc
@@ -72,14 +72,58 @@ template<typename T> struct arr
     size_t size() const { return sz; }
   };
 
+template<typename T> struct cmplx {
+  T r, i;
+  cmplx() {}
+  cmplx(T r_, T i_) : r(r_), i(i_) {}
+  void Set(T r_, T i_) { r=r_; i=i_; }
+  void Set(T r_) { r=r_; i=T(0); }
+  cmplx &operator+= (const cmplx &other)
+    { r+=other.r; i+=other.i; return *this; }
+  template<typename T2>cmplx &operator*= (T2 other)
+    { r*=other; i*=other; return *this; }
+  cmplx operator+ (const cmplx &other) const
+    { return cmplx(r+other.r, i+other.i); }
+  cmplx operator- (const cmplx &other) const
+    { return cmplx(r-other.r, i-other.i); }
+  template<typename T2> auto operator* (const T2 &other) const
+    -> cmplx<decltype(r*other)>
+    {
+    return {r*other, i*other};
+    }
+  template<typename T2> auto operator* (const cmplx<T2> &other) const
+    -> cmplx<decltype(r+other.r)>
+    {
+    return {r*other.r-i*other.i, r*other.i + i*other.r};
+    }
+  template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const
+    -> cmplx<decltype(r+other.r)>
+    {
+    if (bwd)
+      return {r*other.r-i*other.i, r*other.i + i*other.r};
+    else
+      return {r*other.r+i*other.i, i*other.r - r*other.i};
+    }
+};
+template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b,
+  const cmplx<T> &c, const cmplx<T> &d)
+  { a = c+d; b = c-d; }
+template<typename T> cmplx<T> conj(const cmplx<T> &a)
+  { return {a.r, -a.i}; }
+
+template<typename T> void ROT90(cmplx<T> &a)
+  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+template<typename T> void ROTM90(cmplx<T> &a)
+  { auto tmp_=-a.r; a.r=a.i; a.i=tmp_; }
+
 //
 // twiddle factor section
 //
 
-class sincos_2pibyn
+template<typename T> class sincos_2pibyn
   {
   private:
-    arr<double> data;
+    arr<T> data;
 
     // adapted from https://stackoverflow.com/questions/42792939/
     // CAUTION: this function only works for arguments in the range
@@ -110,15 +154,20 @@ class sincos_2pibyn
       res[1] = s;
       }
 
-    NOINLINE void calc_first_octant(size_t den, double * restrict res)
+    NOINLINE void calc_first_octant(size_t den, T * restrict res)
       {
       size_t n = (den+4)>>3;
       if (n==0) return;
       res[0]=1.; res[1]=0.;
       if (n==1) return;
       size_t l1=(size_t)sqrt(n);
+      arr<double> tmp(2*l1);
       for (size_t i=1; i<l1; ++i)
-        my_sincosm1pi((2.*i)/den,&res[2*i]);
+        {
+        my_sincosm1pi((2.*i)/den,&tmp[2*i]);
+        res[2*i  ] = tmp[2*i]+1.;
+        res[2*i+1] = tmp[2*i+1];
+        }
       size_t start=l1;
       while(start<n)
         {
@@ -130,19 +179,17 @@ class sincos_2pibyn
         if (start+end>n) end = n-start;
         for (size_t i=1; i<end; ++i)
           {
-          double csx[2]={res[2*i], res[2*i+1]};
+          double csx[2]={tmp[2*i], tmp[2*i+1]};
           res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
           res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
           }
         start += l1;
         }
-      for (size_t i=1; i<l1; ++i)
-        res[2*i] += 1.;
       }
 
-    NOINLINE void calc_first_quadrant(size_t n, double * restrict res)
+    NOINLINE void calc_first_quadrant(size_t n, T * restrict res)
       {
-      double * restrict p = res+n;
+      T * restrict p = res+n;
       calc_first_octant(n<<1, p);
       size_t ndone=(n+2)>>2;
       size_t i=0, idx1=0, idx2=2*ndone-2;
@@ -160,10 +207,10 @@ class sincos_2pibyn
         }
       }
 
-    NOINLINE void calc_first_half(size_t n, double * restrict res)
+    NOINLINE void calc_first_half(size_t n, T * restrict res)
       {
       int ndone=(n+1)>>1;
-      double * p = res+n-1;
+      T * p = res+n-1;
       calc_first_octant(n<<2, p);
       int i4=0, in=n, i=0;
       for (; i4<=in-i4; ++i, i4+=4) // octant 0
@@ -187,7 +234,7 @@ class sincos_2pibyn
         }
       }
 
-    NOINLINE void fill_first_quadrant(size_t n, double * restrict res)
+    NOINLINE void fill_first_quadrant(size_t n, T * restrict res)
       {
       const double hsqt2 = 0.707106781186547524400844362104849;
       size_t quart = n>>2;
@@ -200,7 +247,7 @@ class sincos_2pibyn
         }
       }
 
-    NOINLINE void fill_first_half(size_t n, double * restrict res)
+    NOINLINE void fill_first_half(size_t n, T * restrict res)
       {
       size_t half = n>>1;
       if ((n&3)==0)
@@ -217,7 +264,7 @@ class sincos_2pibyn
           }
       }
 
-    NOINLINE void fill_second_half(size_t n, double * restrict res)
+    NOINLINE void fill_second_half(size_t n, T * restrict res)
       {
       if ((n&1)==0)
         for (size_t i=0; i<n; ++i)
@@ -230,7 +277,7 @@ class sincos_2pibyn
           }
       }
 
-    NOINLINE void sincos_2pibyn_half(size_t n, double * restrict res)
+    NOINLINE void sincos_2pibyn_half(size_t n, T * restrict res)
       {
       if ((n&3)==0)
         {
@@ -255,7 +302,7 @@ class sincos_2pibyn
       if (!half) fill_second_half(n, data.data());
       }
 
-    double operator[](size_t idx) const { return data[idx]; }
+    T operator[](size_t idx) const { return data[idx]; }
   };
 
 NOINLINE size_t largest_prime_factor (size_t n)
@@ -313,50 +360,6 @@ NOINLINE size_t good_size(size_t n)
   return bestfac;
   }
 
-template<typename T> struct cmplx {
-  T r, i;
-  cmplx() {}
-  cmplx(T r_, T i_) : r(r_), i(i_) {}
-  void Set(T r_, T i_) { r=r_; i=i_; }
-  void Set(T r_) { r=r_; i=T(0); }
-  cmplx &operator+= (const cmplx &other)
-    { r+=other.r; i+=other.i; return *this; }
-  template<typename T2>cmplx &operator*= (T2 other)
-    { r*=other; i*=other; return *this; }
-  cmplx operator+ (const cmplx &other) const
-    { return cmplx(r+other.r, i+other.i); }
-  cmplx operator- (const cmplx &other) const
-    { return cmplx(r-other.r, i-other.i); }
-  template<typename T2> auto operator* (const T2 &other) const
-    -> cmplx<decltype(r*other)>
-    {
-    return {r*other, i*other};
-    }
-  template<typename T2> auto operator* (const cmplx<T2> &other) const
-    -> cmplx<decltype(r+other.r)>
-    {
-    return {r*other.r-i*other.i, r*other.i + i*other.r};
-    }
-  template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const
-    -> cmplx<decltype(r+other.r)>
-    {
-    if (bwd)
-      return {r*other.r-i*other.i, r*other.i + i*other.r};
-    else
-      return {r*other.r+i*other.i, i*other.r - r*other.i};
-    }
-};
-template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b,
-  const cmplx<T> &c, const cmplx<T> &d)
-  { a = c+d; b = c-d; }
-template<typename T> cmplx<T> conj(const cmplx<T> &a)
-  { return {a.r, -a.i}; }
-
-template<typename T> void ROT90(cmplx<T> &a)
-  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
-template<typename T> void ROTM90(cmplx<T> &a)
-  { auto tmp_=-a.r; a.r=a.i; a.i=tmp_; }
-
 #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
 #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
 #define WA(x,i) wa[(i)-1+(x)*(ido-1)]
@@ -910,7 +913,7 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
 
     NOINLINE void comp_twiddle()
       {
-      sincos_2pibyn twid(length, false);
+      sincos_2pibyn<T0> twid(length, false);
       size_t l1=1;
       size_t memofs=0;
       for (size_t k=0; k<nfct; ++k)
@@ -1705,7 +1708,7 @@ size_t twsize() const
 
 NOINLINE void comp_twiddle()
   {
-  sincos_2pibyn twid(length, true);
+  sincos_2pibyn<T0> twid(length, true);
   size_t l1=1;
   T0 *ptr=mem.data();
   for (size_t k=0; k<nfct; ++k)
@@ -1807,7 +1810,7 @@ template<typename T0> class fftblue
         bk(mem.data()), bkf(mem.data()+2*n)
       {
       /* initialize b_k */
-      sincos_2pibyn tmp(2*n, false);
+      sincos_2pibyn<T0> tmp(2*n, false);
       bk[0] = 1;
       bk[1] = 0;
 
-- 
GitLab


From 18adaa57d44610dc7cc7a76ed1f213c3fc33f086 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 3 May 2019 09:05:21 +0200
Subject: [PATCH 2/5] more complex arrays

---
 pocketfft.cc | 117 ++++++++++++++++++++-------------------------------
 1 file changed, 45 insertions(+), 72 deletions(-)

diff --git a/pocketfft.cc b/pocketfft.cc
index 9026678..d7ea57c 100644
--- a/pocketfft.cc
+++ b/pocketfft.cc
@@ -88,21 +88,15 @@ template<typename T> struct cmplx {
     { return cmplx(r-other.r, i-other.i); }
   template<typename T2> auto operator* (const T2 &other) const
     -> cmplx<decltype(r*other)>
-    {
-    return {r*other, i*other};
-    }
+    { return {r*other, i*other}; }
   template<typename T2> auto operator* (const cmplx<T2> &other) const
     -> cmplx<decltype(r+other.r)>
-    {
-    return {r*other.r-i*other.i, r*other.i + i*other.r};
-    }
+    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
   template<bool bwd, typename T2> auto special_mul (const cmplx<T2> &other) const
     -> cmplx<decltype(r+other.r)>
     {
-    if (bwd)
-      return {r*other.r-i*other.i, r*other.i + i*other.r};
-    else
-      return {r*other.r+i*other.i, i*other.r - r*other.i};
+    return bwd ? cmplx(r*other.r-i*other.i, r*other.i + i*other.r)
+               : cmplx(r*other.r+i*other.i, i*other.r - r*other.i);
     }
 };
 template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b,
@@ -303,6 +297,9 @@ template<typename T> class sincos_2pibyn
       }
 
     T operator[](size_t idx) const { return data[idx]; }
+    const T *rdata() const { return data; }
+    const cmplx<T> *cdata() const
+      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
   };
 
 NOINLINE size_t largest_prime_factor (size_t n)
@@ -914,6 +911,7 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
     NOINLINE void comp_twiddle()
       {
       sincos_2pibyn<T0> twid(length, false);
+      auto twiddle = twid.cdata();
       size_t l1=1;
       size_t memofs=0;
       for (size_t k=0; k<nfct; ++k)
@@ -923,19 +921,13 @@ template<bool bwd, typename T> NOINLINE void pass_all(T c[], T0 fact)
         memofs+=(ip-1)*(ido-1);
         for (size_t j=1; j<ip; ++j)
           for (size_t i=1; i<ido; ++i)
-            {
-            fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
-            fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
-            }
+            fct[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
         if (ip>11)
           {
           fct[k].tws=mem.data()+memofs;
           memofs+=ip;
           for (size_t j=0; j<ip; ++j)
-            {
-            fct[k].tws[j].r = twid[2*j*l1*ido];
-            fct[k].tws[j].i = twid[2*j*l1*ido+1];
-            }
+            fct[k].tws[j] = twiddle[j*l1*ido];
           }
         l1*=ip;
         }
@@ -1746,7 +1738,7 @@ NOINLINE void comp_twiddle()
 NOINLINE rfftp(size_t length_)
   : length(length_)
   {
-  if (length==0) throw runtime_error("zero_sized FFT");
+  if (length==0) throw runtime_error("zero-sized FFT");
   nfct=0;
   if (length==1) return;
   factorize();
@@ -1765,82 +1757,66 @@ template<typename T0> class fftblue
   private:
     size_t n, n2;
     cfftp<T0> plan;
-    arr<T0> mem;
-    T0 *bk, *bkf;
+    arr<cmplx<T0>> mem;
+    cmplx<T0> *bk, *bkf;
 
     template<bool bwd, typename T> NOINLINE
-    void fft(T c[], T0 fct)
+    void fft(cmplx<T> c[], T0 fct)
       {
-      constexpr T0 isign = bwd ? 1 : -1;
-      arr<T> akf(2*n2);
+      arr<cmplx<T>> akf(n2);
 
       /* initialize a_k and FFT it */
-      for (size_t m=0; m<2*n; m+=2)
-        {
-        akf[m]   = c[m]*bk[m]   -isign*c[m+1]*bk[m+1];
-        akf[m+1] = isign*c[m]*bk[m+1] + c[m+1]*bk[m];
-        }
-      for (size_t m=2*n; m<2*n2; ++m)
-        akf[m]=0.*c[0];
+      for (size_t m=0; m<n; ++m)
+        akf[m] = c[m].template special_mul<bwd>(bk[m]);
+      for (size_t m=n; m<n2; ++m)
+        akf[m]=akf[0]*T0(0);
 
-      plan.forward ((cmplx<T> *)akf.data(),1.);
+      plan.forward (akf.data(),1.);
 
       /* do the convolution */
-      for (size_t m=0; m<2*n2; m+=2)
-        {
-        T im = -isign*akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
-        akf[m  ]  =  akf[m]*bkf[m]   + isign*akf[m+1]*bkf[m+1];
-        akf[m+1]  = im;
-        }
+      for (size_t m=0; m<n2; ++m)
+        akf[m] = akf[m].template special_mul<!bwd>(bkf[m]);
 
       /* inverse FFT */
-      plan.backward ((cmplx<T> *)akf.data(),1.);
+      plan.backward (akf.data(),1.);
 
       /* multiply by b_k */
-      for (size_t m=0; m<2*n; m+=2)
-        {
-        c[m]   = fct*(bk[m]  *akf[m] - isign*bk[m+1]*akf[m+1]);
-        c[m+1] = fct*(isign*bk[m+1]*akf[m] + bk[m]  *akf[m+1]);
-        }
+      for (size_t m=0; m<n; ++m)
+        c[m] = akf[m].template special_mul<bwd>(bk[m])*fct;
       }
 
   public:
     NOINLINE fftblue(size_t length)
-      : n(length), n2(good_size(n*2-1)), plan(n2), mem(2*(n+n2)),
-        bk(mem.data()), bkf(mem.data()+2*n)
+      : n(length), n2(good_size(n*2-1)), plan(n2), mem(n+n2),
+        bk(mem.data()), bkf(mem.data()+n)
       {
       /* initialize b_k */
-      sincos_2pibyn<T0> tmp(2*n, false);
-      bk[0] = 1;
-      bk[1] = 0;
+      sincos_2pibyn<T0> tmp_(2*n, false);
+      auto tmp = tmp_.cdata();
+      bk[0].Set(1, 0);
 
       size_t coeff=0;
       for (size_t m=1; m<n; ++m)
         {
         coeff+=2*m-1;
         if (coeff>=2*n) coeff-=2*n;
-        bk[2*m  ] = tmp[2*coeff  ];
-        bk[2*m+1] = tmp[2*coeff+1];
+        bk[m] = tmp[coeff];
         }
 
       /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
       T0 xn2 = 1./n2;
       bkf[0] = bk[0]*xn2;
-      bkf[1] = bk[1]*xn2;
-      for (size_t m=2; m<2*n; m+=2)
-        {
-        bkf[m]   = bkf[2*n2-m]   = bk[m]   *xn2;
-        bkf[m+1] = bkf[2*n2-m+1] = bk[m+1] *xn2;
-        }
-      for (size_t m=2*n;m<=(2*n2-2*n+1);++m)
-        bkf[m]=0.;
-      plan.forward((cmplx<T0> *)bkf,1.);
+      for (size_t m=1; m<n; ++m)
+        bkf[m] = bkf[n2-m] = bk[m]*xn2;
+      for (size_t m=n;m<=(n2-n);++m)
+        bkf[m].Set(0.,0.);
+      plan.forward(bkf,1.);
       }
 
-    template<typename T> NOINLINE void backward(T c[], T0 fct)
+    template<typename T> NOINLINE void backward(cmplx<T> c[], T0 fct)
       { fft<true>(c,fct); }
 
-    template<typename T> NOINLINE void forward(T c[], T0 fct)
+    template<typename T> NOINLINE void forward(cmplx<T> c[], T0 fct)
       { fft<false>(c,fct); }
 
     template<typename T> NOINLINE void backward_r(T c[], T0 fct)
@@ -1855,22 +1831,19 @@ template<typename T0> class fftblue
         tmp[2*n-m]=tmp[m];
         tmp[2*n-m+1]=-tmp[m+1];
         }
-      fft<true>(tmp.data(),fct);
+      fft<true>((cmplx<T> *)tmp.data(),fct);
       for (size_t m=0; m<n; ++m)
         c[m] = tmp[2*m];
       }
 
     template<typename T> NOINLINE void forward_r(T c[], T0 fct)
       {
-      arr<T> tmp(2*n);
+      arr<cmplx<T>> tmp(n);
       for (size_t m=0; m<n; ++m)
-        {
-        tmp[2*m] = c[m];
-        tmp[2*m+1] = c[m]*0.;
-        }
+        tmp[m].Set(c[m], 0.*c[m]);
       fft<false>(tmp.data(),fct);
-      c[0] = tmp[0];
-      memcpy (c+1, tmp.data()+2, (n-1)*sizeof(T));
+      c[0] = tmp[0].r;
+      memcpy (c+1, tmp.data()+1, (n-1)*sizeof(T));
       }
     };
 
@@ -1907,13 +1880,13 @@ template<typename T0> class pocketfft_c
     template<typename T> NOINLINE void backward(T c[], T0 fct)
       {
       packplan ? packplan->backward((cmplx<T> *)c,fct)
-               : blueplan->backward(c,fct);
+               : blueplan->backward((cmplx<T> *)c,fct);
       }
 
     template<typename T> NOINLINE void forward(T c[], T0 fct)
       {
       packplan ? packplan->forward((cmplx<T> *)c,fct)
-               : blueplan->forward(c,fct);
+               : blueplan->forward((cmplx<T> *)c,fct);
       }
 
     size_t length() const { return len; }
-- 
GitLab


From 2cdcfbca47eb160b44961d160b93fb2193323a5f Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 3 May 2019 09:55:06 +0200
Subject: [PATCH 3/5] hide all vector-related attributes behind #ifdef

---
 pocketfft.cc | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pocketfft.cc b/pocketfft.cc
index d7ea57c..dab8544 100644
--- a/pocketfft.cc
+++ b/pocketfft.cc
@@ -2066,15 +2066,9 @@ template<> struct VTYPE<float>
 #else
 template<typename T> struct VTYPE{};
 template<> struct VTYPE<double>
-  {
-  using type = double __attribute__ ((vector_size (8)));
-  static constexpr int vlen=1;
-  };
+  {  static constexpr int vlen=1; };
 template<> struct VTYPE<float>
-  {
-  using type = float __attribute__ ((vector_size (4)));
-  static constexpr int vlen=1;
-  };
+  { static constexpr int vlen=1; };
 #endif
 
 template<typename T> arr<char> alloc_tmp(const shape_t &shape,
@@ -2112,6 +2106,7 @@ template<typename T> NOINLINE void pocketfft_general_c(
     size_t len=it.length_in();
     if ((!plan) || (len!=plan->length()))
       plan.reset(new pocketfft_c<T>(len));
+#if defined(HAVE_VECSUPPORT)
     if (vlen>1)
       while (it.remaining()>=vlen)
         {
@@ -2130,6 +2125,7 @@ template<typename T> NOINLINE void pocketfft_general_c(
           for (size_t j=0; j<vlen; ++j)
             it.out(j,i).Set(tdatav[i].r[j],tdatav[i].i[j]);
         }
+#endif
     while (it.remaining()>0)
       {
       it.advance(1);
@@ -2171,6 +2167,7 @@ template<typename T> NOINLINE void pocketfft_general_hartley(
     size_t len=it.length_in();
     if ((!plan) || (len!=plan->length()))
       plan.reset(new pocketfft_r<T>(len));
+#if defined(HAVE_VECSUPPORT)
     if (vlen>1)
       while (it.remaining()>=vlen)
         {
@@ -2194,6 +2191,7 @@ template<typename T> NOINLINE void pocketfft_general_hartley(
           for (size_t j=0; j<vlen; ++j)
             it.out(j,i1) = tdatav[i][j];
         }
+#endif
     while (it.remaining()>0)
       {
       it.advance(1);
@@ -2225,6 +2223,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c(
   constexpr int vlen = VTYPE<T>::vlen;
   multi_iter<vlen, T, cmplx<T>> it(in, out, axis);
   size_t len=in.shape(axis);
+#if defined(HAVE_VECSUPPORT)
   if (vlen>1)
     while (it.remaining()>=vlen)
       {
@@ -2245,6 +2244,7 @@ template<typename T> NOINLINE void pocketfft_general_r2c(
         for (size_t j=0; j<vlen; ++j)
           it.out(j,ii).Set(tdatav[i][j]);
       }
+#endif
   while (it.remaining()>0)
     {
     it.advance(1);
@@ -2269,6 +2269,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
   constexpr int vlen = VTYPE<T>::vlen;
   multi_iter<vlen, cmplx<T>, T> it(in, out, axis);
   size_t len=out.shape(axis);
+#if defined(HAVE_VECSUPPORT)
   if (vlen>1)
     while (it.remaining()>=vlen)
       {
@@ -2292,6 +2293,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
         for (size_t j=0; j<vlen; ++j)
           it.out(j,i) = tdatav[i][j];
       }
+#endif
   while (it.remaining()>0)
     {
     it.advance(1);
-- 
GitLab


From 39524f2315536dc6560f958c2b237f8408f0508d Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Fri, 3 May 2019 13:21:55 +0200
Subject: [PATCH 4/5] restructuring; add interface for scipy emulation

---
 pocketfft.cc | 143 +++++++++++++++++++++++++++++++++++++++------------
 test.py      |  56 ++++++++++----------
 2 files changed, 135 insertions(+), 64 deletions(-)

diff --git a/pocketfft.cc b/pocketfft.cc
index dab8544..1c65916 100644
--- a/pocketfft.cc
+++ b/pocketfft.cc
@@ -976,8 +976,8 @@ template<typename T0> class rfftp
 #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
 #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
 
-template<typename T> NOINLINE void radf2 (size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T> NOINLINE void radf2 (size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=2;
 
@@ -1001,8 +1001,8 @@ template<typename T> NOINLINE void radf2 (size_t ido, size_t l1, const T * restr
       }
   }
 
-template<typename T> NOINLINE void radf3(size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T> NOINLINE void radf3(size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=3;
   constexpr T0 taur=-0.5, taui=0.86602540378443864676;
@@ -1035,8 +1035,8 @@ template<typename T> NOINLINE void radf3(size_t ido, size_t l1, const T * restri
       }
   }
 
-template<typename T> NOINLINE void radf4(size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T> NOINLINE void radf4(size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=4;
   constexpr T0 hsqt2=0.70710678118654752440;
@@ -1076,8 +1076,8 @@ template<typename T> NOINLINE void radf4(size_t ido, size_t l1, const T * restri
       }
   }
 
-template<typename T> NOINLINE void radf5(size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T> NOINLINE void radf5(size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=5;
   constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
@@ -1301,8 +1301,8 @@ template<typename T> NOINLINE void radb2(size_t ido, size_t l1, const T * restri
       }
   }
 
-template<typename T> NOINLINE void radb3(size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T> NOINLINE void radb3(size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=3;
   constexpr T0 taur=-0.5, taui=0.86602540378443864676;
@@ -1336,8 +1336,8 @@ template<typename T> NOINLINE void radb3(size_t ido, size_t l1, const T * restri
       }
   }
 
-template<typename T> NOINLINE void radb4(size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T> NOINLINE void radb4(size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=4;
   constexpr T0 sqrt2=1.41421356237309504880;
@@ -1382,8 +1382,8 @@ template<typename T> NOINLINE void radb4(size_t ido, size_t l1, const T * restri
       }
   }
 
-template<typename T>NOINLINE void radb5(size_t ido, size_t l1, const T * restrict cc,
-  T * restrict ch, const T0 * restrict wa)
+template<typename T>NOINLINE void radb5(size_t ido, size_t l1,
+  const T * restrict cc, T * restrict ch, const T0 * restrict wa)
   {
   constexpr size_t cdim=5;
   constexpr T0 tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
@@ -1821,19 +1821,15 @@ template<typename T0> class fftblue
 
     template<typename T> NOINLINE void backward_r(T c[], T0 fct)
       {
-      arr<T> tmp(2*n);
-      tmp[0]=c[0];
-      tmp[1]=c[0]*0.;
-      memcpy (tmp.data()+2,c+1, (n-1)*sizeof(T));
-      if ((n&1)==0) tmp[n+1]=c[0]*0.;
-      for (size_t m=2; m<n; m+=2)
-        {
-        tmp[2*n-m]=tmp[m];
-        tmp[2*n-m+1]=-tmp[m+1];
-        }
-      fft<true>((cmplx<T> *)tmp.data(),fct);
+      arr<cmplx<T>> tmp(n);
+      tmp[0].Set(c[0],c[0]*0);
+      memcpy ((void *)(tmp.data()+1),(void *)(c+1), (n-1)*sizeof(T));
+      if ((n&1)==0) tmp[n/2].i=c[0]*0.;
+      for (size_t m=1; 2*m<n; ++m)
+        tmp[n-m].Set(tmp[m].r, -tmp[m].i);
+      fft<true>(tmp.data(),fct);
       for (size_t m=0; m<n; ++m)
-        c[m] = tmp[2*m];
+        c[m] = tmp[m].r;
       }
 
     template<typename T> NOINLINE void forward_r(T c[], T0 fct)
@@ -2056,12 +2052,12 @@ template<typename T> struct VTYPE{};
 template<> struct VTYPE<double>
   {
   using type = double __attribute__ ((vector_size (VBYTELEN)));
-  static constexpr int vlen=VBYTELEN/8;
+  static constexpr int vlen=VBYTELEN/sizeof(double);
   };
 template<> struct VTYPE<float>
   {
   using type = float __attribute__ ((vector_size (VBYTELEN)));
-  static constexpr int vlen=VBYTELEN/4;
+  static constexpr int vlen=VBYTELEN/sizeof(float);
   };
 #else
 template<typename T> struct VTYPE{};
@@ -2087,7 +2083,8 @@ template<typename T> arr<char> alloc_tmp(const shape_t &shape,
     {
     auto axsize = shape[axes[i]];
     auto othersize = fullsize/axsize;
-    tmpsize = max(tmpsize, axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1));
+    tmpsize = max(tmpsize,
+                  axsize*((othersize>=VTYPE<T>::vlen) ? VTYPE<T>::vlen : 1));
     }
   return arr<char>(tmpsize*elemsize);
   }
@@ -2119,7 +2116,7 @@ template<typename T> NOINLINE void pocketfft_general_c(
             tdatav[i].r[j] = it.in(j,i).r;
             tdatav[i].i[j] = it.in(j,i).i;
             }
-        forward ? plan->forward((vtype *)tdatav, fct)
+        forward ? plan->forward ((vtype *)tdatav, fct)
                 : plan->backward((vtype *)tdatav, fct);
         for (size_t i=0; i<len; ++i)
           for (size_t j=0; j<vlen; ++j)
@@ -2131,20 +2128,20 @@ template<typename T> NOINLINE void pocketfft_general_c(
       it.advance(1);
       auto tdata = (cmplx<T> *)storage.data();
       if (it.inplace() && it.contiguous_out()) // fully in-place
-        forward ? plan->forward((T *)(&it.in(0)), fct)
-                : plan->backward((T *)(&it.in(0)), fct);
+        forward ? plan->forward ((T *)(&it.out(0)), fct)
+                : plan->backward((T *)(&it.out(0)), fct);
       else if (it.contiguous_out()) // compute FFT in output location
         {
         for (size_t i=0; i<len; ++i)
           it.out(i) = it.in(i);
-        forward ? plan->forward((T *)(&it.out(0)), fct)
+        forward ? plan->forward ((T *)(&it.out(0)), fct)
                 : plan->backward((T *)(&it.out(0)), fct);
         }
       else
         {
         for (size_t i=0; i<len; ++i)
           tdata[i] = it.in(i);
-        forward ? plan->forward((T *)tdata, fct)
+        forward ? plan->forward ((T *)tdata, fct)
                 : plan->backward((T *)tdata, fct);
         for (size_t i=0; i<len; ++i)
           it.out(i) = tdata[i];
@@ -2313,6 +2310,58 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
     }
   }
 
+template<typename T> NOINLINE void pocketfft_general_r(
+  const ndarr<T> &in, ndarr<T> &out, int axis, bool forward, T fct)
+  {
+  auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T));
+
+  constexpr int vlen = VTYPE<T>::vlen;
+  multi_iter<vlen, T, T> it(in, out, axis);
+  size_t len=it.length_in();
+  pocketfft_r<T> plan(len);
+#if defined(HAVE_VECSUPPORT)
+  if (vlen>1)
+    while (it.remaining()>=vlen)
+      {
+      using vtype = typename VTYPE<T>::type;
+      it.advance(vlen);
+      auto tdatav = (vtype *)storage.data();
+      for (size_t i=0; i<len; ++i)
+        for (size_t j=0; j<vlen; ++j)
+          tdatav[i][j] = it.in(j,i);
+      forward ? plan.forward (tdatav, fct)
+              : plan.backward(tdatav, fct);
+      for (size_t i=0; i<len; ++i)
+        for (size_t j=0; j<vlen; ++j)
+          it.out(j,i) = tdatav[i][j];
+      }
+#endif
+  while (it.remaining()>0)
+    {
+    it.advance(1);
+    auto tdata = (T *)storage.data();
+    if (it.inplace() && it.contiguous_out()) // fully in-place
+      forward ? plan.forward (&it.out(0), fct)
+              : plan.backward(&it.out(0), fct);
+    else if (it.contiguous_out()) // compute FFT in output location
+      {
+      for (size_t i=0; i<len; ++i)
+        it.out(i) = it.in(i);
+      forward ? plan.forward (&it.out(0), fct)
+              : plan.backward(&it.out(0), fct);
+      }
+    else
+      {
+      for (size_t i=0; i<len; ++i)
+        tdata[i] = it.in(i);
+      forward ? plan.forward (tdata, fct)
+              : plan.backward(tdata, fct);
+      for (size_t i=0; i<len; ++i)
+        it.out(i) = tdata[i];
+      }
+    }
+  }
+
 //
 // Python interface
 //
@@ -2410,6 +2459,28 @@ py::array rfftn(const py::array &in, py::object axes_, double fct)
   return tcheck(in, f64, f32) ? rfftn_internal<double>(in, axes_, fct)
                               : rfftn_internal<float> (in, axes_, fct);
   }
+template<typename T> py::array xrfft_scipy(const py::array &in,
+  int axis, double fct, bool inplace, bool fwd)
+  {
+  auto dims(copy_shape(in));
+  py::array res = inplace ? in : py::array_t<T>(dims);
+  ndarr<T> ain(in.data(), dims, copy_strides(in));
+  ndarr<T> aout(res.mutable_data(), dims, copy_strides(res));
+  pocketfft_general_r<T>(ain, aout, axis, fwd, fct);
+  return res;
+  }
+py::array rfft_scipy(const py::array &in, int axis, double fct, bool inplace)
+  {
+  return tcheck(in, f64, f32) ?
+    xrfft_scipy<double>(in, axis, fct, inplace, true) :
+    xrfft_scipy<float> (in, axis, fct, inplace, true);
+  }
+py::array irfft_scipy(const py::array &in, int axis, double fct, bool inplace)
+  {
+  return tcheck(in, f64, f32) ?
+    xrfft_scipy<double>(in, axis, fct, inplace, false) :
+    xrfft_scipy<float> (in, axis, fct, inplace, false);
+  }
 template<typename T> py::array irfftn_internal(const py::array &in,
   py::object axes_, size_t lastsize, T fct)
   {
@@ -2639,6 +2710,10 @@ PYBIND11_MODULE(pypocketfft, m)
   m.def("ifftn",&ifftn, ifftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.,
     "inplace"_a=false);
   m.def("rfftn",&rfftn, rfftn_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.);
+  m.def("rfft_scipy",&rfft_scipy, "a"_a, "axis"_a, "fct"_a=1.,
+    "inplace"_a=false);
+  m.def("irfft_scipy",&irfft_scipy, "a"_a, "axis"_a, "fct"_a=1.,
+    "inplace"_a=false);
   m.def("irfftn",&irfftn, irfftn_DS, "a"_a, "axes"_a=py::none(), "lastsize"_a=0,
     "fct"_a=1.);
   m.def("hartley",&hartley, hartley_DS, "a"_a, "axes"_a=py::none(), "fct"_a=1.,
diff --git a/test.py b/test.py
index 181f545..be6f5c7 100644
--- a/test.py
+++ b/test.py
@@ -32,96 +32,92 @@ def test1D(len):
 @pmp("shp", shapes)
 def test_fftn(shp):
     a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
-    vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.fftn(a)))
-    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<1e-15*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<1e-15)
     a=a.astype(np.complex64)
-    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<5e-7*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a), pypocketfft.fftn(a))<5e-7)
 
 @pmp("shp", shapes2D)
 @pmp("axes", ((0,),(1,),(0,1),(1,0)))
 def test_fftn2D(shp, axes):
     a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
-    vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes)))
-    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<1e-15*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<1e-15)
     a=a.astype(np.complex64)
-    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<5e-7*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.fftn(a, axes=axes), pypocketfft.fftn(a, axes=axes))<5e-7)
 
 @pmp("shp", shapes)
 def test_rfftn(shp):
     a=np.random.rand(*shp)-0.5
-    vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.fftn(a)))
-    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<1e-15*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<1e-15)
     a=a.astype(np.float32)
-    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<5e-7*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a), pypocketfft.rfftn(a))<5e-7)
+
+@pmp("shp", shapes)
+def test_rfft_scipy(shp):
+    for i in range(len(shp)):
+        a=np.random.rand(*shp)-0.5
+        assert(_l2error(pyfftw.interfaces.scipy_fftpack.rfft(a,axis=i), pypocketfft.rfft_scipy(a,axis=i))<1e-15)
+        assert(_l2error(pyfftw.interfaces.scipy_fftpack.irfft(a,axis=i), pypocketfft.irfft_scipy(a,axis=i,fct=1./a.shape[i]))<1e-15)
 
 @pmp("shp", shapes2D)
 @pmp("axes", ((0,),(1,),(0,1),(1,0)))
 def test_rfftn2D(shp, axes):
     a=np.random.rand(*shp)-0.5
-    vmax = np.max(np.abs(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes)))
-    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<1e-15*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<1e-15)
     a=a.astype(np.float32)
-    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<5e-7*vmax)
+    assert(_l2error(pyfftw.interfaces.numpy_fft.rfftn(a, axes=axes), pypocketfft.rfftn(a, axes=axes))<5e-7)
 
 @pmp("shp", shapes)
 def test_identity(shp):
     a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
-    vmax = np.max(np.abs(a))
-    assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1.5e-15*vmax)
+    assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<1.5e-15)
     tmp=a.copy()
     assert (pypocketfft.ifftn(pypocketfft.fftn(tmp, inplace=True), fct=1./a.size, inplace=True) is tmp)
-    assert(_l2error(tmp, a)<1.5e-15*vmax)
+    assert(_l2error(tmp, a)<1.5e-15)
     a=a.astype(np.complex64)
-    assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<6e-7*vmax)
+    assert(_l2error(pypocketfft.ifftn(pypocketfft.fftn(a),fct=1./a.size), a)<6e-7)
 
 @pmp("shp", shapes)
 def test_identity_r(shp):
     a=np.random.rand(*shp)-0.5
     b=a.astype(np.float32)
-    vmax = np.max(np.abs(a))
     for ax in range(a.ndim):
         n = a.shape[ax]
-        assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(a,(ax,)),(ax,),lastsize=n,fct=1./n), a)<1e-15*vmax)
-        assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(b,(ax,)),(ax,),lastsize=n,fct=1./n), b)<5e-7*vmax)
+        assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(a,(ax,)),(ax,),lastsize=n,fct=1./n), a)<1e-15)
+        assert(_l2error(pypocketfft.irfftn(pypocketfft.rfftn(b,(ax,)),(ax,),lastsize=n,fct=1./n), b)<5e-7)
 
 @pmp("shp", shapes)
 def test_identity_r2(shp):
     a=np.random.rand(*shp)-0.5 + 1j*np.random.rand(*shp)-0.5j
     a=pypocketfft.rfftn(pypocketfft.irfftn(a))
     fct = 1./pypocketfft.irfftn(a).size
-    vmax = np.max(np.abs(a))
-    assert(_l2error(pypocketfft.rfftn(pypocketfft.irfftn(a),fct=fct), a)<1e-15*vmax)
+    assert(_l2error(pypocketfft.rfftn(pypocketfft.irfftn(a),fct=fct), a)<1e-15)
 
 @pmp("shp", shapes2D+shapes3D)
 def test_hartley2(shp):
     a=np.random.rand(*shp)-0.5
     v1 = pypocketfft.hartley2(a)
     v2 = pypocketfft.fftn(a.astype(np.complex128))
-    vmax = np.max(np.abs(v1))
     v2 = v2.real+v2.imag
-    assert(_l2error(v1, v2)<1e-15*vmax)
+    assert(_l2error(v1, v2)<1e-15)
 
 @pmp("shp", shapes)
 def test_hartley_identity(shp):
     a=np.random.rand(*shp)-0.5
     v1 = pypocketfft.hartley(pypocketfft.hartley(a))/a.size
-    vmax = np.max(np.abs(a))
-    assert(_l2error(a, v1)<1e-15*vmax)
+    assert(_l2error(a, v1)<1e-15)
 
 @pmp("shp", shapes)
 def test_hartley2_identity(shp):
     a=np.random.rand(*shp)-0.5
     v1 = pypocketfft.hartley2(pypocketfft.hartley2(a))/a.size
-    vmax = np.max(np.abs(a))
-    assert(_l2error(a, v1)<1e-15*vmax)
+    assert(_l2error(a, v1)<1e-15)
     v1 = a.copy()
     assert (pypocketfft.hartley2(pypocketfft.hartley2(v1, inplace=True), fct=1./a.size, inplace=True) is v1)
-    assert(_l2error(a, v1)<1e-15*vmax)
+    assert(_l2error(a, v1)<1e-15)
 
 @pmp("shp", shapes2D)
 @pmp("axes", ((0,),(1,),(0,1),(1,0)))
 def test_hartley2_2D(shp, axes):
     a=np.random.rand(*shp)-0.5
     fct = 1./np.prod(np.take(shp,axes))
-    vmax = np.max(np.abs(a))
-    assert(_l2error(pypocketfft.hartley2(pypocketfft.hartley2(a, axes=axes), axes=axes, fct=fct),a)<1e-15*vmax)
+    assert(_l2error(pypocketfft.hartley2(pypocketfft.hartley2(a, axes=axes), axes=axes, fct=fct),a)<1e-15)
-- 
GitLab


From 5f09d105e716d5f77f0cf79c527f1052b9b2671c Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Sat, 4 May 2019 12:50:27 +0200
Subject: [PATCH 5/5] small type fixes

---
 pocketfft.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pocketfft.cc b/pocketfft.cc
index 1c65916..08dd1be 100644
--- a/pocketfft.cc
+++ b/pocketfft.cc
@@ -2311,7 +2311,7 @@ template<typename T> NOINLINE void pocketfft_general_c2r(
   }
 
 template<typename T> NOINLINE void pocketfft_general_r(
-  const ndarr<T> &in, ndarr<T> &out, int axis, bool forward, T fct)
+  const ndarr<T> &in, ndarr<T> &out, size_t axis, bool forward, T fct)
   {
   auto storage = alloc_tmp<T>(in.shape(), in.shape(axis), sizeof(T));
 
@@ -2460,7 +2460,7 @@ py::array rfftn(const py::array &in, py::object axes_, double fct)
                               : rfftn_internal<float> (in, axes_, fct);
   }
 template<typename T> py::array xrfft_scipy(const py::array &in,
-  int axis, double fct, bool inplace, bool fwd)
+  size_t axis, double fct, bool inplace, bool fwd)
   {
   auto dims(copy_shape(in));
   py::array res = inplace ? in : py::array_t<T>(dims);
@@ -2469,13 +2469,14 @@ template<typename T> py::array xrfft_scipy(const py::array &in,
   pocketfft_general_r<T>(ain, aout, axis, fwd, fct);
   return res;
   }
-py::array rfft_scipy(const py::array &in, int axis, double fct, bool inplace)
+py::array rfft_scipy(const py::array &in, size_t axis, double fct, bool inplace)
   {
   return tcheck(in, f64, f32) ?
     xrfft_scipy<double>(in, axis, fct, inplace, true) :
     xrfft_scipy<float> (in, axis, fct, inplace, true);
   }
-py::array irfft_scipy(const py::array &in, int axis, double fct, bool inplace)
+py::array irfft_scipy(const py::array &in, size_t axis, double fct,
+  bool inplace)
   {
   return tcheck(in, f64, f32) ?
     xrfft_scipy<double>(in, axis, fct, inplace, false) :
-- 
GitLab