pocketfft_hdronly.h 104 KB
Newer Older
1
/*
2 3 4
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
5 6 7 8 9 10 11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
38 39 40 41

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

42
#ifndef __cplusplus
43
#error This file is C++ and requires a C++ compiler.
44 45
#endif

Martin Reinecke's avatar
sync  
Martin Reinecke committed
46
#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47
#error This file requires at least C++11 support.
48 49
#endif

50 51 52 53
#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
#endif

54 55 56 57 58 59
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
Martin Reinecke's avatar
sync  
Martin Reinecke committed
60
#include <complex>
61 62
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
Martin Reinecke's avatar
Martin Reinecke committed
63
#include <mutex>
64
#endif
Martin Reinecke's avatar
Martin Reinecke committed
65 66 67 68
#ifdef POCKETFFT_OPENMP
#include <omp.h>
#endif

69

Martin Reinecke's avatar
Martin Reinecke committed
70
#if defined(__GNUC__)
71 72
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
73
#elif defined(_MSC_VER)
74 75
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
76
#else
77 78
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
79 80 81 82 83 84
#endif

namespace pocketfft {

namespace detail {

Martin Reinecke's avatar
Martin Reinecke committed
85 86
using namespace std;

Martin Reinecke's avatar
Martin Reinecke committed
87 88 89 90 91 92
using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>;

constexpr bool FORWARD  = true,
               BACKWARD = false;

Martin Reinecke's avatar
sync  
Martin Reinecke committed
93 94 95 96
// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
97
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
sync  
Martin Reinecke committed
98 99 100 101 102 103 104 105 106 107 108
#elif defined(__clang__)
#if __clang__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
109 110
template<typename T> struct VLEN { static constexpr size_t val=1; };

Martin Reinecke's avatar
Martin Reinecke committed
111 112
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
Martin Reinecke's avatar
Martin Reinecke committed
113 114
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
Martin Reinecke's avatar
Martin Reinecke committed
115
#elif (defined(__AVX__))
Martin Reinecke's avatar
Martin Reinecke committed
116 117
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
Martin Reinecke's avatar
Martin Reinecke committed
118
#elif (defined(__SSE2__))
Martin Reinecke's avatar
Martin Reinecke committed
119 120
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
sync  
Martin Reinecke committed
121
#elif (defined(__VSX__))
Martin Reinecke's avatar
Martin Reinecke committed
122 123
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
Martin Reinecke committed
124 125 126 127 128
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
129
template<typename T> class arr
130 131 132 133 134
  {
  private:
    T *p;
    size_t sz;

Martin Reinecke's avatar
Martin Reinecke committed
135
#if defined(POCKETFFT_NO_VECTORS)
136 137 138 139 140
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
141 142 143 144 145 146 147 148
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
149 150
      void *res = aligned_alloc(64,num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
151 152 153 154
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
155
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
156 157 158
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
159 160 161 162 163 164
      void *ptr = malloc(num*sizeof(T)+64);
      if (!ptr) throw bad_alloc();
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
165 166
      }
    static void dealloc(T *ptr)
167
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
168
#endif
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
Martin Reinecke's avatar
Martin Reinecke committed
201 202
  void Split(T &r_, T &i_) const { r_=r; i_=i; }
  void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; }
203 204 205 206
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
207 208 209 210 211 212 213
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
Martin Reinecke's avatar
Martin Reinecke committed
214 215 216 217
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    { r-=other.r; i-=other.i; return *this; }
218 219 220
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
Martin Reinecke's avatar
Martin Reinecke committed
221 222 223 224 225 226
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r+other.r, i+other.i}; }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r-other.r, i-other.i}; }
227 228 229
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
230
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
231 232
    -> cmplx<decltype(r+other.r)>
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
233
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
234 235
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
236 237
    }
};
Martin Reinecke's avatar
Martin Reinecke committed
238 239
template<typename T> inline void PM(T &a, T &b, T c, T d)
  { a=c+d; b=c-d; }
Martin Reinecke's avatar
Martin Reinecke committed
240 241
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a+=b; b=t-b; }
242 243
template<typename T> inline void MPINPLACE(T &a, T &b)
  { T t = a; a-=b; b=t+b; }
244 245
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }
Martin Reinecke's avatar
Martin Reinecke committed
246 247 248 249 250
template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
  {
  res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
            : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
  }
251 252 253

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
254 255
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
256 257 258 259 260 261 262 263

//
// twiddle factor section
//

template<typename T> class sincos_2pibyn
  {
  private:
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
264
    using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
265 266
    arr<T> data;

267
    void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res)
268
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
269 270
      if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
271 272 273
        constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L);
        auto s = sin(pi*a_);
        res[1] = s;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
274 275 276
        res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
        return;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
277 278 279
      // adapted from https://stackoverflow.com/questions/42792939/
      // CAUTION: this function only works for arguments in the range
      //          [-0.25; 0.25]!
Martin Reinecke's avatar
sync  
Martin Reinecke committed
280
      double a = double(a_);
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
      double s = a * a;
      /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
      double r =     -1.0369917389758117e-4;
      r = fma (r, s,  1.9294935641298806e-3);
      r = fma (r, s, -2.5806887942825395e-2);
      r = fma (r, s,  2.3533063028328211e-1);
      r = fma (r, s, -1.3352627688538006e+0);
      r = fma (r, s,  4.0587121264167623e+0);
      r = fma (r, s, -4.9348022005446790e+0);
      double c = r*s;
      /* Approximate sin(pi*x) for x in [-0.25,0.25] */
      r =             4.6151442520157035e-4;
      r = fma (r, s, -7.3700183130883555e-3);
      r = fma (r, s,  8.2145868949323936e-2);
      r = fma (r, s, -5.9926452893214921e-1);
      r = fma (r, s,  2.5501640398732688e+0);
      r = fma (r, s, -5.1677127800499516e+0);
      s = s * a;
      r = r * s;
      s = fma (a, 3.1415926535897931e+0, r);
      res[0] = c;
      res[1] = s;
      }

305 306
    POCKETFFT_NOINLINE void calc_first_octant(size_t den,
      T * POCKETFFT_RESTRICT res)
307 308 309 310 311
      {
      size_t n = (den+4)>>3;
      if (n==0) return;
      res[0]=1.; res[1]=0.;
      if (n==1) return;
Martin Reinecke's avatar
Martin Reinecke committed
312
      size_t l1 = size_t(sqrt(n));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
313
      arr<Thigh> tmp(2*l1);
314 315
      for (size_t i=1; i<l1; ++i)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
316 317 318
        my_sincosm1pi(Thigh(2*i)/Thigh(den),&tmp[2*i]);
        res[2*i  ] = T(tmp[2*i]+1);
        res[2*i+1] = T(tmp[2*i+1]);
319 320 321 322
        }
      size_t start=l1;
      while(start<n)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
323
        Thigh cs[2];
Martin Reinecke's avatar
sync  
Martin Reinecke committed
324 325 326
        my_sincosm1pi((Thigh(2*start))/Thigh(den),cs);
        res[2*start] = T(cs[0]+1);
        res[2*start+1] = T(cs[1]);
327 328 329 330
        size_t end = l1;
        if (start+end>n) end = n-start;
        for (size_t i=1; i<end; ++i)
          {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
331
          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
Martin Reinecke's avatar
sync  
Martin Reinecke committed
332 333
          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
334 335 336 337 338
          }
        start += l1;
        }
      }

339
    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
340
      {
341
      T * POCKETFFT_RESTRICT p = res+n;
342 343 344 345 346
      calc_first_octant(n<<1, p);
      size_t ndone=(n+2)>>2;
      size_t i=0, idx1=0, idx2=2*ndone-2;
      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
        {
Martin Reinecke's avatar
Martin Reinecke committed
347 348
        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
349 350
        }
      if (i!=ndone)
Martin Reinecke's avatar
Martin Reinecke committed
351
        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
352 353
      }

354
    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
355
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
356
      int ndone=int(n+1)>>1;
357 358
      T * p = res+n-1;
      calc_first_octant(n<<2, p);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
359
      int i4=0, in=int(n), i=0;
360
      for (; i4<=in-i4; ++i, i4+=4) // octant 0
Martin Reinecke's avatar
Martin Reinecke committed
361
        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
362
      for (; i4-in <= 0; ++i, i4+=4) // octant 1
Martin Reinecke's avatar
sync  
Martin Reinecke committed
363
        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
364
      for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
Martin Reinecke's avatar
sync  
Martin Reinecke committed
365
        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
366
      for (; i<ndone; ++i, i4+=4) // octant 3
Martin Reinecke's avatar
sync  
Martin Reinecke committed
367
        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
368 369
      }

370
    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
371
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
372
      constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
373 374 375 376
      size_t quart = n>>2;
      if ((n&7)==0)
        res[quart] = res[quart+1] = hsqt2;
      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
377
        { res[j] = res[i+1]; res[j+1] = res[i]; }
378 379
      }

380
    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
381 382 383 384
      {
      size_t half = n>>1;
      if ((n&3)==0)
        for (size_t i=0; i<half; i+=2)
Martin Reinecke's avatar
Martin Reinecke committed
385
          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
386 387
      else
        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
388
          { res[j] = -res[i]; res[j+1] = res[i+1]; }
389 390
      }

391
    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
392 393 394 395 396 397
      {
      if ((n&1)==0)
        for (size_t i=0; i<n; ++i)
          res[i+n] = -res[i];
      else
        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
398
          { res[j] = res[i]; res[j+1] = -res[i+1]; }
399 400
      }

401
    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
      {
      if ((n&3)==0)
        {
        calc_first_octant(n, res);
        fill_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else if ((n&1)==0)
        {
        calc_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else
        calc_first_half(n, res);
      }

  public:
419
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
420 421 422 423 424 425 426 427 428 429 430 431
      : data(2*n)
      {
      sincos_2pibyn_half(n, data.data());
      if (!half) fill_second_half(n, data.data());
      }

    T operator[](size_t idx) const { return data[idx]; }
    const T *rdata() const { return data; }
    const cmplx<T> *cdata() const
      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
  };

Martin Reinecke's avatar
Martin Reinecke committed
432
struct util // hack to avoid duplicate symbols
433
  {
434
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
435
    {
Martin Reinecke's avatar
Martin Reinecke committed
436 437 438
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
439 440 441
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
442 443 444 445
    if (n>1) res=n;
    return res;
    }

446
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
447
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
448
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
Martin Reinecke's avatar
Martin Reinecke committed
449 450 451 452
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
453 454 455 456 457 458
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
459 460
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
461 462
    }

Martin Reinecke's avatar
Martin Reinecke committed
463
  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
464
  static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
465 466 467 468
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515
    for (size_t f11=1; f11<bestfac; f11*=11)
      for (size_t f117=f11; f117<bestfac; f117*=7)
        for (size_t f1175=f117; f1175<bestfac; f1175*=5)
          {
          size_t x=f1175;
          while (x<n) x*=2;
          for (;;)
            {
            if (x<n)
              x*=3;
            else if (x>n)
              {
              if (x<bestfac) bestfac=x;
              if (x&1) break;
              x>>=1;
              }
            else
              return n;
            }
          }
    return bestfac;
    }

  /* returns the smallest composite of 2, 3, 5 which is >= n */
  static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
    {
    if (n<=6) return n;

    size_t bestfac=2*n;
    for (size_t f5=1; f5<bestfac; f5*=5)
      {
      size_t x = f5;
      while (x<n) x *= 2;
      for (;;)
        {
        if (x<n)
          x*=3;
        else if (x>n)
          {
          if (x<bestfac) bestfac=x;
          if (x&1) break;
          x>>=1;
          }
        else
          return n;
        }
      }
Martin Reinecke's avatar
Martin Reinecke committed
516 517
    return bestfac;
    }
518

Martin Reinecke's avatar
Martin Reinecke committed
519 520 521 522 523 524 525
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }
Martin Reinecke's avatar
Martin Reinecke committed
526

527
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
528 529 530 531 532 533
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
    if (ndim<1) throw runtime_error("ndim must be >= 1");
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
      throw runtime_error("stride dimension mismatch");
Martin Reinecke's avatar
sync  
Martin Reinecke committed
534 535
    if (inplace && (stride_in!=stride_out))
      throw runtime_error("stride mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
536 537
    }

538
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
539 540 541 542 543 544 545 546
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
547 548
      if (ax>=ndim) throw invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
549 550 551
      }
    }

552
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
553 554 555 556
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
557
    if (axis>=shape.size()) throw invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
558
    }
Martin Reinecke's avatar
Martin Reinecke committed
559 560

#ifdef POCKETFFT_OPENMP
Martin Reinecke's avatar
sync  
Martin Reinecke committed
561 562
    static size_t nthreads() { return size_t(omp_get_num_threads()); }
    static size_t thread_num() { return size_t(omp_get_thread_num()); }
Martin Reinecke's avatar
Martin Reinecke committed
563 564 565 566
    static size_t thread_count (size_t nthreads, const shape_t &shape,
      size_t axis)
      {
      if (nthreads==1) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
567
      if (prod(shape) < 20*shape[axis]) return 1;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
568
      return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads;
Martin Reinecke's avatar
Martin Reinecke committed
569
      }
Martin Reinecke's avatar
Martin Reinecke committed
570
#else
Martin Reinecke's avatar
Martin Reinecke committed
571 572
    static constexpr size_t nthreads() { return 1; }
    static constexpr size_t thread_num() { return 0; }
Martin Reinecke's avatar
Martin Reinecke committed
573
#endif
Martin Reinecke's avatar
Martin Reinecke committed
574
  };
575 576 577 578 579 580 581 582 583 584 585 586 587 588

//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

Martin Reinecke's avatar
Martin Reinecke committed
589
    size_t length;
590
    arr<cmplx<T0>> mem;
Martin Reinecke's avatar
Martin Reinecke committed
591
    vector<fctdata> fact;
592 593

    void add_factor(size_t factor)
Martin Reinecke's avatar
Martin Reinecke committed
594
      { fact.push_back({factor, nullptr, nullptr}); }
595

Martin Reinecke's avatar
Martin Reinecke committed
596
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
597
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
598
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
599 600 601
  {
  constexpr size_t cdim=2;

602 603
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
604
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
605 606 607 608
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

609 610 611 612 613 614 615 616 617 618 619 620 621 622
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
623
        special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
624 625 626 627
        }
      }
  }

628
#define POCKETFFT_PREP3(idx) \
629
        T t0 = CC(idx,0,k), t1, t2; \
Martin Reinecke's avatar
Martin Reinecke committed
630
        PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
631
        CH(idx,k,0)=t0+t1;
632
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
633
        { \
Martin Reinecke's avatar
Martin Reinecke committed
634 635 636
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
637
        }
638
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
639
        { \
Martin Reinecke's avatar
Martin Reinecke committed
640 641 642 643
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
644
        }
Martin Reinecke's avatar
Martin Reinecke committed
645
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
646
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
647
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
648 649
  {
  constexpr size_t cdim=3;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
650
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
651
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
652

653 654
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
655
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
656 657 658 659
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

660 661 662
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
663 664
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
665 666 667 668 669
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
670 671
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
672 673 674
      }
      for (size_t i=1; i<ido; ++i)
        {
675 676
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
677 678 679 680
        }
      }
  }

681 682 683
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
684

Martin Reinecke's avatar
Martin Reinecke committed
685
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
686
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
687
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
688 689 690
  {
  constexpr size_t cdim=4;

691 692
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
693
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
694 695 696 697
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

698 699 700 701
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
Martin Reinecke's avatar
Martin Reinecke committed
702 703
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
704
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
705 706
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
707 708 709 710 711 712
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
Martin Reinecke's avatar
Martin Reinecke committed
713 714
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
715
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
716 717
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
718 719 720
      }
      for (size_t i=1; i<ido; ++i)
        {
Martin Reinecke's avatar
Martin Reinecke committed
721
        T t1, t2, t3, t4;
722
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
Martin Reinecke's avatar
Martin Reinecke committed
723 724
        PM(t2,t1,cc0,cc2);
        PM(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
725
        ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
726 727 728 729
        CH(i,k,0) = t2+t3;
        special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
        special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
        special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
730 731 732 733
        }
      }
  }

734
#define POCKETFFT_PREP5(idx) \
735
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
Martin Reinecke's avatar
Martin Reinecke committed
736 737
        PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
738 739 740
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

741
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
742 743 744 745 746 747
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
Martin Reinecke's avatar
Martin Reinecke committed
748
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
749 750
        }

751
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
752 753 754 755 756 757
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
Martin Reinecke's avatar
Martin Reinecke committed
758 759
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
760
        }
Martin Reinecke's avatar
Martin Reinecke committed
761
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
762
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
763
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
764 765
  {
  constexpr size_t cdim=5;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
766
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
767
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
768
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
769
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
770

771 772
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
773
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
774 775 776 777
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

778 779 780
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
781 782 783
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
784 785 786 787 788
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
789 790 791
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
792 793 794
      }
      for (size_t i=1; i<ido; ++i)
        {
795 796 797
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
798 799 800 801
        }
      }
  }

802 803 804
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
805

806
#define POCKETFFT_PREP7(idx) \
807
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
Martin Reinecke's avatar
Martin Reinecke committed
808 809 810
        PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
811 812 813
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

814
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
815 816 817 818 819 820
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
Martin Reinecke's avatar
Martin Reinecke committed
821
        PM(out1,out2,ca,cb); \
822
        }
823 824 825
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
826 827
        { \
        T da,db; \
828
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
829 830
        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
831 832
        }

Martin Reinecke's avatar
Martin Reinecke committed
833
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
834
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
835
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
836 837
  {
  constexpr size_t cdim=7;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
838
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
839
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
840
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
841
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
842
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
843
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
844

845 846
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
847
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
848 849 850 851
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

852 853 854
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
855 856 857 858
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
859 860 861 862 863
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
864 865 866 867
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
868 869 870
      }
      for (size_t i=1; i<ido; ++i)
        {
871 872 873 874
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
875 876 877 878
        }
      }
  }

879 880 881 882
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
883

Martin Reinecke's avatar
Martin Reinecke committed
884
template <bool fwd, typename T> void ROTX45(T &a) const
Martin Reinecke's avatar
sync  
Martin Reinecke committed
885 886
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
887
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
888
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
889 890
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
891
  }
Martin Reinecke's avatar
Martin Reinecke committed
892
template <bool fwd, typename T> void ROTX135(T &a) const
Martin Reinecke's avatar
sync  
Martin Reinecke committed
893 894
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
895
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
896
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
897 898
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
899 900
  }

Martin Reinecke's avatar
Martin Reinecke committed
901
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
902
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
903
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
Martin Reinecke's avatar
sync  
Martin Reinecke committed
904 905 906
  {
  constexpr size_t cdim=8;

907 908
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
909
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
910 911 912 913
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
sync  
Martin Reinecke committed
914 915 916 917
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
Martin Reinecke's avatar
Martin Reinecke committed
918 919
      PM(a1,a5,CC(0,1,k),CC(0,5,k));
      PM(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
920
      PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
921
      ROTX90<fwd>(a3);
Martin Reinecke's avatar
Martin Reinecke committed
922 923

      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
924
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
925 926
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
927

Martin Reinecke's avatar
Martin Reinecke committed
928 929 930 931
      PM(a0,a4,CC(0,0,k),CC(0,4,k));
      PM(a2,a6,CC(0,2,k),CC(0,6,k));
      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
Martin Reinecke's avatar
Martin Reinecke committed
932
      ROTX90<fwd>(a6);
Martin Reinecke's avatar
Martin Reinecke committed
933 934
      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
935 936 937
      }
  else
    for (size_t k=0; k<l1; ++k)
Martin Reinecke's avatar
Martin Reinecke committed
938
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
939 940
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
Martin Reinecke's avatar
Martin Reinecke committed
941 942
      PM(a1,a5,CC(0,1,k),CC(0,5,k));
      PM(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
943
      PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
944
      ROTX90<fwd>(a3);
Martin Reinecke's avatar
Martin Reinecke committed
945 946

      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
947
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
948 949
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
950

Martin Reinecke's avatar
Martin Reinecke committed
951 952 953 954
      PM(a0,a4,CC(0,0,k),CC(0,4,k));
      PM(a2,a6,CC(0,2,k),CC(0,6,k));
      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
Martin Reinecke's avatar
Martin Reinecke committed
955
      ROTX90<fwd>(a6);
Martin Reinecke's avatar
Martin Reinecke committed
956 957
      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
Martin Reinecke's avatar
Martin Reinecke committed
958
      }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
959 960 961
      for (size_t i=1; i<ido; ++i)
        {
        T a0, a1, a2, a3, a4, a5, a6, a7;
Martin Reinecke's avatar
Martin Reinecke committed
962 963
        PM(a1,a5,CC(i,1,k),CC(i,5,k));
        PM(a3,a7,CC(i,3,k),CC(i,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
964
        ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
965
        PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
966
        ROTX90<fwd>(a3);
Martin Reinecke's avatar
sync