pocketfft_hdronly.h 107 KB
Newer Older
Martin Reinecke's avatar
Martin Reinecke committed
1
2
3
4
/*
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
Martin Reinecke's avatar
Martin Reinecke committed
5
6
7
8
9
10
11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
Martin Reinecke's avatar
Martin Reinecke committed
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

#ifndef __cplusplus
Martin Reinecke's avatar
Martin Reinecke committed
43
#error This file is C++ and requires a C++ compiler.
Martin Reinecke's avatar
Martin Reinecke committed
44
45
46
#endif

#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
Martin Reinecke's avatar
Martin Reinecke committed
47
48
49
50
51
#error This file requires at least C++11 support.
#endif

#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
Martin Reinecke's avatar
Martin Reinecke committed
52
53
54
55
56
57
58
59
60
#endif

#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
#include <complex>
Martin Reinecke's avatar
Martin Reinecke committed
61
62
63
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
#include <mutex>
Martin Reinecke's avatar
Martin Reinecke committed
64
65
66
67
68
69
70
#endif
#ifdef POCKETFFT_OPENMP
#include <omp.h>
#endif


#if defined(__GNUC__)
Martin Reinecke's avatar
Martin Reinecke committed
71
72
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
73
#elif defined(_MSC_VER)
Martin Reinecke's avatar
Martin Reinecke committed
74
75
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
Martin Reinecke's avatar
Martin Reinecke committed
76
#else
Martin Reinecke's avatar
Martin Reinecke committed
77
78
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
Martin Reinecke's avatar
Martin Reinecke committed
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#endif

namespace pocketfft {

namespace detail {

using namespace std;

using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>;

constexpr bool FORWARD  = true,
               BACKWARD = false;

// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
Martin Reinecke's avatar
Martin Reinecke committed
97
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
Martin Reinecke committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#elif defined(__clang__)
#if __clang__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

template<typename T> struct VLEN { static constexpr size_t val=1; };

#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
#elif (defined(__AVX__))
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
#elif (defined(__SSE2__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#elif (defined(__VSX__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

template<typename T> class arr
  {
  private:
    T *p;
    size_t sz;

#if defined(POCKETFFT_NO_VECTORS)
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
      if (!res) throw bad_alloc();
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = aligned_alloc(64,num*sizeof(T));
      if (!res) throw bad_alloc();
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
Martin Reinecke's avatar
Martin Reinecke committed
155
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
156
157
158
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
Martin Reinecke's avatar
Martin Reinecke committed
159
160
161
162
163
164
      void *ptr = malloc(num*sizeof(T)+64);
      if (!ptr) throw bad_alloc();
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
165
166
      }
    static void dealloc(T *ptr)
Martin Reinecke's avatar
Martin Reinecke committed
167
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#endif

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
Martin Reinecke's avatar
Martin Reinecke committed
205
206
207
208
209
210
211
212
213
214
215
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    { r-=other.r; i-=other.i; return *this; }
Martin Reinecke's avatar
Martin Reinecke committed
216
217
218
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
Martin Reinecke's avatar
Martin Reinecke committed
219
220
221
222
223
224
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r+other.r, i+other.i}; }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r-other.r, i-other.i}; }
Martin Reinecke's avatar
Martin Reinecke committed
225
226
227
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
228
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
Martin Reinecke's avatar
Martin Reinecke committed
229
230
231
    -> cmplx<decltype(r+other.r)>
    {
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
232
233
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
Martin Reinecke's avatar
Martin Reinecke committed
234
235
    }
};
Martin Reinecke's avatar
Martin Reinecke committed
236
237
238
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a+=b; b=t-b; }
template<typename T> void PMC(T &a, T &b, const T &c, const T &d)
Martin Reinecke's avatar
Martin Reinecke committed
239
240
241
242
243
244
  { a = c+d; b = c-d; }
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
245
246
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
247
248
249
250
251
252
253
254

//
// twiddle factor section
//

template<typename T> class sincos_2pibyn
  {
  private:
Martin Reinecke's avatar
Martin Reinecke committed
255
    using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
Martin Reinecke's avatar
Martin Reinecke committed
256
257
    arr<T> data;

Martin Reinecke's avatar
Martin Reinecke committed
258
    void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
      {
      if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double
        {
        constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L);
        auto s = sin(pi*a_);
        res[1] = s;
        res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
        return;
        }
      // adapted from https://stackoverflow.com/questions/42792939/
      // CAUTION: this function only works for arguments in the range
      //          [-0.25; 0.25]!
      double a = double(a_);
      double s = a * a;
      /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
      double r =     -1.0369917389758117e-4;
      r = fma (r, s,  1.9294935641298806e-3);
      r = fma (r, s, -2.5806887942825395e-2);
      r = fma (r, s,  2.3533063028328211e-1);
      r = fma (r, s, -1.3352627688538006e+0);
      r = fma (r, s,  4.0587121264167623e+0);
      r = fma (r, s, -4.9348022005446790e+0);
      double c = r*s;
      /* Approximate sin(pi*x) for x in [-0.25,0.25] */
      r =             4.6151442520157035e-4;
      r = fma (r, s, -7.3700183130883555e-3);
      r = fma (r, s,  8.2145868949323936e-2);
      r = fma (r, s, -5.9926452893214921e-1);
      r = fma (r, s,  2.5501640398732688e+0);
      r = fma (r, s, -5.1677127800499516e+0);
      s = s * a;
      r = r * s;
      s = fma (a, 3.1415926535897931e+0, r);
      res[0] = c;
      res[1] = s;
      }

Martin Reinecke's avatar
Martin Reinecke committed
296
297
    POCKETFFT_NOINLINE void calc_first_octant(size_t den,
      T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
      {
      size_t n = (den+4)>>3;
      if (n==0) return;
      res[0]=1.; res[1]=0.;
      if (n==1) return;
      size_t l1 = size_t(sqrt(n));
      arr<Thigh> tmp(2*l1);
      for (size_t i=1; i<l1; ++i)
        {
        my_sincosm1pi(Thigh(2*i)/Thigh(den),&tmp[2*i]);
        res[2*i  ] = T(tmp[2*i]+1);
        res[2*i+1] = T(tmp[2*i+1]);
        }
      size_t start=l1;
      while(start<n)
        {
        Thigh cs[2];
        my_sincosm1pi((Thigh(2*start))/Thigh(den),cs);
        res[2*start] = T(cs[0]+1);
        res[2*start+1] = T(cs[1]);
        size_t end = l1;
        if (start+end>n) end = n-start;
        for (size_t i=1; i<end; ++i)
          {
          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
          }
        start += l1;
        }
      }

Martin Reinecke's avatar
Martin Reinecke committed
330
    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
331
      {
Martin Reinecke's avatar
Martin Reinecke committed
332
      T * POCKETFFT_RESTRICT p = res+n;
Martin Reinecke's avatar
Martin Reinecke committed
333
334
335
336
337
338
339
340
341
342
343
344
      calc_first_octant(n<<1, p);
      size_t ndone=(n+2)>>2;
      size_t i=0, idx1=0, idx2=2*ndone-2;
      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
        {
        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
        }
      if (i!=ndone)
        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
      }

Martin Reinecke's avatar
Martin Reinecke committed
345
    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
      {
      int ndone=int(n+1)>>1;
      T * p = res+n-1;
      calc_first_octant(n<<2, p);
      int i4=0, in=int(n), i=0;
      for (; i4<=in-i4; ++i, i4+=4) // octant 0
        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
      for (; i4-in <= 0; ++i, i4+=4) // octant 1
        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
      for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
      for (; i<ndone; ++i, i4+=4) // octant 3
        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
      }

Martin Reinecke's avatar
Martin Reinecke committed
361
    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
362
363
364
365
366
367
368
369
370
      {
      constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
      size_t quart = n>>2;
      if ((n&7)==0)
        res[quart] = res[quart+1] = hsqt2;
      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
        { res[j] = res[i+1]; res[j+1] = res[i]; }
      }

Martin Reinecke's avatar
Martin Reinecke committed
371
    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
372
373
374
375
376
377
378
379
380
381
      {
      size_t half = n>>1;
      if ((n&3)==0)
        for (size_t i=0; i<half; i+=2)
          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
      else
        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
          { res[j] = -res[i]; res[j+1] = res[i+1]; }
      }

Martin Reinecke's avatar
Martin Reinecke committed
382
    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
383
384
385
386
387
388
389
390
391
      {
      if ((n&1)==0)
        for (size_t i=0; i<n; ++i)
          res[i+n] = -res[i];
      else
        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
          { res[j] = res[i]; res[j+1] = -res[i+1]; }
      }

Martin Reinecke's avatar
Martin Reinecke committed
392
    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
Martin Reinecke's avatar
Martin Reinecke committed
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
      {
      if ((n&3)==0)
        {
        calc_first_octant(n, res);
        fill_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else if ((n&1)==0)
        {
        calc_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else
        calc_first_half(n, res);
      }

  public:
Martin Reinecke's avatar
Martin Reinecke committed
410
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
Martin Reinecke's avatar
Martin Reinecke committed
411
412
413
414
415
416
417
418
419
420
421
422
423
424
      : data(2*n)
      {
      sincos_2pibyn_half(n, data.data());
      if (!half) fill_second_half(n, data.data());
      }

    T operator[](size_t idx) const { return data[idx]; }
    const T *rdata() const { return data; }
    const cmplx<T> *cdata() const
      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
  };

struct util // hack to avoid duplicate symbols
  {
Martin Reinecke's avatar
Martin Reinecke committed
425
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
426
427
428
429
    {
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
430
431
432
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
433
434
435
436
    if (n>1) res=n;
    return res;
    }

Martin Reinecke's avatar
Martin Reinecke committed
437
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
438
439
440
441
442
443
    {
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
444
445
446
447
448
449
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
Martin Reinecke committed
450
451
452
453
454
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
    }

  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
Martin Reinecke's avatar
Martin Reinecke committed
455
  static POCKETFFT_NOINLINE size_t good_size(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
    for (size_t f2=1; f2<bestfac; f2*=2)
      for (size_t f23=f2; f23<bestfac; f23*=3)
        for (size_t f235=f23; f235<bestfac; f235*=5)
          for (size_t f2357=f235; f2357<bestfac; f2357*=7)
            for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
              if (f235711>=n) bestfac=f235711;
    return bestfac;
    }

  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }

Martin Reinecke's avatar
Martin Reinecke committed
477
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
478
479
480
481
482
483
484
485
486
487
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
    if (ndim<1) throw runtime_error("ndim must be >= 1");
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
      throw runtime_error("stride dimension mismatch");
    if (inplace && (stride_in!=stride_out))
      throw runtime_error("stride mismatch");
    }

Martin Reinecke's avatar
Martin Reinecke committed
488
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
489
490
491
492
493
494
495
496
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
Martin Reinecke's avatar
Martin Reinecke committed
497
498
      if (ax>=ndim) throw invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
499
500
501
      }
    }

Martin Reinecke's avatar
Martin Reinecke committed
502
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
503
504
505
506
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
Martin Reinecke's avatar
Martin Reinecke committed
507
    if (axis>=shape.size()) throw invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
508
509
510
511
512
513
514
515
516
    }

#ifdef POCKETFFT_OPENMP
    static size_t nthreads() { return size_t(omp_get_num_threads()); }
    static size_t thread_num() { return size_t(omp_get_thread_num()); }
    static size_t thread_count (size_t nthreads, const shape_t &shape,
      size_t axis)
      {
      if (nthreads==1) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
517
      if (prod(shape) < 20*shape[axis]) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
518
519
520
      return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads;
      }
#else
Martin Reinecke's avatar
Martin Reinecke committed
521
522
    static constexpr size_t nthreads() { return 1; }
    static constexpr size_t thread_num() { return 0; }
Martin Reinecke's avatar
Martin Reinecke committed
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
#endif
  };

//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

    size_t length;
    arr<cmplx<T0>> mem;
    vector<fctdata> fact;

    void add_factor(size_t factor)
      { fact.push_back({factor, nullptr, nullptr}); }

Martin Reinecke's avatar
Martin Reinecke committed
546
547
548
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
549
550
551
  {
  constexpr size_t cdim=2;

Martin Reinecke's avatar
Martin Reinecke committed
552
553
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
554
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
555
556
557
558
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
Martin Reinecke committed
559
560
561
562
563
564
565
566
567
568
569
570
571
572
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
573
        CH(i,k,1) = (CC(i,0,k)-CC(i,1,k)).template special_mul<fwd>(WA(0,i));
Martin Reinecke's avatar
Martin Reinecke committed
574
575
576
577
        }
      }
  }

Martin Reinecke's avatar
Martin Reinecke committed
578
#define POCKETFFT_PREP3(idx) \
Martin Reinecke's avatar
Martin Reinecke committed
579
580
581
        T t0 = CC(idx,0,k), t1, t2; \
        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
        CH(idx,k,0)=t0+t1;
Martin Reinecke's avatar
Martin Reinecke committed
582
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
Martin Reinecke's avatar
Martin Reinecke committed
583
584
585
586
587
588
        { \
        T ca,cb; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
        }
Martin Reinecke's avatar
Martin Reinecke committed
589
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
Martin Reinecke's avatar
Martin Reinecke committed
590
591
592
593
594
        { \
        T ca,cb,da,db; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
595
596
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
Martin Reinecke's avatar
Martin Reinecke committed
597
        }
Martin Reinecke's avatar
Martin Reinecke committed
598
599
600
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
601
602
603
  {
  constexpr size_t cdim=3;
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
604
605
606
607
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);

  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
608
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
609
610
611
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };
Martin Reinecke's avatar
Martin Reinecke committed
612
613
614
615

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
Martin Reinecke's avatar
Martin Reinecke committed
616
617
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
Martin Reinecke's avatar
Martin Reinecke committed
618
619
620
621
622
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
Martin Reinecke's avatar
Martin Reinecke committed
623
624
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
Martin Reinecke's avatar
Martin Reinecke committed
625
626
627
      }
      for (size_t i=1; i<ido; ++i)
        {
Martin Reinecke's avatar
Martin Reinecke committed
628
629
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
Martin Reinecke's avatar
Martin Reinecke committed
630
631
632
633
        }
      }
  }

Martin Reinecke's avatar
Martin Reinecke committed
634
635
636
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
Martin Reinecke's avatar
Martin Reinecke committed
637

Martin Reinecke's avatar
Martin Reinecke committed
638
639
640
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
641
642
643
  {
  constexpr size_t cdim=4;

Martin Reinecke's avatar
Martin Reinecke committed
644
645
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
646
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
647
648
649
650
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
Martin Reinecke committed
651
652
653
654
655
656
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
657
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
658
659
660
661
662
663
664
665
666
667
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
668
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
669
670
671
672
673
674
675
676
677
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
      for (size_t i=1; i<ido; ++i)
        {
        T c2, c3, c4, t1, t2, t3, t4;
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
        PMC(t2,t1,cc0,cc2);
        PMC(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
678
        ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
679
680
        PMC(CH(i,k,0),c3,t2,t3);
        PMC(c2,c4,t1,t4);
Martin Reinecke's avatar
Martin Reinecke committed
681
682
683
        CH(i,k,1) = c2.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = c3.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = c4.template special_mul<fwd>(WA(2,i));
Martin Reinecke's avatar
Martin Reinecke committed
684
685
686
687
        }
      }
  }

Martin Reinecke's avatar
Martin Reinecke committed
688
#define POCKETFFT_PREP5(idx) \
Martin Reinecke's avatar
Martin Reinecke committed
689
690
691
692
693
694
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

Martin Reinecke's avatar
Martin Reinecke committed
695
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
Martin Reinecke's avatar
Martin Reinecke committed
696
697
698
699
700
701
702
703
704
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \
        }

Martin Reinecke's avatar
Martin Reinecke committed
705
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
Martin Reinecke's avatar
Martin Reinecke committed
706
707
708
709
710
711
712
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
713
714
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
Martin Reinecke's avatar
Martin Reinecke committed
715
        }
Martin Reinecke's avatar
Martin Reinecke committed
716
717
718
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
719
720
721
  {
  constexpr size_t cdim=5;
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
722
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
Martin Reinecke committed
723
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
724
725
726
727
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);

  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
728
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
729
730
731
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };
Martin Reinecke's avatar
Martin Reinecke committed
732
733
734
735

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
Martin Reinecke's avatar
Martin Reinecke committed
736
737
738
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
Martin Reinecke's avatar
Martin Reinecke committed
739
740
741
742
743
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
Martin Reinecke's avatar
Martin Reinecke committed
744
745
746
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
Martin Reinecke's avatar
Martin Reinecke committed
747
748
749
      }
      for (size_t i=1; i<ido; ++i)
        {
Martin Reinecke's avatar
Martin Reinecke committed
750
751
752
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
Martin Reinecke's avatar
Martin Reinecke committed
753
754
755
756
        }
      }
  }

Martin Reinecke's avatar
Martin Reinecke committed
757
758
759
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
Martin Reinecke's avatar
Martin Reinecke committed
760

Martin Reinecke's avatar
Martin Reinecke committed
761
#define POCKETFFT_PREP7(idx) \
Martin Reinecke's avatar
Martin Reinecke committed
762
763
764
765
766
767
768
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

Martin Reinecke's avatar
Martin Reinecke committed
769
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
Martin Reinecke's avatar
Martin Reinecke committed
770
771
772
773
774
775
776
777
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
        PMC(out1,out2,ca,cb); \
        }
Martin Reinecke's avatar
Martin Reinecke committed
778
779
780
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
Martin Reinecke's avatar
Martin Reinecke committed
781
782
        { \
        T da,db; \
Martin Reinecke's avatar
Martin Reinecke committed
783
784
785
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
Martin Reinecke's avatar
Martin Reinecke committed
786
787
        }

Martin Reinecke's avatar
Martin Reinecke committed
788
789
790
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
791
792
793
  {
  constexpr size_t cdim=7;
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
794
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
Martin Reinecke committed
795
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
796
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
Martin Reinecke committed
797
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
798
799
800
801
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);

  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
802
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
803
804
805
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };
Martin Reinecke's avatar
Martin Reinecke committed
806
807
808
809

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
Martin Reinecke's avatar
Martin Reinecke committed
810
811
812
813
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
Martin Reinecke's avatar
Martin Reinecke committed
814
815
816
817
818
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
Martin Reinecke's avatar
Martin Reinecke committed
819
820
821
822
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
Martin Reinecke's avatar
Martin Reinecke committed
823
824
825
      }
      for (size_t i=1; i<ido; ++i)
        {
Martin Reinecke's avatar
Martin Reinecke committed
826
827
828
829
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
Martin Reinecke's avatar
Martin Reinecke committed
830
831
832
833
        }
      }
  }

Martin Reinecke's avatar
Martin Reinecke committed
834
835
836
837
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
Martin Reinecke's avatar
Martin Reinecke committed
838

Martin Reinecke's avatar
Martin Reinecke committed
839
template <bool fwd, typename T> void ROTX45(T &a)
Martin Reinecke's avatar
Martin Reinecke committed
840
841
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
842
  if (fwd)
Martin Reinecke's avatar
Martin Reinecke committed
843
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
844
845
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
846
  }
Martin Reinecke's avatar
Martin Reinecke committed
847
template <bool fwd, typename T> void ROTX135(T &a)
Martin Reinecke's avatar
Martin Reinecke committed
848
849
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
850
  if (fwd)
Martin Reinecke's avatar
Martin Reinecke committed
851
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
852
853
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
854
855
  }

Martin Reinecke's avatar
Martin Reinecke committed
856
857
858
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
859
860
861
  {
  constexpr size_t cdim=8;

Martin Reinecke's avatar
Martin Reinecke committed
862
863
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
864
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
865
866
867
868
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
Martin Reinecke committed
869
870
871
872
873
874
875
876
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
877
878
      ROTX90<fwd>(a6);
      ROTX90<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
879
880
881
882
      PMINPLACE(a0,a2);
      PMINPLACE(a1,a3);
      PMINPLACE(a4,a6);
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
883
884
885
      ROTX45<fwd>(a5);
      ROTX90<fwd>(a3);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
886
887
888
889
890
891
892
893
894
895
896
897
898
      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
      PMC(CH(0,k,3),CH(0,k,7),a6,a7);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
899
900
      ROTX90<fwd>(a6);
      ROTX90<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
901
902
903
904
      PMINPLACE(a0,a2);
      PMINPLACE(a1,a3);
      PMINPLACE(a4,a6);
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
905
906
907
      ROTX45<fwd>(a5);
      ROTX90<fwd>(a3);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
908
909
910
911
912
913
914
915
916
917
918
919
      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
      PMC(CH(0,k,3),CH(0,k,7),a6,a7);

      for (size_t i=1; i<ido; ++i)
        {
        T a0, a1, a2, a3, a4, a5, a6, a7;
        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
        PMC(a1,a5,CC(i,1,k),CC(i,5,k));
        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
        PMC(a3,a7,CC(i,3,k),CC(i,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
920
921
        ROTX90<fwd>(a6);
        ROTX90<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
922
923
924
925
        PMINPLACE(a0,a2);
        PMINPLACE(a1,a3);
        PMINPLACE(a4,a6);
        PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
926
927
928
        ROTX45<fwd>(a5);
        ROTX90<fwd>(a3);
        ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
929
930
931
932
933
        PMINPLACE(a0,a1);
        PMINPLACE(a2,a3);
        PMINPLACE(a4,a5);
        PMINPLACE(a6,a7);
        CH(i,k,0) = a0;
Martin Reinecke's avatar
Martin Reinecke committed
934
935
936
937
938
939
940
        CH(i,k,1) = a4.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i));
        CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i));
        CH(i,k,5) = a5.template special_mul<fwd>(WA(4,i));
        CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i));
        CH(i,k,7) = a7.template special_mul<fwd>(WA(6,i));
Martin Reinecke's avatar
Martin Reinecke committed
941
942
943
944
945
        }
      }
   }


Martin Reinecke's avatar
Martin Reinecke committed
946
#define POCKETFFT_PREP11(idx) \
Martin Reinecke's avatar
Martin Reinecke committed
947
948
949
950
951
952
953
954
955
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;

Martin Reinecke's avatar
Martin Reinecke committed
956
#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
Martin Reinecke's avatar
Martin Reinecke committed
957
958
959
960
961
962
963
        { \
        T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
          cb; \
        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
        PMC(out1,out2,ca,cb); \
        }
Martin Reinecke's avatar
Martin Reinecke committed
964
965
966
#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
Martin Reinecke's avatar
Martin Reinecke committed
967
968
        { \
        T da,db; \
Martin Reinecke's avatar
Martin Reinecke committed
969
970
971
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
Martin Reinecke's avatar
Martin Reinecke committed
972
973
        }

Martin Reinecke's avatar
Martin Reinecke committed
974
975
976
template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
Martin Reinecke committed
977
978
979
  {
  constexpr size_t cdim=11;
  constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
Martin Reinecke's avatar
Martin Reinecke committed
980
               tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
Martin Reinecke's avatar
Martin Reinecke committed
981
               tw2r= T0(0.4154150130018864255292741492296232L),
Martin Reinecke's avatar
Martin Reinecke committed
982
               tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
Martin Reinecke's avatar
Martin Reinecke committed
983
               tw3r= T0(-0.1423148382732851404437926686163697L),
Martin Reinecke's avatar
Martin Reinecke committed
984
               tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
Martin Reinecke's avatar
Martin Reinecke committed
985
               tw4r= T0(-0.6548607339452850640569250724662936L),
Martin Reinecke's avatar
Martin Reinecke committed
986
               tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
Martin Reinecke's avatar
Martin Reinecke committed
987
               tw5r= T0(-0.9594929736144973898903680570663277L),
Martin Reinecke's avatar
Martin Reinecke committed
988
989
990
991
               tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);

  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
Martin Reinecke's avatar
Martin Reinecke committed
992
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
Martin Reinecke's avatar
Martin Reinecke committed
993
994
995
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };
Martin Reinecke's avatar
Martin Reinecke committed
996
997
998
999

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
Martin Reinecke's avatar
Martin Reinecke committed
1000
      POCKETFFT_PREP11(0)
For faster browsing, not all history is shown. View entire blame