pocketfft_hdronly.h 104 KB
Newer Older
1
/*
2
3
4
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
5
6
7
8
9
10
11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
38
39
40
41

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

42
#ifndef __cplusplus
43
#error This file is C++ and requires a C++ compiler.
44
45
#endif

Martin Reinecke's avatar
sync    
Martin Reinecke committed
46
#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47
#error This file requires at least C++11 support.
48
49
#endif

50
51
52
53
#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
#endif

54
55
56
57
58
59
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
Martin Reinecke's avatar
sync    
Martin Reinecke committed
60
#include <complex>
61
62
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
Martin Reinecke's avatar
Martin Reinecke committed
63
#include <mutex>
64
#endif
Peter Bell's avatar
Peter Bell committed
65

66
#ifndef POCKETFFT_NO_MULTITHREADING
Peter Bell's avatar
Peter Bell committed
67
68
69
70
71
72
#include <mutex>
#include <condition_variable>
#include <thread>
#include <queue>
#include <atomic>
#include <functional>
Martin Reinecke's avatar
Martin Reinecke committed
73

74
75
#ifdef POCKETFFT_PTHREADS
#  include <pthread.h>
Martin Reinecke's avatar
Martin Reinecke committed
76
#endif
77
#endif
78

Martin Reinecke's avatar
Martin Reinecke committed
79
#if defined(__GNUC__)
80
81
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
82
#elif defined(_MSC_VER)
83
84
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
85
#else
86
87
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
88
89
90
91
92
#endif

namespace pocketfft {

namespace detail {
93
94
using std::size_t;
using std::ptrdiff_t;
95

96
97
98
99
// Always use std:: for <cmath> functions
template <typename T> T cos(T) = delete;
template <typename T> T sin(T) = delete;
template <typename T> T sqrt(T) = delete;
Martin Reinecke's avatar
Martin Reinecke committed
100

101
102
using shape_t = std::vector<size_t>;
using stride_t = std::vector<ptrdiff_t>;
Martin Reinecke's avatar
Martin Reinecke committed
103
104
105
106

constexpr bool FORWARD  = true,
               BACKWARD = false;

Martin Reinecke's avatar
sync    
Martin Reinecke committed
107
108
109
110
// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
111
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
sync    
Martin Reinecke committed
112
#elif defined(__clang__)
113
#ifdef __APPLE__
Martin Reinecke's avatar
Martin Reinecke committed
114
115
116
117
118
#  if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 3)
#     undef POCKETFFT_NO_VECTORS
#  endif
#elif __clang_major__ >= 5
#  undef POCKETFFT_NO_VECTORS
119
#endif
Martin Reinecke's avatar
sync    
Martin Reinecke committed
120
121
122
123
124
125
126
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
127
128
template<typename T> struct VLEN { static constexpr size_t val=1; };

Martin Reinecke's avatar
Martin Reinecke committed
129
130
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
Martin Reinecke's avatar
Martin Reinecke committed
131
132
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
Martin Reinecke's avatar
Martin Reinecke committed
133
#elif (defined(__AVX__))
Martin Reinecke's avatar
Martin Reinecke committed
134
135
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
Martin Reinecke's avatar
Martin Reinecke committed
136
#elif (defined(__SSE2__))
Martin Reinecke's avatar
Martin Reinecke committed
137
138
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
sync    
Martin Reinecke committed
139
#elif (defined(__VSX__))
Martin Reinecke's avatar
Martin Reinecke committed
140
141
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
Martin Reinecke committed
142
143
144
145
146
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
147
template<typename T> class arr
148
149
150
151
152
  {
  private:
    T *p;
    size_t sz;

Martin Reinecke's avatar
Martin Reinecke committed
153
#if defined(POCKETFFT_NO_VECTORS)
154
155
156
157
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
158
      if (!res) throw std::bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
159
160
161
162
163
164
165
166
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
167
      void *res = aligned_alloc(64,num*sizeof(T));
168
      if (!res) throw std::bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
169
170
171
172
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
173
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
174
175
176
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
177
      void *ptr = malloc(num*sizeof(T)+64);
178
      if (!ptr) throw std::bad_alloc();
179
180
181
182
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
183
184
      }
    static void dealloc(T *ptr)
185
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
186
#endif
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
Martin Reinecke's avatar
Martin Reinecke committed
219
220
  void Split(T &r_, T &i_) const { r_=r; i_=i; }
  void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; }
221
222
223
224
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
225
226
227
228
229
230
231
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
Martin Reinecke's avatar
Martin Reinecke committed
232
233
234
235
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    { r-=other.r; i-=other.i; return *this; }
236
237
238
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
Martin Reinecke's avatar
Martin Reinecke committed
239
240
241
242
243
244
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r+other.r, i+other.i}; }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r-other.r, i-other.i}; }
245
246
247
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
248
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
249
250
    -> cmplx<decltype(r+other.r)>
    {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
251
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
252
253
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
254
255
    }
};
Martin Reinecke's avatar
Martin Reinecke committed
256
257
template<typename T> inline void PM(T &a, T &b, T c, T d)
  { a=c+d; b=c-d; }
Martin Reinecke's avatar
Martin Reinecke committed
258
259
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a+=b; b=t-b; }
260
261
template<typename T> inline void MPINPLACE(T &a, T &b)
  { T t = a; a-=b; b=t+b; }
262
263
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }
Martin Reinecke's avatar
Martin Reinecke committed
264
265
266
267
268
template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
  {
  res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
            : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
  }
269
270
271

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
272
273
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
274
275
276
277
278
279
280

//
// twiddle factor section
//
template<typename T> class sincos_2pibyn
  {
  private:
281
    using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
Martin Reinecke's avatar
Martin Reinecke committed
282
    size_t N, mask, shift;
Martin Reinecke's avatar
Martin Reinecke committed
283
    arr<cmplx<Thigh>> v1, v2;
284

Martin Reinecke's avatar
Martin Reinecke committed
285
    static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
286
      {
Martin Reinecke's avatar
Martin Reinecke committed
287
288
      x<<=3;
      if (x<4*n) // first half
289
        {
Martin Reinecke's avatar
Martin Reinecke committed
290
291
        if (x<2*n) // first quadrant
          {
292
293
          if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
          return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
294
295
296
297
          }
        else // second quadrant
          {
          x-=2*n;
298
299
          if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
          return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
300
          }
301
302
303
        }
      else
        {
Martin Reinecke's avatar
Martin Reinecke committed
304
305
306
        x=8*n-x;
        if (x<2*n) // third quadrant
          {
307
308
          if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
          return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
309
310
311
312
          }
        else // fourth quadrant
          {
          x-=2*n;
313
314
          if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
          return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
315
          }
316
317
318
319
        }
      }

  public:
320
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
321
      : N(n)
Martin Reinecke's avatar
Martin Reinecke committed
322
323
324
      {
      constexpr auto pi = 3.141592653589793238462643383279502884197L;
      Thigh ang = Thigh(0.25L*pi/n);
Martin Reinecke's avatar
Martin Reinecke committed
325
      size_t nval = (n+2)/2;
Martin Reinecke's avatar
Martin Reinecke committed
326
      shift = 1;
Martin Reinecke's avatar
Martin Reinecke committed
327
      while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
Martin Reinecke's avatar
Martin Reinecke committed
328
329
330
331
332
      mask = (size_t(1)<<shift)-1;
      v1.resize(mask+1);
      v1[0].Set(Thigh(1), Thigh(0));
      for (size_t i=1; i<v1.size(); ++i)
        v1[i]=calc(i,n,ang);
Martin Reinecke's avatar
Martin Reinecke committed
333
      v2.resize((nval+mask)/(mask+1));
Martin Reinecke's avatar
Martin Reinecke committed
334
335
336
337
      v2[0].Set(Thigh(1), Thigh(0));
      for (size_t i=1; i<v2.size(); ++i)
        v2[i]=calc(i*(mask+1),n,ang);
      }
338
339

    cmplx<T> operator[](size_t idx) const
340
      {
Martin Reinecke's avatar
Martin Reinecke committed
341
342
343
344
345
346
      if (2*idx<=N)
        {
        auto x1=v1[idx&mask], x2=v2[idx>>shift];
        return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
        }
      idx = N-idx;
Martin Reinecke's avatar
Martin Reinecke committed
347
      auto x1=v1[idx&mask], x2=v2[idx>>shift];
Martin Reinecke's avatar
Martin Reinecke committed
348
      return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
349
350
351
      }
  };

Martin Reinecke's avatar
Martin Reinecke committed
352
struct util // hack to avoid duplicate symbols
353
  {
354
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
355
    {
Martin Reinecke's avatar
Martin Reinecke committed
356
357
358
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
359
360
361
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
362
363
364
365
    if (n>1) res=n;
    return res;
    }

366
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
367
    {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
368
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
Martin Reinecke's avatar
Martin Reinecke committed
369
370
371
372
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
373
374
375
376
377
378
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
sync    
Martin Reinecke committed
379
380
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
381
382
    }

Martin Reinecke's avatar
Martin Reinecke committed
383
  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
384
  static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
385
386
387
388
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
    for (size_t f11=1; f11<bestfac; f11*=11)
      for (size_t f117=f11; f117<bestfac; f117*=7)
        for (size_t f1175=f117; f1175<bestfac; f1175*=5)
          {
          size_t x=f1175;
          while (x<n) x*=2;
          for (;;)
            {
            if (x<n)
              x*=3;
            else if (x>n)
              {
              if (x<bestfac) bestfac=x;
              if (x&1) break;
              x>>=1;
              }
            else
              return n;
            }
          }
    return bestfac;
    }

  /* returns the smallest composite of 2, 3, 5 which is >= n */
  static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
    {
    if (n<=6) return n;

    size_t bestfac=2*n;
    for (size_t f5=1; f5<bestfac; f5*=5)
      {
      size_t x = f5;
      while (x<n) x *= 2;
      for (;;)
        {
        if (x<n)
          x*=3;
        else if (x>n)
          {
          if (x<bestfac) bestfac=x;
          if (x&1) break;
          x>>=1;
          }
        else
          return n;
        }
      }
Martin Reinecke's avatar
Martin Reinecke committed
436
437
    return bestfac;
    }
438

Martin Reinecke's avatar
Martin Reinecke committed
439
440
441
442
443
444
445
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }
Martin Reinecke's avatar
Martin Reinecke committed
446

447
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
448
449
450
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
451
    if (ndim<1) throw std::runtime_error("ndim must be >= 1");
Martin Reinecke's avatar
Martin Reinecke committed
452
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
453
      throw std::runtime_error("stride dimension mismatch");
Martin Reinecke's avatar
sync    
Martin Reinecke committed
454
    if (inplace && (stride_in!=stride_out))
455
      throw std::runtime_error("stride mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
456
457
    }

458
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
459
460
461
462
463
464
465
466
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
467
468
      if (ax>=ndim) throw std::invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
469
470
471
      }
    }

472
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
473
474
475
476
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
477
    if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
478
    }
Martin Reinecke's avatar
Martin Reinecke committed
479

480
481
482
483
484
485
486
487
488
489
490
491
492
493
#ifdef POCKETFFT_NO_MULTITHREADING
  static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
    size_t /*axis*/, size_t /*vlen*/)
    { return 1; }
#else
  static size_t thread_count (size_t nthreads, const shape_t &shape,
    size_t axis, size_t vlen)
    {
    if (nthreads==1) return 1;
    size_t size = prod(shape);
    size_t parallel = size / (shape[axis] * vlen);
    if (shape[axis] < 1000)
      parallel /= 4;
    size_t max_threads = nthreads == 0 ?
494
495
      std::thread::hardware_concurrency() : nthreads;
    return std::max(size_t(1), std::min(parallel, max_threads));
496
497
    }
#endif
Martin Reinecke's avatar
Martin Reinecke committed
498
  };
499

Peter Bell's avatar
Peter Bell committed
500
namespace threading {
501
502
503

#ifdef POCKETFFT_NO_MULTITHREADING

Martin Reinecke's avatar
fix    
Martin Reinecke committed
504
505
constexpr inline size_t thread_id() { return 0; }
constexpr inline size_t num_threads() { return 1; }
506
507
508
509
510
511
512

template <typename Func>
void thread_map(size_t /* nthreads */, Func f)
  { f(); }

#else

Martin Reinecke's avatar
fix    
Martin Reinecke committed
513
inline size_t &thread_id()
514
515
516
517
  {
  static thread_local size_t thread_id_=0;
  return thread_id_;
  }
Martin Reinecke's avatar
fix    
Martin Reinecke committed
518
inline size_t &num_threads()
519
520
521
522
  {
  static thread_local size_t num_threads_=1;
  return num_threads_;
  }
523
static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
Peter Bell's avatar
Peter Bell committed
524
525
526

class latch
  {
527
528
529
530
    std::atomic<size_t> num_left_;
    std::mutex mut_;
    std::condition_variable completed_;
    using lock_t = std::unique_lock<std::mutex>;
Peter Bell's avatar
Peter Bell committed
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552

  public:
    latch(size_t n): num_left_(n) {}

    void count_down()
      {
      lock_t lock(mut_);
      if (--num_left_)
        return;
      completed_.notify_all();
      }

    void wait()
      {
      lock_t lock(mut_);
      completed_.wait(lock, [this]{ return is_ready(); });
      }
    bool is_ready() { return num_left_ == 0; }
  };

template <typename T> class concurrent_queue
  {
553
554
555
    std::queue<T> q_;
    std::mutex mut_;
    std::condition_variable item_added_;
556
    bool shutdown_;
557
    using lock_t = std::unique_lock<std::mutex>;
Peter Bell's avatar
Peter Bell committed
558
559
560
561
562
563
564
565
566

  public:
    concurrent_queue(): shutdown_(false) {}

    void push(T val)
      {
      {
      lock_t lock(mut_);
      if (shutdown_)
567
        throw std::runtime_error("Item added to queue after shutdown");
Peter Bell's avatar
Peter Bell committed
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
      q_.push(move(val));
      }
      item_added_.notify_one();
      }

    bool pop(T & val)
      {
      lock_t lock(mut_);
      item_added_.wait(lock, [this] { return (!q_.empty() || shutdown_); });
      if (q_.empty())
        return false;  // We are shutting down

      val = std::move(q_.front());
      q_.pop();
      return true;
      }

    void shutdown()
      {
      {
      lock_t lock(mut_);
      shutdown_ = true;
      }
      item_added_.notify_all();
      }
593
594

    void restart() { shutdown_ = false; }
Peter Bell's avatar
Peter Bell committed
595
596
597
598
  };

class thread_pool
  {
599
600
    concurrent_queue<std::function<void()>> work_queue_;
    std::vector<std::thread> threads_;
Peter Bell's avatar
Peter Bell committed
601
602
603

    void worker_main()
      {
604
      std::function<void()> work;
Peter Bell's avatar
Peter Bell committed
605
606
607
608
      while (work_queue_.pop(work))
        work();
      }

609
    void create_threads()
Peter Bell's avatar
Peter Bell committed
610
      {
611
      size_t nthreads = threads_.size();
Peter Bell's avatar
Peter Bell committed
612
613
      for (size_t i=0; i<nthreads; ++i)
        {
614
        try { threads_[i] = std::thread([this]{ worker_main(); }); }
Peter Bell's avatar
Peter Bell committed
615
616
        catch (...)
          {
617
          shutdown();
Peter Bell's avatar
Peter Bell committed
618
619
620
621
622
          throw;
          }
        }
      }

623
624
625
626
627
  public:
    explicit thread_pool(size_t nthreads):
      threads_(nthreads)
      { create_threads(); }

628
    thread_pool(): thread_pool(max_threads) {}
Peter Bell's avatar
Peter Bell committed
629

630
631
    ~thread_pool() { shutdown(); }

632
    void submit(std::function<void()> work)
633
634
635
636
637
      {
      work_queue_.push(move(work));
      }

    void shutdown()
Peter Bell's avatar
Peter Bell committed
638
639
      {
      work_queue_.shutdown();
640
641
642
      for (auto &thread : threads_)
        if (thread.joinable())
          thread.join();
Peter Bell's avatar
Peter Bell committed
643
644
      }

645
    void restart()
Peter Bell's avatar
Peter Bell committed
646
      {
647
648
      work_queue_.restart();
      create_threads();
Martin Reinecke's avatar
Martin Reinecke committed
649
      }
Martin Reinecke's avatar
Martin Reinecke committed
650
  };
651

Peter Bell's avatar
Peter Bell committed
652
653
654
thread_pool & get_pool()
  {
  static thread_pool pool;
655
#ifdef POCKETFFT_PTHREADS
656
657
  static std::once_flag f;
  std::call_once(f,
658
659
660
661
662
663
664
665
666
    []{
    pthread_atfork(
      +[]{ get_pool().shutdown(); },  // prepare
      +[]{ get_pool().restart(); },   // parent
      +[]{ get_pool().restart(); }    // child
      );
    });
#endif

Peter Bell's avatar
Peter Bell committed
667
668
669
670
671
672
673
674
  return pool;
  }

/** Map a function f over nthreads */
template <typename Func>
void thread_map(size_t nthreads, Func f)
  {
  if (nthreads == 0)
675
    nthreads = max_threads;
Peter Bell's avatar
Peter Bell committed
676
677
678
679

  if (nthreads == 1)
    { f(); return; }

680
  auto & pool = get_pool();
Peter Bell's avatar
Peter Bell committed
681
  latch counter(nthreads);
682
683
  std::exception_ptr ex;
  std::mutex ex_mut;
Peter Bell's avatar
Peter Bell committed
684
685
686
  for (size_t i=0; i<nthreads; ++i)
    {
    pool.submit(
687
      [&f, &counter, &ex, &ex_mut, i, nthreads] {
688
689
      thread_id() = i;
      num_threads() = nthreads;
690
691
692
      try { f(); }
      catch (...)
        {
693
694
        std::lock_guard<std::mutex> lock(ex_mut);
        ex = std::current_exception();
695
        }
Peter Bell's avatar
Peter Bell committed
696
697
698
699
      counter.count_down();
      });
    }
  counter.wait();
700
  if (ex)
701
    std::rethrow_exception(ex);
Peter Bell's avatar
Peter Bell committed
702
  }
703
704
705

#endif

Peter Bell's avatar
Peter Bell committed
706
707
}

708
709
710
711
712
713
714
715
716
717
718
719
720
//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

Martin Reinecke's avatar
Martin Reinecke committed
721
    size_t length;
722
    arr<cmplx<T0>> mem;
723
    std::vector<fctdata> fact;
724
725

    void add_factor(size_t factor)
Martin Reinecke's avatar
Martin Reinecke committed
726
      { fact.push_back({factor, nullptr, nullptr}); }
727

Martin Reinecke's avatar
Martin Reinecke committed
728
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
729
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
730
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
731
  {
732
733
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
734
735
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+2*c)]; };
736
737
738
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

739
740
741
742
743
744
745
746
747
748
749
750
751
752
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
753
        special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
754
755
756
757
        }
      }
  }

758
#define POCKETFFT_PREP3(idx) \
759
        T t0 = CC(idx,0,k), t1, t2; \
Martin Reinecke's avatar
Martin Reinecke committed
760
        PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
761
        CH(idx,k,0)=t0+t1;
762
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
763
        { \
Martin Reinecke's avatar
Martin Reinecke committed
764
765
766
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
767
        }
768
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
769
        { \
Martin Reinecke's avatar
Martin Reinecke committed
770
771
772
773
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
774
        }
Martin Reinecke's avatar
Martin Reinecke committed
775
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
776
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
777
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
778
  {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
779
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
780
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
781

782
783
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
784
785
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+3*c)]; };
786
787
788
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

789
790
791
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
792
793
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
794
795
796
797
798
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
799
800
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
801
802
803
      }
      for (size_t i=1; i<ido; ++i)
        {
804
805
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
806
807
808
809
        }
      }
  }

810
811
812
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
813

Martin Reinecke's avatar
Martin Reinecke committed
814
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
815
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
816
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
817
  {
818
819
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
820
821
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+4*c)]; };
822
823
824
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

825
826
827
828
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
Martin Reinecke's avatar
Martin Reinecke committed
829
830
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
831
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
832
833
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
834
835
836
837
838
839
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
Martin Reinecke's avatar
Martin Reinecke committed
840
841
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
842
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
843
844
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
845
846
847
      }
      for (size_t i=1; i<ido; ++i)
        {
Martin Reinecke's avatar
Martin Reinecke committed
848
        T t1, t2, t3, t4;
849
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
Martin Reinecke's avatar
Martin Reinecke committed
850
851
        PM(t2,t1,cc0,cc2);
        PM(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
852
        ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
853
854
855
856
        CH(i,k,0) = t2+t3;
        special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
        special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
        special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
857
858
859
860
        }
      }
  }

861
#define POCKETFFT_PREP5(idx) \
862
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
Martin Reinecke's avatar
Martin Reinecke committed
863
864
        PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
865
866
867
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

868
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
869
870
871
872
873
874
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
Martin Reinecke's avatar
Martin Reinecke committed
875
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
876
877
        }

878
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
879
880
881
882
883
884
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
Martin Reinecke's avatar
Martin Reinecke committed
885
886
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
887
        }
Martin Reinecke's avatar
Martin Reinecke committed
888
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
889
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
890
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
891
  {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
892
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
893
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
sync    
Martin Reinecke committed
894
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
895
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
896

897
898
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
899
900
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+5*c)]; };
901
902
903
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

904
905
906
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
907
908
909
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
910
911
912
913
914
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
915
916
917
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
918
919
920
      }
      for (size_t i=1; i<ido; ++i)
        {
921
922
923
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
924
925
926
927
        }
      }
  }

928
929
930
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
931

932
#define POCKETFFT_PREP7(idx) \
933
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
Martin Reinecke's avatar
Martin Reinecke committed
934
935
936
        PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
937
938
939
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

940
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
941
942
943
944
945
946
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
Martin Reinecke's avatar
Martin Reinecke committed
947
        PM(out1,out2,ca,cb); \
948
        }
949
950
951
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
952
953
        { \
        T da,db; \
954
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
955
956
        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
957
958
        }

Martin Reinecke's avatar
Martin Reinecke committed
959
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
960
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
961
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
962
  {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
963
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
964
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
sync    
Martin Reinecke committed
965
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
966
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
sync    
Martin Reinecke committed
967
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
968
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
969

970
971
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
972
973
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+7*c)]; };
974
975
976
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

977
978
979
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
980
981
982
983
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
984
985
986
987
988
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
989
990
991
992
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
993
994
995
      }
      for (size_t i=1; i<ido; ++i)
        {
996
997
998
999
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1000
1001
1002
1003
        }
      }
  }

1004
1005
1006
1007
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
1008

Martin Reinecke's avatar
Martin Reinecke committed
1009
template <bool fwd, typename T> void ROTX45(T &a) const
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1010
1011
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
1012
  if (fwd)
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1013
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
1014
1015
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1016
  }
Martin Reinecke's avatar
Martin Reinecke committed
1017
template <bool fwd, typename T> void ROTX135(T &a) const
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1018
1019
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
1020
  if (fwd)
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1021
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
1022
1023
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1024
1025
  }

Martin Reinecke's avatar
Martin Reinecke committed
1026
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
1027
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
1028
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1029
  {
1030
1031
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
1032
1033
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+8*c)]; };
1034
1035
1036
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
sync    
Martin Reinecke committed
1037
1038
1039
1040
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
Martin Reinecke's avatar
Martin Reinecke committed
1041
1042
      PM(a1,a5,CC(0,1,k),CC(0,5,k));
      PM(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1043
      PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
1044
      ROTX90<fwd>(a3);
Martin Reinecke's avatar
Martin Reinecke committed
1045
1046

      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1047
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
1048
1049
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
1050

Martin Reinecke's avatar
Martin Reinecke committed
1051
1052
1053
1054
      PM(a0,a4,CC(0,0,k),CC(0,4,k));
      PM(a2,a6,CC(0,2,k),CC(0,6,k));
      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
Martin Reinecke's avatar
Martin Reinecke committed
1055
      ROTX90<fwd>(a6);
Martin Reinecke's avatar
Martin Reinecke committed
1056
1057
      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1058