pocketfft_hdronly.h 104 KB
Newer Older
1
/*
2
3
4
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
5
6
7
8
9
10
11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
38
39
40
41

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

42
#ifndef __cplusplus
43
#error This file is C++ and requires a C++ compiler.
44
45
#endif

Martin Reinecke's avatar
sync    
Martin Reinecke committed
46
#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47
#error This file requires at least C++11 support.
48
49
#endif

50
51
52
53
#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
#endif

54
55
56
57
58
59
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
Martin Reinecke's avatar
sync    
Martin Reinecke committed
60
#include <complex>
61
62
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
Martin Reinecke's avatar
Martin Reinecke committed
63
#include <mutex>
64
#endif
Peter Bell's avatar
Peter Bell committed
65

66
#ifndef POCKETFFT_NO_MULTITHREADING
Peter Bell's avatar
Peter Bell committed
67
68
69
70
71
72
#include <mutex>
#include <condition_variable>
#include <thread>
#include <queue>
#include <atomic>
#include <functional>
Martin Reinecke's avatar
Martin Reinecke committed
73

74
75
#ifdef POCKETFFT_PTHREADS
#  include <pthread.h>
Martin Reinecke's avatar
Martin Reinecke committed
76
#endif
77
#endif
78

Martin Reinecke's avatar
Martin Reinecke committed
79
#if defined(__GNUC__)
80
81
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
82
#elif defined(_MSC_VER)
83
84
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
85
#else
86
87
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
88
89
90
91
92
#endif

namespace pocketfft {

namespace detail {
93
94
using std::size_t;
using std::ptrdiff_t;
95

96
97
98
99
// Always use std:: for <cmath> functions
template <typename T> T cos(T) = delete;
template <typename T> T sin(T) = delete;
template <typename T> T sqrt(T) = delete;
Martin Reinecke's avatar
Martin Reinecke committed
100

101
102
using shape_t = std::vector<size_t>;
using stride_t = std::vector<ptrdiff_t>;
Martin Reinecke's avatar
Martin Reinecke committed
103
104
105
106

constexpr bool FORWARD  = true,
               BACKWARD = false;

Martin Reinecke's avatar
sync    
Martin Reinecke committed
107
108
109
110
// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
111
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
sync    
Martin Reinecke committed
112
#elif defined(__clang__)
113
114
115
116
117
#ifdef __APPLE__
#if __clang__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#else
Martin Reinecke's avatar
Martin Reinecke committed
118
#if __clang_major__>=5
Martin Reinecke's avatar
sync    
Martin Reinecke committed
119
120
#undef POCKETFFT_NO_VECTORS
#endif
121
#endif
Martin Reinecke's avatar
sync    
Martin Reinecke committed
122
123
124
125
126
127
128
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
129
130
template<typename T> struct VLEN { static constexpr size_t val=1; };

Martin Reinecke's avatar
Martin Reinecke committed
131
132
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
Martin Reinecke's avatar
Martin Reinecke committed
133
134
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
Martin Reinecke's avatar
Martin Reinecke committed
135
#elif (defined(__AVX__))
Martin Reinecke's avatar
Martin Reinecke committed
136
137
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
Martin Reinecke's avatar
Martin Reinecke committed
138
#elif (defined(__SSE2__))
Martin Reinecke's avatar
Martin Reinecke committed
139
140
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
sync    
Martin Reinecke committed
141
#elif (defined(__VSX__))
Martin Reinecke's avatar
Martin Reinecke committed
142
143
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
Martin Reinecke committed
144
145
146
147
148
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
149
template<typename T> class arr
150
151
152
153
154
  {
  private:
    T *p;
    size_t sz;

Martin Reinecke's avatar
Martin Reinecke committed
155
#if defined(POCKETFFT_NO_VECTORS)
156
157
158
159
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
160
      if (!res) throw std::bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
161
162
163
164
165
166
167
168
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
169
      void *res = aligned_alloc(64,num*sizeof(T));
170
      if (!res) throw std::bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
171
172
173
174
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
175
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
176
177
178
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
179
      void *ptr = malloc(num*sizeof(T)+64);
180
      if (!ptr) throw std::bad_alloc();
181
182
183
184
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
185
186
      }
    static void dealloc(T *ptr)
187
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
188
#endif
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
Martin Reinecke's avatar
Martin Reinecke committed
221
222
  void Split(T &r_, T &i_) const { r_=r; i_=i; }
  void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; }
223
224
225
226
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
227
228
229
230
231
232
233
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
Martin Reinecke's avatar
Martin Reinecke committed
234
235
236
237
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    { r-=other.r; i-=other.i; return *this; }
238
239
240
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
Martin Reinecke's avatar
Martin Reinecke committed
241
242
243
244
245
246
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r+other.r, i+other.i}; }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r-other.r, i-other.i}; }
247
248
249
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
250
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
251
252
    -> cmplx<decltype(r+other.r)>
    {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
253
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
254
255
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
256
257
    }
};
Martin Reinecke's avatar
Martin Reinecke committed
258
259
template<typename T> inline void PM(T &a, T &b, T c, T d)
  { a=c+d; b=c-d; }
Martin Reinecke's avatar
Martin Reinecke committed
260
261
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a+=b; b=t-b; }
262
263
template<typename T> inline void MPINPLACE(T &a, T &b)
  { T t = a; a-=b; b=t+b; }
264
265
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }
Martin Reinecke's avatar
Martin Reinecke committed
266
267
268
269
270
template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
  {
  res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
            : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
  }
271
272
273

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
274
275
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
276
277
278
279
280
281
282

//
// twiddle factor section
//
template<typename T> class sincos_2pibyn
  {
  private:
283
    using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
Martin Reinecke's avatar
Martin Reinecke committed
284
    size_t N, mask, shift;
Martin Reinecke's avatar
Martin Reinecke committed
285
    arr<cmplx<Thigh>> v1, v2;
286

Martin Reinecke's avatar
Martin Reinecke committed
287
    static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
288
      {
Martin Reinecke's avatar
Martin Reinecke committed
289
290
      x<<=3;
      if (x<4*n) // first half
291
        {
Martin Reinecke's avatar
Martin Reinecke committed
292
293
        if (x<2*n) // first quadrant
          {
294
295
          if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
          return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
296
297
298
299
          }
        else // second quadrant
          {
          x-=2*n;
300
301
          if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
          return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
302
          }
303
304
305
        }
      else
        {
Martin Reinecke's avatar
Martin Reinecke committed
306
307
308
        x=8*n-x;
        if (x<2*n) // third quadrant
          {
309
310
          if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
          return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
311
312
313
314
          }
        else // fourth quadrant
          {
          x-=2*n;
315
316
          if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
          return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
Martin Reinecke's avatar
Martin Reinecke committed
317
          }
318
319
320
321
        }
      }

  public:
322
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
323
      : N(n)
Martin Reinecke's avatar
Martin Reinecke committed
324
325
326
      {
      constexpr auto pi = 3.141592653589793238462643383279502884197L;
      Thigh ang = Thigh(0.25L*pi/n);
Martin Reinecke's avatar
Martin Reinecke committed
327
      size_t nval = (n+2)/2;
Martin Reinecke's avatar
Martin Reinecke committed
328
      shift = 1;
Martin Reinecke's avatar
Martin Reinecke committed
329
      while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
Martin Reinecke's avatar
Martin Reinecke committed
330
331
332
333
334
      mask = (size_t(1)<<shift)-1;
      v1.resize(mask+1);
      v1[0].Set(Thigh(1), Thigh(0));
      for (size_t i=1; i<v1.size(); ++i)
        v1[i]=calc(i,n,ang);
Martin Reinecke's avatar
Martin Reinecke committed
335
      v2.resize((nval+mask)/(mask+1));
Martin Reinecke's avatar
Martin Reinecke committed
336
337
338
339
      v2[0].Set(Thigh(1), Thigh(0));
      for (size_t i=1; i<v2.size(); ++i)
        v2[i]=calc(i*(mask+1),n,ang);
      }
340
341

    cmplx<T> operator[](size_t idx) const
342
      {
Martin Reinecke's avatar
Martin Reinecke committed
343
344
345
346
347
348
      if (2*idx<=N)
        {
        auto x1=v1[idx&mask], x2=v2[idx>>shift];
        return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
        }
      idx = N-idx;
Martin Reinecke's avatar
Martin Reinecke committed
349
      auto x1=v1[idx&mask], x2=v2[idx>>shift];
Martin Reinecke's avatar
Martin Reinecke committed
350
      return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
351
352
353
      }
  };

Martin Reinecke's avatar
Martin Reinecke committed
354
struct util // hack to avoid duplicate symbols
355
  {
356
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
357
    {
Martin Reinecke's avatar
Martin Reinecke committed
358
359
360
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
361
362
363
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
364
365
366
367
    if (n>1) res=n;
    return res;
    }

368
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
369
    {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
370
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
Martin Reinecke's avatar
Martin Reinecke committed
371
372
373
374
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
375
376
377
378
379
380
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
sync    
Martin Reinecke committed
381
382
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
383
384
    }

Martin Reinecke's avatar
Martin Reinecke committed
385
  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
386
  static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
387
388
389
390
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
    for (size_t f11=1; f11<bestfac; f11*=11)
      for (size_t f117=f11; f117<bestfac; f117*=7)
        for (size_t f1175=f117; f1175<bestfac; f1175*=5)
          {
          size_t x=f1175;
          while (x<n) x*=2;
          for (;;)
            {
            if (x<n)
              x*=3;
            else if (x>n)
              {
              if (x<bestfac) bestfac=x;
              if (x&1) break;
              x>>=1;
              }
            else
              return n;
            }
          }
    return bestfac;
    }

  /* returns the smallest composite of 2, 3, 5 which is >= n */
  static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
    {
    if (n<=6) return n;

    size_t bestfac=2*n;
    for (size_t f5=1; f5<bestfac; f5*=5)
      {
      size_t x = f5;
      while (x<n) x *= 2;
      for (;;)
        {
        if (x<n)
          x*=3;
        else if (x>n)
          {
          if (x<bestfac) bestfac=x;
          if (x&1) break;
          x>>=1;
          }
        else
          return n;
        }
      }
Martin Reinecke's avatar
Martin Reinecke committed
438
439
    return bestfac;
    }
440

Martin Reinecke's avatar
Martin Reinecke committed
441
442
443
444
445
446
447
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }
Martin Reinecke's avatar
Martin Reinecke committed
448

449
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
450
451
452
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
453
    if (ndim<1) throw std::runtime_error("ndim must be >= 1");
Martin Reinecke's avatar
Martin Reinecke committed
454
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
455
      throw std::runtime_error("stride dimension mismatch");
Martin Reinecke's avatar
sync    
Martin Reinecke committed
456
    if (inplace && (stride_in!=stride_out))
457
      throw std::runtime_error("stride mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
458
459
    }

460
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
461
462
463
464
465
466
467
468
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
469
470
      if (ax>=ndim) throw std::invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
471
472
473
      }
    }

474
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
475
476
477
478
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
479
    if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
480
    }
Martin Reinecke's avatar
Martin Reinecke committed
481

482
483
484
485
486
487
488
489
490
491
492
493
494
495
#ifdef POCKETFFT_NO_MULTITHREADING
  static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
    size_t /*axis*/, size_t /*vlen*/)
    { return 1; }
#else
  static size_t thread_count (size_t nthreads, const shape_t &shape,
    size_t axis, size_t vlen)
    {
    if (nthreads==1) return 1;
    size_t size = prod(shape);
    size_t parallel = size / (shape[axis] * vlen);
    if (shape[axis] < 1000)
      parallel /= 4;
    size_t max_threads = nthreads == 0 ?
496
497
      std::thread::hardware_concurrency() : nthreads;
    return std::max(size_t(1), std::min(parallel, max_threads));
498
499
    }
#endif
Martin Reinecke's avatar
Martin Reinecke committed
500
  };
501

Peter Bell's avatar
Peter Bell committed
502
namespace threading {
503
504
505

#ifdef POCKETFFT_NO_MULTITHREADING

Martin Reinecke's avatar
fix    
Martin Reinecke committed
506
507
constexpr inline size_t thread_id() { return 0; }
constexpr inline size_t num_threads() { return 1; }
508
509
510
511
512
513
514

template <typename Func>
void thread_map(size_t /* nthreads */, Func f)
  { f(); }

#else

Martin Reinecke's avatar
fix    
Martin Reinecke committed
515
inline size_t &thread_id()
516
517
518
519
  {
  static thread_local size_t thread_id_=0;
  return thread_id_;
  }
Martin Reinecke's avatar
fix    
Martin Reinecke committed
520
inline size_t &num_threads()
521
522
523
524
  {
  static thread_local size_t num_threads_=1;
  return num_threads_;
  }
525
static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
Peter Bell's avatar
Peter Bell committed
526
527
528

class latch
  {
529
530
531
532
    std::atomic<size_t> num_left_;
    std::mutex mut_;
    std::condition_variable completed_;
    using lock_t = std::unique_lock<std::mutex>;
Peter Bell's avatar
Peter Bell committed
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554

  public:
    latch(size_t n): num_left_(n) {}

    void count_down()
      {
      lock_t lock(mut_);
      if (--num_left_)
        return;
      completed_.notify_all();
      }

    void wait()
      {
      lock_t lock(mut_);
      completed_.wait(lock, [this]{ return is_ready(); });
      }
    bool is_ready() { return num_left_ == 0; }
  };

template <typename T> class concurrent_queue
  {
555
556
557
    std::queue<T> q_;
    std::mutex mut_;
    std::condition_variable item_added_;
558
    bool shutdown_;
559
    using lock_t = std::unique_lock<std::mutex>;
Peter Bell's avatar
Peter Bell committed
560
561
562
563
564
565
566
567
568

  public:
    concurrent_queue(): shutdown_(false) {}

    void push(T val)
      {
      {
      lock_t lock(mut_);
      if (shutdown_)
569
        throw std::runtime_error("Item added to queue after shutdown");
Peter Bell's avatar
Peter Bell committed
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
      q_.push(move(val));
      }
      item_added_.notify_one();
      }

    bool pop(T & val)
      {
      lock_t lock(mut_);
      item_added_.wait(lock, [this] { return (!q_.empty() || shutdown_); });
      if (q_.empty())
        return false;  // We are shutting down

      val = std::move(q_.front());
      q_.pop();
      return true;
      }

    void shutdown()
      {
      {
      lock_t lock(mut_);
      shutdown_ = true;
      }
      item_added_.notify_all();
      }
595
596

    void restart() { shutdown_ = false; }
Peter Bell's avatar
Peter Bell committed
597
598
599
600
  };

class thread_pool
  {
601
602
    concurrent_queue<std::function<void()>> work_queue_;
    std::vector<std::thread> threads_;
Peter Bell's avatar
Peter Bell committed
603
604
605

    void worker_main()
      {
606
      std::function<void()> work;
Peter Bell's avatar
Peter Bell committed
607
608
609
610
      while (work_queue_.pop(work))
        work();
      }

611
    void create_threads()
Peter Bell's avatar
Peter Bell committed
612
      {
613
      size_t nthreads = threads_.size();
Peter Bell's avatar
Peter Bell committed
614
615
      for (size_t i=0; i<nthreads; ++i)
        {
616
        try { threads_[i] = std::thread([this]{ worker_main(); }); }
Peter Bell's avatar
Peter Bell committed
617
618
        catch (...)
          {
619
          shutdown();
Peter Bell's avatar
Peter Bell committed
620
621
622
623
624
          throw;
          }
        }
      }

625
626
627
628
629
  public:
    explicit thread_pool(size_t nthreads):
      threads_(nthreads)
      { create_threads(); }

630
    thread_pool(): thread_pool(max_threads) {}
Peter Bell's avatar
Peter Bell committed
631

632
633
    ~thread_pool() { shutdown(); }

634
    void submit(std::function<void()> work)
635
636
637
638
639
      {
      work_queue_.push(move(work));
      }

    void shutdown()
Peter Bell's avatar
Peter Bell committed
640
641
      {
      work_queue_.shutdown();
642
643
644
      for (auto &thread : threads_)
        if (thread.joinable())
          thread.join();
Peter Bell's avatar
Peter Bell committed
645
646
      }

647
    void restart()
Peter Bell's avatar
Peter Bell committed
648
      {
649
650
      work_queue_.restart();
      create_threads();
Martin Reinecke's avatar
Martin Reinecke committed
651
      }
Martin Reinecke's avatar
Martin Reinecke committed
652
  };
653

Peter Bell's avatar
Peter Bell committed
654
655
656
thread_pool & get_pool()
  {
  static thread_pool pool;
657
#ifdef POCKETFFT_PTHREADS
658
659
  static std::once_flag f;
  std::call_once(f,
660
661
662
663
664
665
666
667
668
    []{
    pthread_atfork(
      +[]{ get_pool().shutdown(); },  // prepare
      +[]{ get_pool().restart(); },   // parent
      +[]{ get_pool().restart(); }    // child
      );
    });
#endif

Peter Bell's avatar
Peter Bell committed
669
670
671
672
673
674
675
676
  return pool;
  }

/** Map a function f over nthreads */
template <typename Func>
void thread_map(size_t nthreads, Func f)
  {
  if (nthreads == 0)
677
    nthreads = max_threads;
Peter Bell's avatar
Peter Bell committed
678
679
680
681

  if (nthreads == 1)
    { f(); return; }

682
  auto & pool = get_pool();
Peter Bell's avatar
Peter Bell committed
683
  latch counter(nthreads);
684
685
  std::exception_ptr ex;
  std::mutex ex_mut;
Peter Bell's avatar
Peter Bell committed
686
687
688
  for (size_t i=0; i<nthreads; ++i)
    {
    pool.submit(
689
      [&f, &counter, &ex, &ex_mut, i, nthreads] {
690
691
      thread_id() = i;
      num_threads() = nthreads;
692
693
694
      try { f(); }
      catch (...)
        {
695
696
        std::lock_guard<std::mutex> lock(ex_mut);
        ex = std::current_exception();
697
        }
Peter Bell's avatar
Peter Bell committed
698
699
700
701
      counter.count_down();
      });
    }
  counter.wait();
702
  if (ex)
703
    std::rethrow_exception(ex);
Peter Bell's avatar
Peter Bell committed
704
  }
705
706
707

#endif

Peter Bell's avatar
Peter Bell committed
708
709
}

710
711
712
713
714
715
716
717
718
719
720
721
722
//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

Martin Reinecke's avatar
Martin Reinecke committed
723
    size_t length;
724
    arr<cmplx<T0>> mem;
725
    std::vector<fctdata> fact;
726
727

    void add_factor(size_t factor)
Martin Reinecke's avatar
Martin Reinecke committed
728
      { fact.push_back({factor, nullptr, nullptr}); }
729

Martin Reinecke's avatar
Martin Reinecke committed
730
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
731
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
732
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
733
  {
734
735
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
736
737
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+2*c)]; };
738
739
740
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

741
742
743
744
745
746
747
748
749
750
751
752
753
754
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
755
        special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
756
757
758
759
        }
      }
  }

760
#define POCKETFFT_PREP3(idx) \
761
        T t0 = CC(idx,0,k), t1, t2; \
Martin Reinecke's avatar
Martin Reinecke committed
762
        PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
763
        CH(idx,k,0)=t0+t1;
764
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
765
        { \
Martin Reinecke's avatar
Martin Reinecke committed
766
767
768
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
769
        }
770
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
771
        { \
Martin Reinecke's avatar
Martin Reinecke committed
772
773
774
775
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
776
        }
Martin Reinecke's avatar
Martin Reinecke committed
777
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
778
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
779
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
780
  {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
781
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
782
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
783

784
785
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
786
787
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+3*c)]; };
788
789
790
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

791
792
793
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
794
795
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
796
797
798
799
800
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
801
802
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
803
804
805
      }
      for (size_t i=1; i<ido; ++i)
        {
806
807
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
808
809
810
811
        }
      }
  }

812
813
814
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
815

Martin Reinecke's avatar
Martin Reinecke committed
816
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
817
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
818
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
819
  {
820
821
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
822
823
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+4*c)]; };
824
825
826
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

827
828
829
830
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
Martin Reinecke's avatar
Martin Reinecke committed
831
832
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
833
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
834
835
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
836
837
838
839
840
841
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
Martin Reinecke's avatar
Martin Reinecke committed
842
843
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
844
      ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
845
846
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
847
848
849
      }
      for (size_t i=1; i<ido; ++i)
        {
Martin Reinecke's avatar
Martin Reinecke committed
850
        T t1, t2, t3, t4;
851
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
Martin Reinecke's avatar
Martin Reinecke committed
852
853
        PM(t2,t1,cc0,cc2);
        PM(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
854
        ROTX90<fwd>(t4);
Martin Reinecke's avatar
Martin Reinecke committed
855
856
857
858
        CH(i,k,0) = t2+t3;
        special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
        special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
        special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
859
860
861
862
        }
      }
  }

863
#define POCKETFFT_PREP5(idx) \
864
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
Martin Reinecke's avatar
Martin Reinecke committed
865
866
        PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
867
868
869
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

870
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
871
872
873
874
875
876
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
Martin Reinecke's avatar
Martin Reinecke committed
877
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
878
879
        }

880
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
881
882
883
884
885
886
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
Martin Reinecke's avatar
Martin Reinecke committed
887
888
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
889
        }
Martin Reinecke's avatar
Martin Reinecke committed
890
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
891
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
892
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
893
  {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
894
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
895
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
sync    
Martin Reinecke committed
896
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
897
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
898

899
900
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
901
902
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+5*c)]; };
903
904
905
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

906
907
908
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
909
910
911
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
912
913
914
915
916
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
917
918
919
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
920
921
922
      }
      for (size_t i=1; i<ido; ++i)
        {
923
924
925
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
926
927
928
929
        }
      }
  }

930
931
932
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
933

934
#define POCKETFFT_PREP7(idx) \
935
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
Martin Reinecke's avatar
Martin Reinecke committed
936
937
938
        PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
939
940
941
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

942
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
943
944
945
946
947
948
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
Martin Reinecke's avatar
Martin Reinecke committed
949
        PM(out1,out2,ca,cb); \
950
        }
951
952
953
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
954
955
        { \
        T da,db; \
956
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
957
958
        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
959
960
        }

Martin Reinecke's avatar
Martin Reinecke committed
961
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
962
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
963
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
964
  {
Martin Reinecke's avatar
sync    
Martin Reinecke committed
965
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
966
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
sync    
Martin Reinecke committed
967
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
968
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
sync    
Martin Reinecke committed
969
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
970
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
971

972
973
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
974
975
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+7*c)]; };
976
977
978
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

979
980
981
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
982
983
984
985
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
986
987
988
989
990
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
991
992
993
994
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
995
996
997
      }
      for (size_t i=1; i<ido; ++i)
        {
998
999
1000
1001
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1002
1003
1004
1005
        }
      }
  }

1006
1007
1008
1009
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
1010

Martin Reinecke's avatar
Martin Reinecke committed
1011
template <bool fwd, typename T> void ROTX45(T &a) const
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1012
1013
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
1014
  if (fwd)
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1015
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
1016
1017
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1018
  }
Martin Reinecke's avatar
Martin Reinecke committed
1019
template <bool fwd, typename T> void ROTX135(T &a) const
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1020
1021
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
1022
  if (fwd)
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1023
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
1024
1025
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1026
1027
  }

Martin Reinecke's avatar
Martin Reinecke committed
1028
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
1029
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
Martin Reinecke's avatar
Martin Reinecke committed
1030
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1031
  {
1032
1033
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
1034
1035
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+8*c)]; };
1036
1037
1038
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
sync    
Martin Reinecke committed
1039
1040
1041
1042
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
Martin Reinecke's avatar
Martin Reinecke committed
1043
1044
      PM(a1,a5,CC(0,1,k),CC(0,5,k));
      PM(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1045
      PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
1046
      ROTX90<fwd>(a3);
Martin Reinecke's avatar
Martin Reinecke committed
1047
1048

      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync    
Martin Reinecke committed
1049
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
1050
1051
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
1052

Martin Reinecke's avatar
Martin Reinecke committed
1053
1054
1055
1056
      PM(a0,a4,CC(0,0,k),CC(0,4,k));
      PM(a2,a6,CC(0,2,k),CC(0,6,k));
      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
Martin Reinecke's avatar
Martin Reinecke committed