pocketfft_hdronly.h 105 KB
Newer Older
1
/*
2 3 4
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
5 6 7 8 9 10 11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
38 39 40 41

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

42
#ifndef __cplusplus
43
#error This file is C++ and requires a C++ compiler.
44 45
#endif

Martin Reinecke's avatar
sync  
Martin Reinecke committed
46
#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47
#error This file requires at least C++11 support.
48 49
#endif

50 51 52 53
#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
#endif

54 55 56 57 58 59
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
Martin Reinecke's avatar
sync  
Martin Reinecke committed
60
#include <complex>
61 62
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
Martin Reinecke's avatar
Martin Reinecke committed
63
#include <mutex>
64
#endif
Martin Reinecke's avatar
Martin Reinecke committed
65 66 67 68
#ifdef POCKETFFT_OPENMP
#include <omp.h>
#endif

69

Martin Reinecke's avatar
Martin Reinecke committed
70
#if defined(__GNUC__)
71 72
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
73
#elif defined(_MSC_VER)
74 75
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
76
#else
77 78
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
79 80 81 82 83 84
#endif

namespace pocketfft {

namespace detail {

Martin Reinecke's avatar
Martin Reinecke committed
85 86
using namespace std;

Martin Reinecke's avatar
Martin Reinecke committed
87 88 89 90 91 92
using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>;

constexpr bool FORWARD  = true,
               BACKWARD = false;

Martin Reinecke's avatar
sync  
Martin Reinecke committed
93 94 95 96
// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
97
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
sync  
Martin Reinecke committed
98 99 100 101 102 103 104 105 106 107 108
#elif defined(__clang__)
#if __clang__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
109 110
template<typename T> struct VLEN { static constexpr size_t val=1; };

Martin Reinecke's avatar
Martin Reinecke committed
111 112
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
Martin Reinecke's avatar
Martin Reinecke committed
113 114
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
Martin Reinecke's avatar
Martin Reinecke committed
115
#elif (defined(__AVX__))
Martin Reinecke's avatar
Martin Reinecke committed
116 117
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
Martin Reinecke's avatar
Martin Reinecke committed
118
#elif (defined(__SSE2__))
Martin Reinecke's avatar
Martin Reinecke committed
119 120
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
sync  
Martin Reinecke committed
121
#elif (defined(__VSX__))
Martin Reinecke's avatar
Martin Reinecke committed
122 123
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
Martin Reinecke committed
124 125 126 127 128
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
129
template<typename T> class arr
130 131 132 133 134
  {
  private:
    T *p;
    size_t sz;

Martin Reinecke's avatar
Martin Reinecke committed
135
#if defined(POCKETFFT_NO_VECTORS)
136 137 138 139 140
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
141 142 143 144 145 146 147 148
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
149 150
      void *res = aligned_alloc(64,num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
151 152 153 154
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
155
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
156 157 158
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
159 160 161 162 163 164
      void *ptr = malloc(num*sizeof(T)+64);
      if (!ptr) throw bad_alloc();
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
165 166
      }
    static void dealloc(T *ptr)
167
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
168
#endif
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
205 206 207 208 209 210 211
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
Martin Reinecke's avatar
Martin Reinecke committed
212 213 214 215
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    { r-=other.r; i-=other.i; return *this; }
216 217 218
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
Martin Reinecke's avatar
Martin Reinecke committed
219 220 221 222 223 224
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r+other.r, i+other.i}; }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r-other.r, i-other.i}; }
225 226 227
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
228
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
229 230
    -> cmplx<decltype(r+other.r)>
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
231
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
232 233
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
234 235
    }
};
Martin Reinecke's avatar
Martin Reinecke committed
236 237
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a+=b; b=t-b; }
238 239
template<typename T> inline void MPINPLACE(T &a, T &b)
  { T t = a; a-=b; b=t+b; }
Martin Reinecke's avatar
Martin Reinecke committed
240
template<typename T> void PMC(T &a, T &b, const T &c, const T &d)
241 242 243 244 245 246
  { a = c+d; b = c-d; }
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
247 248
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
249 250 251 252 253 254 255 256

//
// twiddle factor section
//

template<typename T> class sincos_2pibyn
  {
  private:
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
257
    using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
258 259
    arr<T> data;

260
    void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res)
261
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
262 263
      if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
264 265 266
        constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L);
        auto s = sin(pi*a_);
        res[1] = s;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
267 268 269
        res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
        return;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
270 271 272
      // adapted from https://stackoverflow.com/questions/42792939/
      // CAUTION: this function only works for arguments in the range
      //          [-0.25; 0.25]!
Martin Reinecke's avatar
sync  
Martin Reinecke committed
273
      double a = double(a_);
274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
      double s = a * a;
      /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
      double r =     -1.0369917389758117e-4;
      r = fma (r, s,  1.9294935641298806e-3);
      r = fma (r, s, -2.5806887942825395e-2);
      r = fma (r, s,  2.3533063028328211e-1);
      r = fma (r, s, -1.3352627688538006e+0);
      r = fma (r, s,  4.0587121264167623e+0);
      r = fma (r, s, -4.9348022005446790e+0);
      double c = r*s;
      /* Approximate sin(pi*x) for x in [-0.25,0.25] */
      r =             4.6151442520157035e-4;
      r = fma (r, s, -7.3700183130883555e-3);
      r = fma (r, s,  8.2145868949323936e-2);
      r = fma (r, s, -5.9926452893214921e-1);
      r = fma (r, s,  2.5501640398732688e+0);
      r = fma (r, s, -5.1677127800499516e+0);
      s = s * a;
      r = r * s;
      s = fma (a, 3.1415926535897931e+0, r);
      res[0] = c;
      res[1] = s;
      }

298 299
    POCKETFFT_NOINLINE void calc_first_octant(size_t den,
      T * POCKETFFT_RESTRICT res)
300 301 302 303 304
      {
      size_t n = (den+4)>>3;
      if (n==0) return;
      res[0]=1.; res[1]=0.;
      if (n==1) return;
Martin Reinecke's avatar
Martin Reinecke committed
305
      size_t l1 = size_t(sqrt(n));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
306
      arr<Thigh> tmp(2*l1);
307 308
      for (size_t i=1; i<l1; ++i)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
309 310 311
        my_sincosm1pi(Thigh(2*i)/Thigh(den),&tmp[2*i]);
        res[2*i  ] = T(tmp[2*i]+1);
        res[2*i+1] = T(tmp[2*i+1]);
312 313 314 315
        }
      size_t start=l1;
      while(start<n)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
316
        Thigh cs[2];
Martin Reinecke's avatar
sync  
Martin Reinecke committed
317 318 319
        my_sincosm1pi((Thigh(2*start))/Thigh(den),cs);
        res[2*start] = T(cs[0]+1);
        res[2*start+1] = T(cs[1]);
320 321 322 323
        size_t end = l1;
        if (start+end>n) end = n-start;
        for (size_t i=1; i<end; ++i)
          {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
324
          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
Martin Reinecke's avatar
sync  
Martin Reinecke committed
325 326
          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
327 328 329 330 331
          }
        start += l1;
        }
      }

332
    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
333
      {
334
      T * POCKETFFT_RESTRICT p = res+n;
335 336 337 338 339
      calc_first_octant(n<<1, p);
      size_t ndone=(n+2)>>2;
      size_t i=0, idx1=0, idx2=2*ndone-2;
      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
        {
Martin Reinecke's avatar
Martin Reinecke committed
340 341
        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
342 343
        }
      if (i!=ndone)
Martin Reinecke's avatar
Martin Reinecke committed
344
        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
345 346
      }

347
    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
348
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
349
      int ndone=int(n+1)>>1;
350 351
      T * p = res+n-1;
      calc_first_octant(n<<2, p);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
352
      int i4=0, in=int(n), i=0;
353
      for (; i4<=in-i4; ++i, i4+=4) // octant 0
Martin Reinecke's avatar
Martin Reinecke committed
354
        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
355
      for (; i4-in <= 0; ++i, i4+=4) // octant 1
Martin Reinecke's avatar
sync  
Martin Reinecke committed
356
        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
357
      for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
Martin Reinecke's avatar
sync  
Martin Reinecke committed
358
        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
359
      for (; i<ndone; ++i, i4+=4) // octant 3
Martin Reinecke's avatar
sync  
Martin Reinecke committed
360
        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
361 362
      }

363
    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
364
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
365
      constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
366 367 368 369
      size_t quart = n>>2;
      if ((n&7)==0)
        res[quart] = res[quart+1] = hsqt2;
      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
370
        { res[j] = res[i+1]; res[j+1] = res[i]; }
371 372
      }

373
    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
374 375 376 377
      {
      size_t half = n>>1;
      if ((n&3)==0)
        for (size_t i=0; i<half; i+=2)
Martin Reinecke's avatar
Martin Reinecke committed
378
          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
379 380
      else
        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
381
          { res[j] = -res[i]; res[j+1] = res[i+1]; }
382 383
      }

384
    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
385 386 387 388 389 390
      {
      if ((n&1)==0)
        for (size_t i=0; i<n; ++i)
          res[i+n] = -res[i];
      else
        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
391
          { res[j] = res[i]; res[j+1] = -res[i+1]; }
392 393
      }

394
    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
      {
      if ((n&3)==0)
        {
        calc_first_octant(n, res);
        fill_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else if ((n&1)==0)
        {
        calc_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else
        calc_first_half(n, res);
      }

  public:
412
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
413 414 415 416 417 418 419 420 421 422 423 424
      : data(2*n)
      {
      sincos_2pibyn_half(n, data.data());
      if (!half) fill_second_half(n, data.data());
      }

    T operator[](size_t idx) const { return data[idx]; }
    const T *rdata() const { return data; }
    const cmplx<T> *cdata() const
      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
  };

Martin Reinecke's avatar
Martin Reinecke committed
425
struct util // hack to avoid duplicate symbols
426
  {
427
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
428
    {
Martin Reinecke's avatar
Martin Reinecke committed
429 430 431
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
432 433 434
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
435 436 437 438
    if (n>1) res=n;
    return res;
    }

439
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
440
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
441
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
Martin Reinecke's avatar
Martin Reinecke committed
442 443 444 445
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
446 447 448 449 450 451
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
452 453
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
454 455
    }

Martin Reinecke's avatar
Martin Reinecke committed
456
  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
457
  static POCKETFFT_NOINLINE size_t good_size(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
458 459 460 461 462 463 464 465 466 467 468 469
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
    for (size_t f2=1; f2<bestfac; f2*=2)
      for (size_t f23=f2; f23<bestfac; f23*=3)
        for (size_t f235=f23; f235<bestfac; f235*=5)
          for (size_t f2357=f235; f2357<bestfac; f2357*=7)
            for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
              if (f235711>=n) bestfac=f235711;
    return bestfac;
    }
470

Martin Reinecke's avatar
Martin Reinecke committed
471 472 473 474 475 476 477
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }
Martin Reinecke's avatar
Martin Reinecke committed
478

479
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
480 481 482 483 484 485
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
    if (ndim<1) throw runtime_error("ndim must be >= 1");
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
      throw runtime_error("stride dimension mismatch");
Martin Reinecke's avatar
sync  
Martin Reinecke committed
486 487
    if (inplace && (stride_in!=stride_out))
      throw runtime_error("stride mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
488 489
    }

490
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
491 492 493 494 495 496 497 498
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
499 500
      if (ax>=ndim) throw invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
501 502 503
      }
    }

504
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
505 506 507 508
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
509
    if (axis>=shape.size()) throw invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
510
    }
Martin Reinecke's avatar
Martin Reinecke committed
511 512

#ifdef POCKETFFT_OPENMP
Martin Reinecke's avatar
sync  
Martin Reinecke committed
513 514
    static size_t nthreads() { return size_t(omp_get_num_threads()); }
    static size_t thread_num() { return size_t(omp_get_thread_num()); }
Martin Reinecke's avatar
Martin Reinecke committed
515 516 517 518
    static size_t thread_count (size_t nthreads, const shape_t &shape,
      size_t axis)
      {
      if (nthreads==1) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
519
      if (prod(shape) < 20*shape[axis]) return 1;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
520
      return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads;
Martin Reinecke's avatar
Martin Reinecke committed
521
      }
Martin Reinecke's avatar
Martin Reinecke committed
522
#else
Martin Reinecke's avatar
Martin Reinecke committed
523 524
    static constexpr size_t nthreads() { return 1; }
    static constexpr size_t thread_num() { return 0; }
Martin Reinecke's avatar
Martin Reinecke committed
525
#endif
Martin Reinecke's avatar
Martin Reinecke committed
526
  };
527 528 529 530 531 532 533 534 535 536 537 538 539 540

//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

Martin Reinecke's avatar
Martin Reinecke committed
541
    size_t length;
542
    arr<cmplx<T0>> mem;
Martin Reinecke's avatar
Martin Reinecke committed
543
    vector<fctdata> fact;
544 545

    void add_factor(size_t factor)
Martin Reinecke's avatar
Martin Reinecke committed
546
      { fact.push_back({factor, nullptr, nullptr}); }
547

Martin Reinecke's avatar
Martin Reinecke committed
548
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
549 550
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
551 552 553
  {
  constexpr size_t cdim=2;

554 555
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
556
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
557 558 559 560
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

561 562 563 564 565 566 567 568 569 570 571 572 573 574
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
575
        CH(i,k,1) = (CC(i,0,k)-CC(i,1,k)).template special_mul<fwd>(WA(0,i));
576 577 578 579
        }
      }
  }

580
#define POCKETFFT_PREP3(idx) \
581 582 583
        T t0 = CC(idx,0,k), t1, t2; \
        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
        CH(idx,k,0)=t0+t1;
584
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
585 586 587 588 589 590
        { \
        T ca,cb; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
        }
591
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
592 593 594 595 596
        { \
        T ca,cb,da,db; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
597 598
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
599
        }
Martin Reinecke's avatar
Martin Reinecke committed
600
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
601 602
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
603 604
  {
  constexpr size_t cdim=3;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
605
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
606
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
607

608 609
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
610
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
611 612 613 614
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

615 616 617
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
618 619
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
620 621 622 623 624
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
625 626
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
627 628 629
      }
      for (size_t i=1; i<ido; ++i)
        {
630 631
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
632 633 634 635
        }
      }
  }

636 637 638
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
639

Martin Reinecke's avatar
Martin Reinecke committed
640
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
641 642
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
643 644 645
  {
  constexpr size_t cdim=4;

646 647
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
648
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
649 650 651 652
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

653 654 655 656 657 658
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
659
      ROTX90<fwd>(t4);
660 661 662 663 664 665 666 667 668 669
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
670
      ROTX90<fwd>(t4);
671 672 673 674 675 676 677 678 679
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
      for (size_t i=1; i<ido; ++i)
        {
        T c2, c3, c4, t1, t2, t3, t4;
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
        PMC(t2,t1,cc0,cc2);
        PMC(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
680
        ROTX90<fwd>(t4);
681 682
        PMC(CH(i,k,0),c3,t2,t3);
        PMC(c2,c4,t1,t4);
Martin Reinecke's avatar
Martin Reinecke committed
683 684 685
        CH(i,k,1) = c2.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = c3.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = c4.template special_mul<fwd>(WA(2,i));
686 687 688 689
        }
      }
  }

690
#define POCKETFFT_PREP5(idx) \
691 692 693 694 695 696
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

697
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
698 699 700 701 702 703 704 705 706
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \
        }

707
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
708 709 710 711 712 713 714
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
715 716
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
717
        }
Martin Reinecke's avatar
Martin Reinecke committed
718
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
719 720
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
721 722
  {
  constexpr size_t cdim=5;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
723
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
724
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
725
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
726
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
727

728 729
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
730
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
731 732 733 734
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

735 736 737
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
738 739 740
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
741 742 743 744 745
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
746 747 748
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
749 750 751
      }
      for (size_t i=1; i<ido; ++i)
        {
752 753 754
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
755 756 757 758
        }
      }
  }

759 760 761
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
762

763
#define POCKETFFT_PREP7(idx) \
764 765 766 767 768 769 770
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

771
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
772 773 774 775 776 777 778 779
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
        PMC(out1,out2,ca,cb); \
        }
780 781 782
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
783 784
        { \
        T da,db; \
785
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
786 787
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
788 789
        }

Martin Reinecke's avatar
Martin Reinecke committed
790
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
791 792
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
793 794
  {
  constexpr size_t cdim=7;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
795
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
796
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
797
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
798
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
799
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
800
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
801

802 803
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
804
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
805 806 807 808
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

809 810 811
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
812 813 814 815
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
816 817 818 819 820
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
821 822 823 824
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
825 826 827
      }
      for (size_t i=1; i<ido; ++i)
        {
828 829 830 831
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
832 833 834 835
        }
      }
  }

836 837 838 839
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
840

Martin Reinecke's avatar
Martin Reinecke committed
841
template <bool fwd, typename T> void ROTX45(T &a)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
842 843
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
844
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
845
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
846 847
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
848
  }
Martin Reinecke's avatar
Martin Reinecke committed
849
template <bool fwd, typename T> void ROTX135(T &a)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
850 851
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
852
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
853
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
854 855
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
856 857
  }

Martin Reinecke's avatar
Martin Reinecke committed
858
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
859 860
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
861 862 863
  {
  constexpr size_t cdim=8;

864 865
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
866
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
867 868 869 870
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
sync  
Martin Reinecke committed
871 872 873 874 875 876 877 878
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
879 880
      ROTX90<fwd>(a6);
      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
881 882 883 884
      PMINPLACE(a0,a2);
      PMINPLACE(a1,a3);
      PMINPLACE(a4,a6);
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
885 886 887
      ROTX45<fwd>(a5);
      ROTX90<fwd>(a3);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
888 889 890 891 892 893 894 895 896 897 898 899 900
      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
      PMC(CH(0,k,3),CH(0,k,7),a6,a7);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
901 902
      ROTX90<fwd>(a6);
      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
903 904 905 906
      PMINPLACE(a0,a2);
      PMINPLACE(a1,a3);
      PMINPLACE(a4,a6);
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
907 908 909
      ROTX45<fwd>(a5);
      ROTX90<fwd>(a3);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
910 911 912 913 914 915 916 917 918 919 920 921
      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
      PMC(CH(0,k,3),CH(0,k,7),a6,a7);

      for (size_t i=1; i<ido; ++i)
        {
        T a0, a1, a2, a3, a4, a5, a6, a7;
        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
        PMC(a1,a5,CC(i,1,k),CC(i,5,k));
        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
        PMC(a3,a7,CC(i,3,k),CC(i,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
922 923
        ROTX90<fwd>(a6);
        ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
924 925 926 927
        PMINPLACE(a0,a2);
        PMINPLACE(a1,a3);
        PMINPLACE(a4,a6);
        PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
928 929 930
        ROTX45<fwd>(a5);
        ROTX90<fwd>(a3);
        ROTX135<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
931 932 933 934 935
        PMINPLACE(a0,a1);
        PMINPLACE(a2,a3);
        PMINPLACE(a4,a5);
        PMINPLACE(a6,a7);
        CH(i,k,0) = a0;
Martin Reinecke's avatar
Martin Reinecke committed
936 937 938 939 940 941 942
        CH(i,k,1) = a4.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i));
        CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i));
        CH(i,k,5) = a5.template special_mul<fwd>(WA(4,i));
        CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i));
        CH(i,k,7) = a7.template special_mul<fwd>(WA(6,i));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
943 944 945 946 947
        }
      }
   }


948
#define POCKETFFT_PREP11(idx) \
949 950 951 952 953 954 955 956 957
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;

958
#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
959 960 961 962 963 964 965
        { \
        T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
          cb; \
        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
        PMC(out1,out2,ca,cb); \
        }
966 967 968
#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
969 970
        { \
        T da,db; \
971
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
972 973
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
974 975
        }

Martin Reinecke's avatar
Martin Reinecke committed
976
template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
977 978
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
979
  {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
980 981
  constexpr size_t cdim=11;
  constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
Martin Reinecke's avatar
Martin Reinecke committed
982
               tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
983
               tw2r= T0(0.4154150130018864255292741492296232L),
Martin Reinecke's avatar
Martin Reinecke committed
984
               tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
985
               tw3r= T0(-0.1423148382732851404437926686163697L),
Martin Reinecke's avatar
Martin Reinecke committed
986
               tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
987
               tw4r= T0(-0.6548607339452850640569250724662936L),
Martin Reinecke's avatar
Martin Reinecke committed
988
               tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
989
               tw5r= T0(-0.9594929736144973898903680570663277L),
Martin Reinecke's avatar
Martin Reinecke committed
990
               tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
991

992 993
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
994
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
995 996 997 998
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

999 1000 1001
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
1002
      POCKETFFT_PREP11(0)
1003 1004 1005 1006 1007
      POCKETFFT_PARTSTEP11a(1,10,tw1r,