pocketfft_hdronly.h 107 KB
Newer Older
1
/*
2 3 4
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
5 6 7 8 9 10 11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
38 39 40 41

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

42
#ifndef __cplusplus
43
#error This file is C++ and requires a C++ compiler.
44 45
#endif

Martin Reinecke's avatar
sync  
Martin Reinecke committed
46
#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47
#error This file requires at least C++11 support.
48 49
#endif

50 51 52 53
#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
#endif

54 55 56 57 58 59
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
Martin Reinecke's avatar
sync  
Martin Reinecke committed
60
#include <complex>
61 62
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
Martin Reinecke's avatar
Martin Reinecke committed
63
#include <mutex>
64
#endif
Martin Reinecke's avatar
Martin Reinecke committed
65 66 67 68
#ifdef POCKETFFT_OPENMP
#include <omp.h>
#endif

69

Martin Reinecke's avatar
Martin Reinecke committed
70
#if defined(__GNUC__)
71 72
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
73
#elif defined(_MSC_VER)
74 75
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
76
#else
77 78
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
79 80 81 82 83 84
#endif

namespace pocketfft {

namespace detail {

Martin Reinecke's avatar
Martin Reinecke committed
85 86
using namespace std;

Martin Reinecke's avatar
Martin Reinecke committed
87 88 89 90 91 92
using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>;

constexpr bool FORWARD  = true,
               BACKWARD = false;

Martin Reinecke's avatar
sync  
Martin Reinecke committed
93 94 95 96
// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
97
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
sync  
Martin Reinecke committed
98 99 100 101 102 103 104 105 106 107 108
#elif defined(__clang__)
#if __clang__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
109 110
template<typename T> struct VLEN { static constexpr size_t val=1; };

Martin Reinecke's avatar
Martin Reinecke committed
111 112
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
Martin Reinecke's avatar
Martin Reinecke committed
113 114
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
Martin Reinecke's avatar
Martin Reinecke committed
115
#elif (defined(__AVX__))
Martin Reinecke's avatar
Martin Reinecke committed
116 117
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
Martin Reinecke's avatar
Martin Reinecke committed
118
#elif (defined(__SSE2__))
Martin Reinecke's avatar
Martin Reinecke committed
119 120
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
sync  
Martin Reinecke committed
121
#elif (defined(__VSX__))
Martin Reinecke's avatar
Martin Reinecke committed
122 123
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
Martin Reinecke committed
124 125 126 127 128
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
129
template<typename T> class arr
130 131 132 133 134
  {
  private:
    T *p;
    size_t sz;

Martin Reinecke's avatar
Martin Reinecke committed
135
#if defined(POCKETFFT_NO_VECTORS)
136 137 138 139 140
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
141 142 143 144 145 146 147 148
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
149 150
      void *res = aligned_alloc(64,num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
151 152 153 154
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
155
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
156 157 158
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
159 160 161 162 163 164
      void *ptr = malloc(num*sizeof(T)+64);
      if (!ptr) throw bad_alloc();
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
165 166
      }
    static void dealloc(T *ptr)
167
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
168
#endif
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
205 206 207 208 209 210 211
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
212 213 214 215 216 217 218 219 220 221
  cmplx operator+ (const cmplx &other) const
    { return cmplx(r+other.r, i+other.i); }
  cmplx operator- (const cmplx &other) const
    { return cmplx(r-other.r, i-other.i); }
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
222
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
223 224
    -> cmplx<decltype(r+other.r)>
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
225
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
226 227
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
228 229 230 231 232 233 234 235 236 237
    }
};
template<typename T> void PMC(cmplx<T> &a, cmplx<T> &b,
  const cmplx<T> &c, const cmplx<T> &d)
  { a = c+d; b = c-d; }
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
238 239
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
240 241 242 243 244 245 246 247

//
// twiddle factor section
//

template<typename T> class sincos_2pibyn
  {
  private:
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
248
    using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
249 250
    arr<T> data;

251
    void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res)
252
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
253 254
      if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
255 256 257
        constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L);
        auto s = sin(pi*a_);
        res[1] = s;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
258 259 260
        res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
        return;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
261 262 263
      // adapted from https://stackoverflow.com/questions/42792939/
      // CAUTION: this function only works for arguments in the range
      //          [-0.25; 0.25]!
Martin Reinecke's avatar
sync  
Martin Reinecke committed
264
      double a = double(a_);
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
      double s = a * a;
      /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
      double r =     -1.0369917389758117e-4;
      r = fma (r, s,  1.9294935641298806e-3);
      r = fma (r, s, -2.5806887942825395e-2);
      r = fma (r, s,  2.3533063028328211e-1);
      r = fma (r, s, -1.3352627688538006e+0);
      r = fma (r, s,  4.0587121264167623e+0);
      r = fma (r, s, -4.9348022005446790e+0);
      double c = r*s;
      /* Approximate sin(pi*x) for x in [-0.25,0.25] */
      r =             4.6151442520157035e-4;
      r = fma (r, s, -7.3700183130883555e-3);
      r = fma (r, s,  8.2145868949323936e-2);
      r = fma (r, s, -5.9926452893214921e-1);
      r = fma (r, s,  2.5501640398732688e+0);
      r = fma (r, s, -5.1677127800499516e+0);
      s = s * a;
      r = r * s;
      s = fma (a, 3.1415926535897931e+0, r);
      res[0] = c;
      res[1] = s;
      }

289 290
    POCKETFFT_NOINLINE void calc_first_octant(size_t den,
      T * POCKETFFT_RESTRICT res)
291 292 293 294 295
      {
      size_t n = (den+4)>>3;
      if (n==0) return;
      res[0]=1.; res[1]=0.;
      if (n==1) return;
Martin Reinecke's avatar
Martin Reinecke committed
296
      size_t l1 = size_t(sqrt(n));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
297
      arr<Thigh> tmp(2*l1);
298 299
      for (size_t i=1; i<l1; ++i)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
300 301 302
        my_sincosm1pi(Thigh(2*i)/Thigh(den),&tmp[2*i]);
        res[2*i  ] = T(tmp[2*i]+1);
        res[2*i+1] = T(tmp[2*i+1]);
303 304 305 306
        }
      size_t start=l1;
      while(start<n)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
307
        Thigh cs[2];
Martin Reinecke's avatar
sync  
Martin Reinecke committed
308 309 310
        my_sincosm1pi((Thigh(2*start))/Thigh(den),cs);
        res[2*start] = T(cs[0]+1);
        res[2*start+1] = T(cs[1]);
311 312 313 314
        size_t end = l1;
        if (start+end>n) end = n-start;
        for (size_t i=1; i<end; ++i)
          {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
315
          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
Martin Reinecke's avatar
sync  
Martin Reinecke committed
316 317
          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
318 319 320 321 322
          }
        start += l1;
        }
      }

323
    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
324
      {
325
      T * POCKETFFT_RESTRICT p = res+n;
326 327 328 329 330
      calc_first_octant(n<<1, p);
      size_t ndone=(n+2)>>2;
      size_t i=0, idx1=0, idx2=2*ndone-2;
      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
        {
Martin Reinecke's avatar
Martin Reinecke committed
331 332
        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
333 334
        }
      if (i!=ndone)
Martin Reinecke's avatar
Martin Reinecke committed
335
        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
336 337
      }

338
    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
339
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
340
      int ndone=int(n+1)>>1;
341 342
      T * p = res+n-1;
      calc_first_octant(n<<2, p);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
343
      int i4=0, in=int(n), i=0;
344
      for (; i4<=in-i4; ++i, i4+=4) // octant 0
Martin Reinecke's avatar
Martin Reinecke committed
345
        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
346
      for (; i4-in <= 0; ++i, i4+=4) // octant 1
Martin Reinecke's avatar
sync  
Martin Reinecke committed
347
        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
348
      for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
Martin Reinecke's avatar
sync  
Martin Reinecke committed
349
        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
350
      for (; i<ndone; ++i, i4+=4) // octant 3
Martin Reinecke's avatar
sync  
Martin Reinecke committed
351
        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
352 353
      }

354
    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
355
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
356
      constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
357 358 359 360
      size_t quart = n>>2;
      if ((n&7)==0)
        res[quart] = res[quart+1] = hsqt2;
      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
361
        { res[j] = res[i+1]; res[j+1] = res[i]; }
362 363
      }

364
    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
365 366 367 368
      {
      size_t half = n>>1;
      if ((n&3)==0)
        for (size_t i=0; i<half; i+=2)
Martin Reinecke's avatar
Martin Reinecke committed
369
          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
370 371
      else
        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
372
          { res[j] = -res[i]; res[j+1] = res[i+1]; }
373 374
      }

375
    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
376 377 378 379 380 381
      {
      if ((n&1)==0)
        for (size_t i=0; i<n; ++i)
          res[i+n] = -res[i];
      else
        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
382
          { res[j] = res[i]; res[j+1] = -res[i+1]; }
383 384
      }

385
    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
      {
      if ((n&3)==0)
        {
        calc_first_octant(n, res);
        fill_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else if ((n&1)==0)
        {
        calc_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else
        calc_first_half(n, res);
      }

  public:
403
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
404 405 406 407 408 409 410 411 412 413 414 415
      : data(2*n)
      {
      sincos_2pibyn_half(n, data.data());
      if (!half) fill_second_half(n, data.data());
      }

    T operator[](size_t idx) const { return data[idx]; }
    const T *rdata() const { return data; }
    const cmplx<T> *cdata() const
      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
  };

Martin Reinecke's avatar
Martin Reinecke committed
416
struct util // hack to avoid duplicate symbols
417
  {
418
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
419
    {
Martin Reinecke's avatar
Martin Reinecke committed
420 421 422
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
423 424 425
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
426 427 428 429
    if (n>1) res=n;
    return res;
    }

430
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
431
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
432
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
Martin Reinecke's avatar
Martin Reinecke committed
433 434 435 436
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
437 438 439 440 441 442
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
443 444
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
445 446
    }

Martin Reinecke's avatar
Martin Reinecke committed
447
  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
448
  static POCKETFFT_NOINLINE size_t good_size(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
449 450 451 452 453 454 455 456 457 458 459 460
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
    for (size_t f2=1; f2<bestfac; f2*=2)
      for (size_t f23=f2; f23<bestfac; f23*=3)
        for (size_t f235=f23; f235<bestfac; f235*=5)
          for (size_t f2357=f235; f2357<bestfac; f2357*=7)
            for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
              if (f235711>=n) bestfac=f235711;
    return bestfac;
    }
461

Martin Reinecke's avatar
Martin Reinecke committed
462 463 464 465 466 467 468
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }
Martin Reinecke's avatar
Martin Reinecke committed
469

470
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
471 472 473 474 475 476
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
    if (ndim<1) throw runtime_error("ndim must be >= 1");
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
      throw runtime_error("stride dimension mismatch");
Martin Reinecke's avatar
sync  
Martin Reinecke committed
477 478
    if (inplace && (stride_in!=stride_out))
      throw runtime_error("stride mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
479 480
    }

481
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
482 483 484 485 486 487 488 489
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
490 491
      if (ax>=ndim) throw invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
492 493 494
      }
    }

495
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
496 497 498 499
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
500
    if (axis>=shape.size()) throw invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
501
    }
Martin Reinecke's avatar
Martin Reinecke committed
502 503

#ifdef POCKETFFT_OPENMP
Martin Reinecke's avatar
sync  
Martin Reinecke committed
504 505
    static size_t nthreads() { return size_t(omp_get_num_threads()); }
    static size_t thread_num() { return size_t(omp_get_thread_num()); }
Martin Reinecke's avatar
Martin Reinecke committed
506 507 508 509
    static size_t thread_count (size_t nthreads, const shape_t &shape,
      size_t axis)
      {
      if (nthreads==1) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
510
      if (prod(shape) < 20*shape[axis]) return 1;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
511
      return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads;
Martin Reinecke's avatar
Martin Reinecke committed
512
      }
Martin Reinecke's avatar
Martin Reinecke committed
513
#else
Martin Reinecke's avatar
Martin Reinecke committed
514 515
    static constexpr size_t nthreads() { return 1; }
    static constexpr size_t thread_num() { return 0; }
Martin Reinecke's avatar
Martin Reinecke committed
516
#endif
Martin Reinecke's avatar
Martin Reinecke committed
517
  };
518 519 520 521 522 523 524 525 526 527 528 529 530 531

//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

Martin Reinecke's avatar
Martin Reinecke committed
532
    size_t length;
533
    arr<cmplx<T0>> mem;
Martin Reinecke's avatar
Martin Reinecke committed
534
    vector<fctdata> fact;
535 536

    void add_factor(size_t factor)
Martin Reinecke's avatar
Martin Reinecke committed
537
      { fact.push_back({factor, nullptr, nullptr}); }
538

Martin Reinecke's avatar
Martin Reinecke committed
539
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
540 541
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
542 543 544
  {
  constexpr size_t cdim=2;

545 546
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
547
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
548 549 550 551
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

552 553 554 555 556 557 558 559 560 561 562 563 564 565
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
566
        CH(i,k,1) = (CC(i,0,k)-CC(i,1,k)).template special_mul<fwd>(WA(0,i));
567 568 569 570
        }
      }
  }

571
#define POCKETFFT_PREP3(idx) \
572 573 574
        T t0 = CC(idx,0,k), t1, t2; \
        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
        CH(idx,k,0)=t0+t1;
575
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
576 577 578 579 580 581
        { \
        T ca,cb; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
        }
582
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
583 584 585 586 587
        { \
        T ca,cb,da,db; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
588 589
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
590
        }
Martin Reinecke's avatar
Martin Reinecke committed
591
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
592 593
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
594 595
  {
  constexpr size_t cdim=3;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
596
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
597
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
598

599 600
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
601
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
602 603 604 605
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

606 607 608
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
609 610
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
611 612 613 614 615
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
616 617
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
618 619 620
      }
      for (size_t i=1; i<ido; ++i)
        {
621 622
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
623 624 625 626
        }
      }
  }

627 628 629
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
630

Martin Reinecke's avatar
Martin Reinecke committed
631
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
632 633
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
634 635 636
  {
  constexpr size_t cdim=4;

637 638
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
639
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
640 641 642 643
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

644 645 646 647 648 649
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
650
      ROTX90<fwd>(t4);
651 652 653 654 655 656 657 658 659 660
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
661
      ROTX90<fwd>(t4);
662 663 664 665 666 667 668 669 670
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
      for (size_t i=1; i<ido; ++i)
        {
        T c2, c3, c4, t1, t2, t3, t4;
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
        PMC(t2,t1,cc0,cc2);
        PMC(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
671
        ROTX90<fwd>(t4);
672 673
        PMC(CH(i,k,0),c3,t2,t3);
        PMC(c2,c4,t1,t4);
Martin Reinecke's avatar
Martin Reinecke committed
674 675 676
        CH(i,k,1) = c2.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = c3.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = c4.template special_mul<fwd>(WA(2,i));
677 678 679 680
        }
      }
  }

681
#define POCKETFFT_PREP5(idx) \
682 683 684 685 686 687
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

688
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
689 690 691 692 693 694 695 696 697
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \
        }

698
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
699 700 701 702 703 704 705
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
706 707
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
708
        }
Martin Reinecke's avatar
Martin Reinecke committed
709
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
710 711
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
712 713
  {
  constexpr size_t cdim=5;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
714
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
715
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
716
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
717
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
718

719 720
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
721
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
722 723 724 725
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

726 727 728
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
729 730 731
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
732 733 734 735 736
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
737 738 739
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
740 741 742
      }
      for (size_t i=1; i<ido; ++i)
        {
743 744 745
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
746 747 748 749
        }
      }
  }

750 751 752
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
753

754
#define POCKETFFT_PREP7(idx) \
755 756 757 758 759 760 761
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

762
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
763 764 765 766 767 768 769 770
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
        PMC(out1,out2,ca,cb); \
        }
771 772 773
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
774 775
        { \
        T da,db; \
776
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
777 778
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
779 780
        }

Martin Reinecke's avatar
Martin Reinecke committed
781
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
782 783
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
784 785
  {
  constexpr size_t cdim=7;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
786
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
787
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
788
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
789
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
790
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
791
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
792

793 794
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
795
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
796 797 798 799
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

800 801 802
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
803 804 805 806
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
807 808 809 810 811
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
812 813 814 815
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
816 817 818
      }
      for (size_t i=1; i<ido; ++i)
        {
819 820 821 822
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
823 824 825 826
        }
      }
  }

827 828 829 830
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
831

Martin Reinecke's avatar
Martin Reinecke committed
832
template <bool fwd, typename T> void ROTX45(T &a)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
833 834
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
835
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
836
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
837 838
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
839
  }
Martin Reinecke's avatar
Martin Reinecke committed
840
template <bool fwd, typename T> void ROTX135(T &a)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
841 842
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
843
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
844
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
845 846
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
847 848 849 850
  }
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a.r+=b.r; a.i+=b.i; b.r=t.r-b.r; b.i=t.i-b.i; }

Martin Reinecke's avatar
Martin Reinecke committed
851
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
852 853
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
854 855 856
  {
  constexpr size_t cdim=8;

857 858
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
859
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
860 861 862 863
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
sync  
Martin Reinecke committed
864 865 866 867 868 869 870 871
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
872 873
      ROTX90<fwd>(a6);
      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
874 875 876 877
      PMINPLACE(a0,a2);
      PMINPLACE(a1,a3);
      PMINPLACE(a4,a6);
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
878 879 880
      ROTX45<fwd>(a5);
      ROTX90<fwd>(a3);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
881 882 883 884 885 886 887 888 889 890 891 892 893
      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
      PMC(CH(0,k,3),CH(0,k,7),a6,a7);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
894 895
      ROTX90<fwd>(a6);
      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
896 897 898 899
      PMINPLACE(a0,a2);
      PMINPLACE(a1,a3);
      PMINPLACE(a4,a6);
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
900 901 902
      ROTX45<fwd>(a5);
      ROTX90<fwd>(a3);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
903 904 905 906 907 908 909 910 911 912 913 914
      PMC(CH(0,k,0),CH(0,k,4),a0,a1);
      PMC(CH(0,k,1),CH(0,k,5),a4,a5);
      PMC(CH(0,k,2),CH(0,k,6),a2,a3);
      PMC(CH(0,k,3),CH(0,k,7),a6,a7);

      for (size_t i=1; i<ido; ++i)
        {
        T a0, a1, a2, a3, a4, a5, a6, a7;
        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
        PMC(a1,a5,CC(i,1,k),CC(i,5,k));
        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
        PMC(a3,a7,CC(i,3,k),CC(i,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
915 916
        ROTX90<fwd>(a6);
        ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
917 918 919 920
        PMINPLACE(a0,a2);
        PMINPLACE(a1,a3);
        PMINPLACE(a4,a6);
        PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
921 922 923
        ROTX45<fwd>(a5);
        ROTX90<fwd>(a3);
        ROTX135<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
924 925 926 927 928
        PMINPLACE(a0,a1);
        PMINPLACE(a2,a3);
        PMINPLACE(a4,a5);
        PMINPLACE(a6,a7);
        CH(i,k,0) = a0;
Martin Reinecke's avatar
Martin Reinecke committed
929 930 931 932 933 934 935
        CH(i,k,1) = a4.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = a2.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = a6.template special_mul<fwd>(WA(2,i));
        CH(i,k,4) = a1.template special_mul<fwd>(WA(3,i));
        CH(i,k,5) = a5.template special_mul<fwd>(WA(4,i));
        CH(i,k,6) = a3.template special_mul<fwd>(WA(5,i));
        CH(i,k,7) = a7.template special_mul<fwd>(WA(6,i));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
936 937 938 939 940
        }
      }
   }


941
#define POCKETFFT_PREP11(idx) \
942 943 944 945 946 947 948 949 950
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;

951
#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
952 953 954 955 956 957 958
        { \
        T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
          cb; \
        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
        PMC(out1,out2,ca,cb); \
        }
959 960 961
#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
962 963
        { \
        T da,db; \
964
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
965 966
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
967 968
        }

Martin Reinecke's avatar
Martin Reinecke committed
969
template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
970 971
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
972
  {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
973 974
  constexpr size_t cdim=11;
  constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
Martin Reinecke's avatar
Martin Reinecke committed
975
               tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
976
               tw2r= T0(0.4154150130018864255292741492296232L),
Martin Reinecke's avatar
Martin Reinecke committed
977
               tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
978
               tw3r= T0(-0.1423148382732851404437926686163697L),
Martin Reinecke's avatar
Martin Reinecke committed
979
               tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
980
               tw4r= T0(-0.6548607339452850640569250724662936L),
Martin Reinecke's avatar
Martin Reinecke committed
981
               tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
982
               tw5r= T0(-0.9594929736144973898903680570663277L),
Martin Reinecke's avatar
Martin Reinecke committed
983
               tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
984

985 986
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
987
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
988 989 990 991
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

992 993 994
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
995
      POCKETFFT_PREP11(0)
996 997 998 999 1000
      POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
      POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
      POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
      POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1001 1002 1003 1004 1005
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
1006
      POCKETFFT_PREP11(0)
1007 1008 1009 1010 1011
      POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
      POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
      POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,