pocketfft_hdronly.h 105 KB
Newer Older
1
/*
2 3 4
This file is part of pocketfft.

Copyright (C) 2010-2019 Max-Planck-Society
5 6 7 8 9 10 11
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
38 39 40 41

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

42
#ifndef __cplusplus
43
#error This file is C++ and requires a C++ compiler.
44 45
#endif

Martin Reinecke's avatar
sync  
Martin Reinecke committed
46
#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
47
#error This file requires at least C++11 support.
48 49
#endif

50 51 52 53
#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 16
#endif

54 55 56 57 58 59
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
Martin Reinecke's avatar
sync  
Martin Reinecke committed
60
#include <complex>
61 62
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
Martin Reinecke's avatar
Martin Reinecke committed
63
#include <mutex>
64
#endif
Martin Reinecke's avatar
Martin Reinecke committed
65 66 67 68
#ifdef POCKETFFT_OPENMP
#include <omp.h>
#endif

69

Martin Reinecke's avatar
Martin Reinecke committed
70
#if defined(__GNUC__)
71 72
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
Martin Reinecke's avatar
Martin Reinecke committed
73
#elif defined(_MSC_VER)
74 75
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
76
#else
77 78
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
79 80 81 82 83 84
#endif

namespace pocketfft {

namespace detail {

Martin Reinecke's avatar
Martin Reinecke committed
85 86
using namespace std;

Martin Reinecke's avatar
Martin Reinecke committed
87 88 89 90 91 92
using shape_t = vector<size_t>;
using stride_t = vector<ptrdiff_t>;

constexpr bool FORWARD  = true,
               BACKWARD = false;

Martin Reinecke's avatar
sync  
Martin Reinecke committed
93 94 95 96
// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
97
// do nothing. This is necessary because this compiler also sets __GNUC__.
Martin Reinecke's avatar
sync  
Martin Reinecke committed
98 99 100 101 102 103 104 105 106 107 108
#elif defined(__clang__)
#if __clang__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
109 110
template<typename T> struct VLEN { static constexpr size_t val=1; };

Martin Reinecke's avatar
Martin Reinecke committed
111 112
#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
Martin Reinecke's avatar
Martin Reinecke committed
113 114
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
Martin Reinecke's avatar
Martin Reinecke committed
115
#elif (defined(__AVX__))
Martin Reinecke's avatar
Martin Reinecke committed
116 117
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
Martin Reinecke's avatar
Martin Reinecke committed
118
#elif (defined(__SSE2__))
Martin Reinecke's avatar
Martin Reinecke committed
119 120
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
sync  
Martin Reinecke committed
121
#elif (defined(__VSX__))
Martin Reinecke's avatar
Martin Reinecke committed
122 123
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
Martin Reinecke's avatar
Martin Reinecke committed
124 125 126 127 128
#else
#define POCKETFFT_NO_VECTORS
#endif
#endif

Martin Reinecke's avatar
Martin Reinecke committed
129
template<typename T> class arr
130 131 132 133 134
  {
  private:
    T *p;
    size_t sz;

Martin Reinecke's avatar
Martin Reinecke committed
135
#if defined(POCKETFFT_NO_VECTORS)
136 137 138 139 140
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
141 142 143 144 145 146 147 148
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#elif __cplusplus >= 201703L
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
149 150
      void *res = aligned_alloc(64,num*sizeof(T));
      if (!res) throw bad_alloc();
Martin Reinecke's avatar
Martin Reinecke committed
151 152 153 154
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
155
#else // portable emulation
Martin Reinecke's avatar
Martin Reinecke committed
156 157 158
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
159 160 161 162 163 164
      void *ptr = malloc(num*sizeof(T)+64);
      if (!ptr) throw bad_alloc();
      T *res = reinterpret_cast<T *>
        ((reinterpret_cast<size_t>(ptr) & ~(size_t(63))) + 64);
      (reinterpret_cast<void**>(res))[-1] = ptr;
      return res;
Martin Reinecke's avatar
Martin Reinecke committed
165 166
      }
    static void dealloc(T *ptr)
167
      { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
Martin Reinecke's avatar
Martin Reinecke committed
168
#endif
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200

  public:
    arr() : p(0), sz(0) {}
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      p = ralloc(n);
      sz = n;
      }

    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

template<typename T> struct cmplx {
  T r, i;
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}
  void Set(T r_, T i_) { r=r_; i=i_; }
  void Set(T r_) { r=r_; i=T(0); }
Martin Reinecke's avatar
Martin Reinecke committed
201 202
  void Split(T &r_, T &i_) const { r_=r; i_=i; }
  void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; }
203 204 205 206
  cmplx &operator+= (const cmplx &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator*= (T2 other)
    { r*=other; i*=other; return *this; }
207 208 209 210 211 212 213
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    T tmp = r*other.r - i*other.i;
    i = r*other.i + i*other.r;
    r = tmp;
    return *this;
    }
Martin Reinecke's avatar
Martin Reinecke committed
214 215 216 217
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    { r+=other.r; i+=other.i; return *this; }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    { r-=other.r; i-=other.i; return *this; }
218 219 220
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    { return {r*other, i*other}; }
Martin Reinecke's avatar
Martin Reinecke committed
221 222 223 224 225 226
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r+other.r, i+other.i}; }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r-other.r, i-other.i}; }
227 228 229
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
Martin Reinecke's avatar
Martin Reinecke committed
230
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
231 232
    -> cmplx<decltype(r+other.r)>
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
233
    using Tres = cmplx<decltype(r+other.r)>;
Martin Reinecke's avatar
Martin Reinecke committed
234 235
    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
236 237
    }
};
Martin Reinecke's avatar
Martin Reinecke committed
238 239
template<typename T> inline void PMINPLACE(T &a, T &b)
  { T t = a; a+=b; b=t-b; }
240 241
template<typename T> inline void MPINPLACE(T &a, T &b)
  { T t = a; a-=b; b=t+b; }
Martin Reinecke's avatar
Martin Reinecke committed
242
template<typename T> void PMC(T &a, T &b, const T &c, const T &d)
243 244 245 246 247 248
  { a = c+d; b = c-d; }
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  { return {a.r, -a.i}; }

template<typename T> void ROT90(cmplx<T> &a)
  { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
Martin Reinecke's avatar
Martin Reinecke committed
249 250
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
251 252 253 254 255 256 257 258

//
// twiddle factor section
//

template<typename T> class sincos_2pibyn
  {
  private:
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
259
    using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type;
260 261
    arr<T> data;

262
    void my_sincosm1pi (Thigh a_, Thigh *POCKETFFT_RESTRICT res)
263
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
264 265
      if (sizeof(Thigh)>sizeof(double)) // don't have the code for long double
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
266 267 268
        constexpr Thigh pi = Thigh(3.141592653589793238462643383279502884197L);
        auto s = sin(pi*a_);
        res[1] = s;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
269 270 271
        res[0] = (s*s)/(-sqrt((1-s)*(1+s))-1);
        return;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
272 273 274
      // adapted from https://stackoverflow.com/questions/42792939/
      // CAUTION: this function only works for arguments in the range
      //          [-0.25; 0.25]!
Martin Reinecke's avatar
sync  
Martin Reinecke committed
275
      double a = double(a_);
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
      double s = a * a;
      /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
      double r =     -1.0369917389758117e-4;
      r = fma (r, s,  1.9294935641298806e-3);
      r = fma (r, s, -2.5806887942825395e-2);
      r = fma (r, s,  2.3533063028328211e-1);
      r = fma (r, s, -1.3352627688538006e+0);
      r = fma (r, s,  4.0587121264167623e+0);
      r = fma (r, s, -4.9348022005446790e+0);
      double c = r*s;
      /* Approximate sin(pi*x) for x in [-0.25,0.25] */
      r =             4.6151442520157035e-4;
      r = fma (r, s, -7.3700183130883555e-3);
      r = fma (r, s,  8.2145868949323936e-2);
      r = fma (r, s, -5.9926452893214921e-1);
      r = fma (r, s,  2.5501640398732688e+0);
      r = fma (r, s, -5.1677127800499516e+0);
      s = s * a;
      r = r * s;
      s = fma (a, 3.1415926535897931e+0, r);
      res[0] = c;
      res[1] = s;
      }

300 301
    POCKETFFT_NOINLINE void calc_first_octant(size_t den,
      T * POCKETFFT_RESTRICT res)
302 303 304 305 306
      {
      size_t n = (den+4)>>3;
      if (n==0) return;
      res[0]=1.; res[1]=0.;
      if (n==1) return;
Martin Reinecke's avatar
Martin Reinecke committed
307
      size_t l1 = size_t(sqrt(n));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
308
      arr<Thigh> tmp(2*l1);
309 310
      for (size_t i=1; i<l1; ++i)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
311 312 313
        my_sincosm1pi(Thigh(2*i)/Thigh(den),&tmp[2*i]);
        res[2*i  ] = T(tmp[2*i]+1);
        res[2*i+1] = T(tmp[2*i+1]);
314 315 316 317
        }
      size_t start=l1;
      while(start<n)
        {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
318
        Thigh cs[2];
Martin Reinecke's avatar
sync  
Martin Reinecke committed
319 320 321
        my_sincosm1pi((Thigh(2*start))/Thigh(den),cs);
        res[2*start] = T(cs[0]+1);
        res[2*start+1] = T(cs[1]);
322 323 324 325
        size_t end = l1;
        if (start+end>n) end = n-start;
        for (size_t i=1; i<end; ++i)
          {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
326
          Thigh csx[2]={tmp[2*i], tmp[2*i+1]};
Martin Reinecke's avatar
sync  
Martin Reinecke committed
327 328
          res[2*(start+i)] = T(((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1);
          res[2*(start+i)+1] = T((cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1]);
329 330 331 332 333
          }
        start += l1;
        }
      }

334
    void calc_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
335
      {
336
      T * POCKETFFT_RESTRICT p = res+n;
337 338 339 340 341
      calc_first_octant(n<<1, p);
      size_t ndone=(n+2)>>2;
      size_t i=0, idx1=0, idx2=2*ndone-2;
      for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
        {
Martin Reinecke's avatar
Martin Reinecke committed
342 343
        res[idx1] = p[2*i  ]; res[idx1+1] = p[2*i+1];
        res[idx2] = p[2*i+3]; res[idx2+1] = p[2*i+2];
344 345
        }
      if (i!=ndone)
Martin Reinecke's avatar
Martin Reinecke committed
346
        { res[idx1] = p[2*i]; res[idx1+1] = p[2*i+1]; }
347 348
      }

349
    void calc_first_half(size_t n, T * POCKETFFT_RESTRICT res)
350
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
351
      int ndone=int(n+1)>>1;
352 353
      T * p = res+n-1;
      calc_first_octant(n<<2, p);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
354
      int i4=0, in=int(n), i=0;
355
      for (; i4<=in-i4; ++i, i4+=4) // octant 0
Martin Reinecke's avatar
Martin Reinecke committed
356
        { res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1]; }
357
      for (; i4-in <= 0; ++i, i4+=4) // octant 1
Martin Reinecke's avatar
sync  
Martin Reinecke committed
358
        { auto xm = in-i4; res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm]; }
359
      for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
Martin Reinecke's avatar
sync  
Martin Reinecke committed
360
        { auto xm = i4-in; res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm]; }
361
      for (; i<ndone; ++i, i4+=4) // octant 3
Martin Reinecke's avatar
sync  
Martin Reinecke committed
362
        { auto xm = 2*in-i4; res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1]; }
363 364
      }

365
    void fill_first_quadrant(size_t n, T * POCKETFFT_RESTRICT res)
366
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
367
      constexpr T hsqt2 = T(0.707106781186547524400844362104849L);
368 369 370 371
      size_t quart = n>>2;
      if ((n&7)==0)
        res[quart] = res[quart+1] = hsqt2;
      for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
372
        { res[j] = res[i+1]; res[j+1] = res[i]; }
373 374
      }

375
    POCKETFFT_NOINLINE void fill_first_half(size_t n, T * POCKETFFT_RESTRICT res)
376 377 378 379
      {
      size_t half = n>>1;
      if ((n&3)==0)
        for (size_t i=0; i<half; i+=2)
Martin Reinecke's avatar
Martin Reinecke committed
380
          { res[i+half] = -res[i+1]; res[i+half+1] = res[i]; }
381 382
      else
        for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
383
          { res[j] = -res[i]; res[j+1] = res[i+1]; }
384 385
      }

386
    void fill_second_half(size_t n, T * POCKETFFT_RESTRICT res)
387 388 389 390 391 392
      {
      if ((n&1)==0)
        for (size_t i=0; i<n; ++i)
          res[i+n] = -res[i];
      else
        for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
Martin Reinecke's avatar
Martin Reinecke committed
393
          { res[j] = res[i]; res[j+1] = -res[i+1]; }
394 395
      }

396
    POCKETFFT_NOINLINE void sincos_2pibyn_half(size_t n, T * POCKETFFT_RESTRICT res)
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
      {
      if ((n&3)==0)
        {
        calc_first_octant(n, res);
        fill_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else if ((n&1)==0)
        {
        calc_first_quadrant(n, res);
        fill_first_half(n, res);
        }
      else
        calc_first_half(n, res);
      }

  public:
414
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n, bool half)
415 416 417 418 419 420 421 422 423 424 425 426
      : data(2*n)
      {
      sincos_2pibyn_half(n, data.data());
      if (!half) fill_second_half(n, data.data());
      }

    T operator[](size_t idx) const { return data[idx]; }
    const T *rdata() const { return data; }
    const cmplx<T> *cdata() const
      { return reinterpret_cast<const cmplx<T> *>(data.data()); }
  };

Martin Reinecke's avatar
Martin Reinecke committed
427
struct util // hack to avoid duplicate symbols
428
  {
429
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
430
    {
Martin Reinecke's avatar
Martin Reinecke committed
431 432 433
    size_t res=1;
    while ((n&1)==0)
      { res=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
434 435 436
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
Martin Reinecke's avatar
Martin Reinecke committed
437 438 439 440
    if (n>1) res=n;
    return res;
    }

441
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
442
    {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
443
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
Martin Reinecke's avatar
Martin Reinecke committed
444 445 446 447
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
Martin Reinecke's avatar
Martin Reinecke committed
448 449 450 451 452 453
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
454 455
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
456 457
    }

Martin Reinecke's avatar
Martin Reinecke committed
458
  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
459
  static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
Martin Reinecke's avatar
Martin Reinecke committed
460 461 462 463
    {
    if (n<=12) return n;

    size_t bestfac=2*n;
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
    for (size_t f11=1; f11<bestfac; f11*=11)
      for (size_t f117=f11; f117<bestfac; f117*=7)
        for (size_t f1175=f117; f1175<bestfac; f1175*=5)
          {
          size_t x=f1175;
          while (x<n) x*=2;
          for (;;)
            {
            if (x<n)
              x*=3;
            else if (x>n)
              {
              if (x<bestfac) bestfac=x;
              if (x&1) break;
              x>>=1;
              }
            else
              return n;
            }
          }
    return bestfac;
    }

  /* returns the smallest composite of 2, 3, 5 which is >= n */
  static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
    {
    if (n<=6) return n;

    size_t bestfac=2*n;
    for (size_t f5=1; f5<bestfac; f5*=5)
      {
      size_t x = f5;
      while (x<n) x *= 2;
      for (;;)
        {
        if (x<n)
          x*=3;
        else if (x>n)
          {
          if (x<bestfac) bestfac=x;
          if (x&1) break;
          x>>=1;
          }
        else
          return n;
        }
      }
Martin Reinecke's avatar
Martin Reinecke committed
511 512
    return bestfac;
    }
513

Martin Reinecke's avatar
Martin Reinecke committed
514 515 516 517 518 519 520
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }
Martin Reinecke's avatar
Martin Reinecke committed
521

522
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
523 524 525 526 527 528
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
    if (ndim<1) throw runtime_error("ndim must be >= 1");
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
      throw runtime_error("stride dimension mismatch");
Martin Reinecke's avatar
sync  
Martin Reinecke committed
529 530
    if (inplace && (stride_in!=stride_out))
      throw runtime_error("stride mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
531 532
    }

533
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
534 535 536 537 538 539 540 541
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
542 543
      if (ax>=ndim) throw invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
544 545 546
      }
    }

547
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
Martin Reinecke's avatar
Martin Reinecke committed
548 549 550 551
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
552
    if (axis>=shape.size()) throw invalid_argument("bad axis number");
Martin Reinecke's avatar
Martin Reinecke committed
553
    }
Martin Reinecke's avatar
Martin Reinecke committed
554 555

#ifdef POCKETFFT_OPENMP
Martin Reinecke's avatar
sync  
Martin Reinecke committed
556 557
    static size_t nthreads() { return size_t(omp_get_num_threads()); }
    static size_t thread_num() { return size_t(omp_get_thread_num()); }
Martin Reinecke's avatar
Martin Reinecke committed
558 559 560 561
    static size_t thread_count (size_t nthreads, const shape_t &shape,
      size_t axis)
      {
      if (nthreads==1) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
562
      if (prod(shape) < 20*shape[axis]) return 1;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
563
      return (nthreads==0) ? size_t(omp_get_max_threads()) : nthreads;
Martin Reinecke's avatar
Martin Reinecke committed
564
      }
Martin Reinecke's avatar
Martin Reinecke committed
565
#else
Martin Reinecke's avatar
Martin Reinecke committed
566 567
    static constexpr size_t nthreads() { return 1; }
    static constexpr size_t thread_num() { return 0; }
Martin Reinecke's avatar
Martin Reinecke committed
568
#endif
Martin Reinecke's avatar
Martin Reinecke committed
569
  };
570 571 572 573 574 575 576 577 578 579 580 581 582 583

//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    struct fctdata
      {
      size_t fct;
      cmplx<T0> *tw, *tws;
      };

Martin Reinecke's avatar
Martin Reinecke committed
584
    size_t length;
585
    arr<cmplx<T0>> mem;
Martin Reinecke's avatar
Martin Reinecke committed
586
    vector<fctdata> fact;
587 588

    void add_factor(size_t factor)
Martin Reinecke's avatar
Martin Reinecke committed
589
      { fact.push_back({factor, nullptr, nullptr}); }
590

Martin Reinecke's avatar
Martin Reinecke committed
591
template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
592 593
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
594 595 596
  {
  constexpr size_t cdim=2;

597 598
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
599
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
600 601 602 603
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

604 605 606 607 608 609 610 611 612 613 614 615 616 617
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
Martin Reinecke's avatar
Martin Reinecke committed
618
        CH(i,k,1) = (CC(i,0,k)-CC(i,1,k)).template special_mul<fwd>(WA(0,i));
619 620 621 622
        }
      }
  }

623
#define POCKETFFT_PREP3(idx) \
624 625 626
        T t0 = CC(idx,0,k), t1, t2; \
        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
        CH(idx,k,0)=t0+t1;
627
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
628 629 630 631 632 633
        { \
        T ca,cb; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
        }
634
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
635 636 637 638 639
        { \
        T ca,cb,da,db; \
        ca=t0+t1*twr; \
        cb=t2*twi; ROT90(cb); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
640 641
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
642
        }
Martin Reinecke's avatar
Martin Reinecke committed
643
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
644 645
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
646 647
  {
  constexpr size_t cdim=3;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
648
  constexpr T0 tw1r=-0.5,
Martin Reinecke's avatar
Martin Reinecke committed
649
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
650

651 652
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
653
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
654 655 656 657
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

658 659 660
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
661 662
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
663 664 665 666 667
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
668 669
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
670 671 672
      }
      for (size_t i=1; i<ido; ++i)
        {
673 674
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
675 676 677 678
        }
      }
  }

679 680 681
#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3
682

Martin Reinecke's avatar
Martin Reinecke committed
683
template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
684 685
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
686 687 688
  {
  constexpr size_t cdim=4;

689 690
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
691
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
692 693 694 695
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

696 697 698 699 700 701
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
702
      ROTX90<fwd>(t4);
703 704 705 706 707 708 709 710 711 712
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
      PMC(t2,t1,CC(0,0,k),CC(0,2,k));
      PMC(t3,t4,CC(0,1,k),CC(0,3,k));
Martin Reinecke's avatar
Martin Reinecke committed
713
      ROTX90<fwd>(t4);
714 715 716 717 718 719 720 721 722
      PMC(CH(0,k,0),CH(0,k,2),t2,t3);
      PMC(CH(0,k,1),CH(0,k,3),t1,t4);
      }
      for (size_t i=1; i<ido; ++i)
        {
        T c2, c3, c4, t1, t2, t3, t4;
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
        PMC(t2,t1,cc0,cc2);
        PMC(t3,t4,cc1,cc3);
Martin Reinecke's avatar
Martin Reinecke committed
723
        ROTX90<fwd>(t4);
724 725
        PMC(CH(i,k,0),c3,t2,t3);
        PMC(c2,c4,t1,t4);
Martin Reinecke's avatar
Martin Reinecke committed
726 727 728
        CH(i,k,1) = c2.template special_mul<fwd>(WA(0,i));
        CH(i,k,2) = c3.template special_mul<fwd>(WA(1,i));
        CH(i,k,3) = c4.template special_mul<fwd>(WA(2,i));
729 730 731 732
        }
      }
  }

733
#define POCKETFFT_PREP5(idx) \
734 735 736 737 738 739
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

740
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
741 742 743 744 745 746 747 748 749
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb); \
        }

750
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
751 752 753 754 755 756 757
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PMC(da,db,ca,cb); \
Martin Reinecke's avatar
Martin Reinecke committed
758 759
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
760
        }
Martin Reinecke's avatar
Martin Reinecke committed
761
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
762 763
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
764 765
  {
  constexpr size_t cdim=5;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
766
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
767
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
768
               tw2r= T0(-0.8090169943749474241022934171828191L),
Martin Reinecke's avatar
Martin Reinecke committed
769
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
770

771 772
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
773
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
774 775 776 777
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

778 779 780
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
781 782 783
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
784 785 786 787 788
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
789 790 791
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
792 793 794
      }
      for (size_t i=1; i<ido; ++i)
        {
795 796 797
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
798 799 800 801
        }
      }
  }

802 803 804
#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5
805

806
#define POCKETFFT_PREP7(idx) \
807 808 809 810 811 812 813
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

814
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
815 816 817 818 819 820 821 822
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
        PMC(out1,out2,ca,cb); \
        }
823 824 825
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
826 827
        { \
        T da,db; \
828
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
829 830
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
831 832
        }

Martin Reinecke's avatar
Martin Reinecke committed
833
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
834 835
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
836 837
  {
  constexpr size_t cdim=7;
Martin Reinecke's avatar
sync  
Martin Reinecke committed
838
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
Martin Reinecke's avatar
Martin Reinecke committed
839
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
840
               tw2r= T0(-0.2225209339563144042889025644967948L),
Martin Reinecke's avatar
Martin Reinecke committed
841
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
Martin Reinecke's avatar
sync  
Martin Reinecke committed
842
               tw3r= T0(-0.9009688679024191262361023195074451L),
Martin Reinecke's avatar
Martin Reinecke committed
843
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
844

845 846
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
847
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
848 849 850 851
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

852 853 854
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
855 856 857 858
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
859 860 861 862 863
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
864 865 866 867
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
868 869 870
      }
      for (size_t i=1; i<ido; ++i)
        {
871 872 873 874
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
875 876 877 878
        }
      }
  }

879 880 881 882
#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7
883

Martin Reinecke's avatar
Martin Reinecke committed
884
template <bool fwd, typename T> void ROTX45(T &a)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
885 886
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
887
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
888
    { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
Martin Reinecke's avatar
Martin Reinecke committed
889 890
  else
    { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
891
  }
Martin Reinecke's avatar
Martin Reinecke committed
892
template <bool fwd, typename T> void ROTX135(T &a)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
893 894
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
Martin Reinecke's avatar
Martin Reinecke committed
895
  if (fwd)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
896
    { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
Martin Reinecke's avatar
Martin Reinecke committed
897 898
  else
    { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
899 900
  }

Martin Reinecke's avatar
Martin Reinecke committed
901
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
902 903
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa)
Martin Reinecke's avatar
sync  
Martin Reinecke committed
904 905 906
  {
  constexpr size_t cdim=8;

907 908
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
909
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
910 911 912 913
    { return cc[a+ido*(b+cdim*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

Martin Reinecke's avatar
sync  
Martin Reinecke committed
914 915 916 917 918 919 920
  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
      PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
921
      ROTX90<fwd>(a3);
Martin Reinecke's avatar
Martin Reinecke committed
922 923

      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
924
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
925 926
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
927

Martin Reinecke's avatar
Martin Reinecke committed
928 929
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
Martin Reinecke's avatar
Martin Reinecke committed
930 931
      PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
Martin Reinecke's avatar
Martin Reinecke committed
932 933
      ROTX90<fwd>(a6);
      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
Martin Reinecke's avatar
Martin Reinecke committed
934
      PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
935 936 937
      }
  else
    for (size_t k=0; k<l1; ++k)
Martin Reinecke's avatar
Martin Reinecke committed
938
      {
Martin Reinecke's avatar
sync  
Martin Reinecke committed
939 940 941 942 943
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PMC(a1,a5,CC(0,1,k),CC(0,5,k));
      PMC(a3,a7,CC(0,3,k),CC(0,7,k));
      PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
944
      ROTX90<fwd>(a3);
Martin Reinecke's avatar
Martin Reinecke committed
945 946

      ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
947
      PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
948 949
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
950

Martin Reinecke's avatar
Martin Reinecke committed
951 952
      PMC(a0,a4,CC(0,0,k),CC(0,4,k));
      PMC(a2,a6,CC(0,2,k),CC(0,6,k));
Martin Reinecke's avatar
Martin Reinecke committed
953 954
      PMC(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PMC(CH(0,k,2),CH(0,k,6),a0-a2,a3);
Martin Reinecke's avatar
Martin Reinecke committed
955 956
      ROTX90<fwd>(a6);
      PMC(CH(0,k,1),CH(0,k,5),a4+a6,a5);
Martin Reinecke's avatar
Martin Reinecke committed
957 958
      PMC(CH(0,k,3),CH(0,k,7),a4-a6,a7);
      }
Martin Reinecke's avatar
sync  
Martin Reinecke committed
959 960 961 962 963
      for (size_t i=1; i<ido; ++i)
        {
        T a0, a1, a2, a3, a4, a5, a6, a7;
        PMC(a1,a5,CC(i,1,k),CC(i,5,k));
        PMC(a3,a7,CC(i,3,k),CC(i,7,k));
Martin Reinecke's avatar
Martin Reinecke committed
964
        ROTX90<fwd>(a7);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
965
        PMINPLACE(a1,a3);
Martin Reinecke's avatar
Martin Reinecke committed
966
        ROTX90<fwd>(a3);
Martin Reinecke's avatar
sync  
Martin Reinecke committed
967
        PMINPLACE(a5,a7);
Martin Reinecke's avatar
Martin Reinecke committed
968 969
        ROTX45<fwd>(a5);
        ROTX135<fwd>(a7);
Martin Reinecke's avatar
Martin Reinecke committed
970 971 972
        PMC(a0,a4,CC(i,0,k),CC(i,4,k));
        PMC(a2,a6,CC(i,2,k),CC(i,6,k));
        PMINPLACE(a0,a2);
Martin Reinecke's avatar
Martin Reinecke committed
973 974 975 976
        CH(i,k,0) = a0+a1;
        CH(i,k,4) = (a0-a1).template special_mul<fwd>(WA(3,i));
        CH(i,k,2) = (a2+a3).template special_mul<fwd>(WA(1,i));
        CH(i,k,6) = (a2-a3).template special_mul<fwd>(WA(5,i));
Martin Reinecke's avatar
Martin Reinecke committed
977 978 979 980
        ROTX90<fwd>(a6);
        PMINPLACE(a4,a6);
        CH(i,k,1) = (a4+a5).template special_mul<fwd>(WA(0,i));
        CH(i,k,5) = (a4-a5).template special_mul<fwd>(WA(4,i));
Martin Reinecke's avatar
Martin Reinecke committed
981 982
        CH(i,k,3) = (a6+a7).template special_mul<fwd>(WA(2,i));
        CH(i,k,7) = (a6-a7).template special_mul<fwd>(WA(6,i));
Martin Reinecke's avatar
sync  
Martin Reinecke committed
983 984 985 986 987
        }
      }
   }


988
#define POCKETFFT_PREP11(idx) \
989 990 991 992 993 994 995 996 997
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;

998
#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
999 1000 1001 1002 1003 1004 1005
        { \
        T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
          cb; \
        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
        PMC(out1,out2,ca,cb); \
        }
1006 1007 1008
#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1009 1010
        { \
        T da,db; \
1011
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
Martin Reinecke's avatar
Martin Reinecke committed
1012 1013
        CH(i,k,u1) = da.template special_mul<fwd>(WA(u1-1,i)); \
        CH(i,k,u2) = db.template special_mul<fwd>(WA(u2-1,i)); \
Martin Reinecke's avatar