fft.h 46.6 KB
Newer Older
Martin Reinecke's avatar
Martin Reinecke committed
1
2
3
/*
This file is part of pocketfft.

Martin Reinecke's avatar
Martin Reinecke committed
4
Copyright (C) 2010-2021 Max-Planck-Society
Martin Reinecke's avatar
Martin Reinecke committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
Copyright (C) 2019 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

Martin Reinecke's avatar
Martin Reinecke committed
39
40
#ifndef DUCC0_FFT_H
#define DUCC0_FFT_H
41

Martin Reinecke's avatar
Martin Reinecke committed
42
#include <cmath>
Martin Reinecke's avatar
Martin Reinecke committed
43
44
#include <cstddef>
#include <numeric>
Martin Reinecke's avatar
Martin Reinecke committed
45
46
47
48
#include <stdexcept>
#include <memory>
#include <vector>
#include <complex>
49
#include <algorithm>
Martin Reinecke's avatar
Martin Reinecke committed
50
51
#include "ducc0/infra/useful_macros.h"
#include "ducc0/infra/error_handling.h"
Martin Reinecke's avatar
Martin Reinecke committed
52
#include "ducc0/infra/threading.h"
Martin Reinecke's avatar
Martin Reinecke committed
53
#include "ducc0/infra/misc_utils.h"
Martin Reinecke's avatar
Martin Reinecke committed
54
55
#include "ducc0/infra/simd.h"
#include "ducc0/infra/mav.h"
Martin Reinecke's avatar
Martin Reinecke committed
56
57
58
#include "ducc0/infra/aligned_array.h"
#include "ducc0/math/cmplx.h"
#include "ducc0/math/unity_roots.h"
Martin Reinecke's avatar
Martin Reinecke committed
59
#include "ducc0/fft/fft1d.h"
Martin Reinecke's avatar
Martin Reinecke committed
60

Martin Reinecke's avatar
Martin Reinecke committed
61
62
/** \file fft.h
 *  Implementation of multi-dimensional Fast Fourier and related transforms
Martin Reinecke's avatar
Martin Reinecke committed
63
64
65
66
67
68
69
70
 *  \copyright Copyright (C) 2010-2021 Max-Planck-Society
 *  \copyright Copyright (C) 2019 Peter Bell
 *  \copyright  
 *  \copyright For the odd-sized DCT-IV transforms:
 *  \copyright   Copyright (C) 2003, 2007-14 Matteo Frigo
 *  \copyright   Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * \authors Martin Reinecke, Peter Bell
Martin Reinecke's avatar
Martin Reinecke committed
71
72
 */

Martin Reinecke's avatar
Martin Reinecke committed
73
namespace ducc0 {
Martin Reinecke's avatar
Martin Reinecke committed
74
75
76

namespace detail_fft {

Martin Reinecke's avatar
Martin Reinecke committed
77
using shape_t=fmav_info::shape_t;
Martin Reinecke's avatar
Martin Reinecke committed
78
using stride_t=fmav_info::stride_t;
Martin Reinecke's avatar
Martin Reinecke committed
79

Martin Reinecke's avatar
Martin Reinecke committed
80
81
82
83
84
constexpr bool FORWARD  = true,
               BACKWARD = false;

struct util // hack to avoid duplicate symbols
  {
Martin Reinecke's avatar
Martin Reinecke committed
85
  static void sanity_check_axes(size_t ndim, const shape_t &axes)
Martin Reinecke's avatar
Martin Reinecke committed
86
87
    {
    shape_t tmp(ndim,0);
Martin Reinecke's avatar
Martin Reinecke committed
88
    if (axes.empty()) throw std::invalid_argument("no axes specified");
Martin Reinecke's avatar
Martin Reinecke committed
89
90
    for (auto ax : axes)
      {
Martin Reinecke's avatar
Martin Reinecke committed
91
92
      if (ax>=ndim) throw std::invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
Martin Reinecke's avatar
Martin Reinecke committed
93
94
95
      }
    }

96
  DUCC0_NOINLINE static void sanity_check_onetype(const fmav_info &a1,
Martin Reinecke's avatar
Martin Reinecke committed
97
    const fmav_info &a2, bool inplace, const shape_t &axes)
Martin Reinecke's avatar
Martin Reinecke committed
98
    {
Martin Reinecke's avatar
Martin Reinecke committed
99
100
101
102
    sanity_check_axes(a1.ndim(), axes);
    MR_assert(a1.conformable(a2), "array sizes are not conformable");
    if (inplace) MR_assert(a1.stride()==a2.stride(), "stride mismatch");
    }
103
  DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac,
Martin Reinecke's avatar
Martin Reinecke committed
104
105
106
107
108
109
110
111
    const fmav_info &ar, const shape_t &axes)
    {
    sanity_check_axes(ac.ndim(), axes);
    MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch");
    for (size_t i=0; i<ac.ndim(); ++i)
      MR_assert(ac.shape(i)== (i==axes.back()) ? (ar.shape(i)/2+1) : ar.shape(i),
        "axis length mismatch");
    }
112
  DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac,
Martin Reinecke's avatar
Martin Reinecke committed
113
114
115
116
117
118
119
    const fmav_info &ar, const size_t axis)
    {
    if (axis>=ac.ndim()) throw std::invalid_argument("bad axis number");
    MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch");
    for (size_t i=0; i<ac.ndim(); ++i)
      MR_assert(ac.shape(i)== (i==axis) ? (ar.shape(i)/2+1) : ar.shape(i),
        "axis length mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
120
121
    }

Martin Reinecke's avatar
Martin Reinecke committed
122
#ifdef DUCC0_NO_THREADING
Martin Reinecke's avatar
Martin Reinecke committed
123
  static size_t thread_count (size_t /*nthreads*/, const fmav_info &/*info*/,
Martin Reinecke's avatar
Martin Reinecke committed
124
125
126
    size_t /*axis*/, size_t /*vlen*/)
    { return 1; }
#else
Martin Reinecke's avatar
Martin Reinecke committed
127
  static size_t thread_count (size_t nthreads, const fmav_info &info,
Martin Reinecke's avatar
Martin Reinecke committed
128
129
130
    size_t axis, size_t vlen)
    {
    if (nthreads==1) return 1;
Martin Reinecke's avatar
Martin Reinecke committed
131
132
133
    size_t size = info.size();
    size_t parallel = size / (info.shape(axis) * vlen);
    if (info.shape(axis) < 1000)
Martin Reinecke's avatar
Martin Reinecke committed
134
      parallel /= 4;
135
    size_t max_threads = (nthreads==0) ? ducc0::get_default_nthreads() : nthreads;
136
    return std::max(size_t(1), std::min(parallel, max_threads));
Martin Reinecke's avatar
Martin Reinecke committed
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
    }
#endif
  };


//
// sine/cosine transforms
//

template<typename T0> class T_dct1
  {
  private:
    pocketfft_r<T0> fftplan;

  public:
Martin Reinecke's avatar
Martin Reinecke committed
152
    DUCC0_NOINLINE T_dct1(size_t length, bool /*vectorize*/=false)
Martin Reinecke's avatar
Martin Reinecke committed
153
154
      : fftplan(2*(length-1)) {}

Martin Reinecke's avatar
Martin Reinecke committed
155
    template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho,
Martin Reinecke's avatar
more    
Martin Reinecke committed
156
      int /*type*/, bool /*cosine*/, size_t nthreads=1) const
Martin Reinecke's avatar
Martin Reinecke committed
157
158
159
160
161
      {
      constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
      size_t N=fftplan.length(), n=N/2+1;
      if (ortho)
        { c[0]*=sqrt2; c[n-1]*=sqrt2; }
Martin Reinecke's avatar
Martin Reinecke committed
162
      auto tmp=&buf[0];
Martin Reinecke's avatar
Martin Reinecke committed
163
164
165
      tmp[0] = c[0];
      for (size_t i=1; i<n; ++i)
        tmp[i] = tmp[N-i] = c[i];
Martin Reinecke's avatar
more    
Martin Reinecke committed
166
      auto res = fftplan.exec(tmp, &buf[N], fct, true, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
167
      c[0] = res[0];
Martin Reinecke's avatar
Martin Reinecke committed
168
      for (size_t i=1; i<n; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
169
        c[i] = res[2*i-1];
Martin Reinecke's avatar
Martin Reinecke committed
170
171
      if (ortho)
        { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
Martin Reinecke's avatar
Martin Reinecke committed
172
173
174
      return c;
      }
    template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
Martin Reinecke's avatar
more    
Martin Reinecke committed
175
      int /*type*/, bool /*cosine*/, size_t nthreads=1) const
Martin Reinecke's avatar
Martin Reinecke committed
176
177
      {
      aligned_array<T> buf(bufsize());
Martin Reinecke's avatar
more    
Martin Reinecke committed
178
      exec(c, buf.data(), fct, ortho, 1, true, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
179
180
181
      }

    size_t length() const { return fftplan.length()/2+1; }
Martin Reinecke's avatar
Martin Reinecke committed
182
    size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); }
Martin Reinecke's avatar
Martin Reinecke committed
183
184
185
186
187
188
189
190
  };

template<typename T0> class T_dst1
  {
  private:
    pocketfft_r<T0> fftplan;

  public:
Martin Reinecke's avatar
Martin Reinecke committed
191
    DUCC0_NOINLINE T_dst1(size_t length, bool /*vectorize*/=false)
Martin Reinecke's avatar
Martin Reinecke committed
192
193
      : fftplan(2*(length+1)) {}

Martin Reinecke's avatar
Martin Reinecke committed
194
    template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct,
Martin Reinecke's avatar
more    
Martin Reinecke committed
195
      bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const
Martin Reinecke's avatar
Martin Reinecke committed
196
197
      {
      size_t N=fftplan.length(), n=N/2-1;
Martin Reinecke's avatar
Martin Reinecke committed
198
      auto tmp = &buf[0];
Martin Reinecke's avatar
Martin Reinecke committed
199
200
201
      tmp[0] = tmp[n+1] = c[0]*0;
      for (size_t i=0; i<n; ++i)
        { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
Martin Reinecke's avatar
more    
Martin Reinecke committed
202
      auto res = fftplan.exec(tmp, buf+N, fct, true, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
203
      for (size_t i=0; i<n; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
204
205
206
207
        c[i] = -res[2*i+2];
      return c;
      }
    template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct,
Martin Reinecke's avatar
more    
Martin Reinecke committed
208
      bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads) const
Martin Reinecke's avatar
Martin Reinecke committed
209
210
      {
      aligned_array<T> buf(bufsize());
Martin Reinecke's avatar
more    
Martin Reinecke committed
211
      exec(c, buf.data(), fct, true, 1, false, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
212
213
214
      }

    size_t length() const { return fftplan.length()/2-1; }
Martin Reinecke's avatar
Martin Reinecke committed
215
    size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); }
Martin Reinecke's avatar
Martin Reinecke committed
216
217
218
219
220
221
  };

template<typename T0> class T_dcst23
  {
  private:
    pocketfft_r<T0> fftplan;
Martin Reinecke's avatar
Martin Reinecke committed
222
    std::vector<T0> twiddle;
Martin Reinecke's avatar
Martin Reinecke committed
223
224

  public:
Martin Reinecke's avatar
Martin Reinecke committed
225
    DUCC0_NOINLINE T_dcst23(size_t length, bool /*vectorize*/=false)
Martin Reinecke's avatar
Martin Reinecke committed
226
227
228
229
230
231
232
      : fftplan(length), twiddle(length)
      {
      UnityRoots<T0,Cmplx<T0>> tw(4*length);
      for (size_t i=0; i<length; ++i)
        twiddle[i] = tw[i+1].r;
      }

Martin Reinecke's avatar
Martin Reinecke committed
233
    template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho,
Martin Reinecke's avatar
more    
Martin Reinecke committed
234
      int type, bool cosine, size_t nthreads=1) const
Martin Reinecke's avatar
Martin Reinecke committed
235
236
237
238
239
240
241
242
243
244
245
246
247
      {
      constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
      size_t N=length();
      size_t NS2 = (N+1)/2;
      if (type==2)
        {
        if (!cosine)
          for (size_t k=1; k<N; k+=2)
            c[k] = -c[k];
        c[0] *= 2;
        if ((N&1)==0) c[N-1]*=2;
        for (size_t k=1; k<N-1; k+=2)
          MPINPLACE(c[k+1], c[k]);
Martin Reinecke's avatar
more    
Martin Reinecke committed
248
        auto res = fftplan.exec(c, buf, fct, false, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
249
        c[0] = res[0];
Martin Reinecke's avatar
Martin Reinecke committed
250
251
        for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
          {
Martin Reinecke's avatar
Martin Reinecke committed
252
253
          T t1 = twiddle[k-1]*res[kc]+twiddle[kc-1]*res[k];
          T t2 = twiddle[k-1]*res[k]-twiddle[kc-1]*res[kc];
Martin Reinecke's avatar
Martin Reinecke committed
254
255
256
          c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
          }
        if ((N&1)==0)
Martin Reinecke's avatar
Martin Reinecke committed
257
          c[NS2] = res[NS2]*twiddle[NS2-1];
Martin Reinecke's avatar
Martin Reinecke committed
258
259
        if (!cosine)
          for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
Martin Reinecke's avatar
Martin Reinecke committed
260
            std::swap(c[k], c[kc]);
Martin Reinecke's avatar
Martin Reinecke committed
261
262
263
264
265
266
267
        if (ortho) c[0]*=sqrt2*T0(0.5);
        }
      else
        {
        if (ortho) c[0]*=sqrt2;
        if (!cosine)
          for (size_t k=0, kc=N-1; k<NS2; ++k, --kc)
Martin Reinecke's avatar
Martin Reinecke committed
268
            std::swap(c[k], c[kc]);
Martin Reinecke's avatar
Martin Reinecke committed
269
270
271
272
273
274
275
276
        for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
          {
          T t1=c[k]+c[kc], t2=c[k]-c[kc];
          c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1;
          c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2;
          }
        if ((N&1)==0)
          c[NS2] *= 2*twiddle[NS2-1];
Martin Reinecke's avatar
more    
Martin Reinecke committed
277
        auto res = fftplan.exec(c, buf, fct, true, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
278
        if (res != c) // FIXME: not yet optimal
Martin Reinecke's avatar
Martin Reinecke committed
279
          copy_n(res, N, c);
Martin Reinecke's avatar
Martin Reinecke committed
280
281
282
283
284
285
        for (size_t k=1; k<N-1; k+=2)
          MPINPLACE(c[k], c[k+1]);
        if (!cosine)
          for (size_t k=1; k<N; k+=2)
            c[k] = -c[k];
        }
Martin Reinecke's avatar
Martin Reinecke committed
286
287
288
      return c;
      }
    template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
Martin Reinecke's avatar
more    
Martin Reinecke committed
289
      int type, bool cosine, size_t nthreads=1) const
Martin Reinecke's avatar
Martin Reinecke committed
290
      {
Martin Reinecke's avatar
fix    
Martin Reinecke committed
291
      aligned_array<T> buf(bufsize());
Martin Reinecke's avatar
more    
Martin Reinecke committed
292
      exec(c, &buf[0], fct, ortho, type, cosine, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
293
294
295
      }

    size_t length() const { return fftplan.length(); }
Martin Reinecke's avatar
Martin Reinecke committed
296
    size_t bufsize() const { return fftplan.bufsize(); }
Martin Reinecke's avatar
Martin Reinecke committed
297
298
299
300
301
302
  };

template<typename T0> class T_dcst4
  {
  private:
    size_t N;
Martin Reinecke's avatar
Martin Reinecke committed
303
304
    std::unique_ptr<pocketfft_c<T0>> fft;
    std::unique_ptr<pocketfft_r<T0>> rfft;
305
    aligned_array<Cmplx<T0>> C2;
Martin Reinecke's avatar
Martin Reinecke committed
306
307

  public:
Martin Reinecke's avatar
Martin Reinecke committed
308
    DUCC0_NOINLINE T_dcst4(size_t length, bool /*vectorize*/=false)
Martin Reinecke's avatar
Martin Reinecke committed
309
      : N(length),
Martin Reinecke's avatar
tweak    
Martin Reinecke committed
310
311
        fft((N&1) ? nullptr : make_unique<pocketfft_c<T0>>(N/2)),
        rfft((N&1)? make_unique<pocketfft_r<T0>>(N) : nullptr),
Martin Reinecke's avatar
Martin Reinecke committed
312
313
314
315
316
317
        C2((N&1) ? 0 : N/2)
      {
      if ((N&1)==0)
        {
        UnityRoots<T0,Cmplx<T0>> tw(16*N);
        for (size_t i=0; i<N/2; ++i)
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
318
          C2[i] = tw[8*i+1].conj();
Martin Reinecke's avatar
Martin Reinecke committed
319
320
321
        }
      }

Martin Reinecke's avatar
Martin Reinecke committed
322
    template<typename T> DUCC0_NOINLINE T *exec(T c[], T /*buf*/[], T0 fct,
Martin Reinecke's avatar
more    
Martin Reinecke committed
323
      bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads) const
Martin Reinecke's avatar
Martin Reinecke committed
324
325
326
327
      {
      size_t n2 = N/2;
      if (!cosine)
        for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
Martin Reinecke's avatar
Martin Reinecke committed
328
          std::swap(c[k], c[kc]);
Martin Reinecke's avatar
Martin Reinecke committed
329
330
331
332
333
334
      if (N&1)
        {
        // The following code is derived from the FFTW3 function apply_re11()
        // and is released under the 3-clause BSD license with friendly
        // permission of Matteo Frigo and Steven G. Johnson.

335
        aligned_array<T> y(N);
Martin Reinecke's avatar
Martin Reinecke committed
336
337
338
339
340
341
342
343
344
345
346
347
348
        {
        size_t i=0, m=n2;
        for (; m<N; ++i, m+=4)
          y[i] = c[m];
        for (; m<2*N; ++i, m+=4)
          y[i] = -c[2*N-m-1];
        for (; m<3*N; ++i, m+=4)
          y[i] = -c[m-2*N];
        for (; m<4*N; ++i, m+=4)
          y[i] = c[4*N-m-1];
        for (; i<N; ++i, m+=4)
          y[i] = c[m-4*N];
        }
Martin Reinecke's avatar
more    
Martin Reinecke committed
349
        rfft->exec(y.data(), fct, true, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
        {
        auto SGN = [](size_t i)
           {
           constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
           return (i&2) ? -sqrt2 : sqrt2;
           };
        c[n2] = y[0]*SGN(n2+1);
        size_t i=0, i1=1, k=1;
        for (; k<n2; ++i, ++i1, k+=2)
          {
          c[i    ] = y[2*k-1]*SGN(i1)     + y[2*k  ]*SGN(i);
          c[N -i1] = y[2*k-1]*SGN(N -i)   - y[2*k  ]*SGN(N -i1);
          c[n2-i1] = y[2*k+1]*SGN(n2-i)   - y[2*k+2]*SGN(n2-i1);
          c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
          }
        if (k == n2)
          {
          c[i   ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
          c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
          }
        }

        // FFTW-derived code ends here
        }
      else
        {
        // even length algorithm from
        // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
378
        aligned_array<Cmplx<T>> y(n2);
Martin Reinecke's avatar
Martin Reinecke committed
379
380
381
382
383
        for(size_t i=0; i<n2; ++i)
          {
          y[i].Set(c[2*i],c[N-1-2*i]);
          y[i] *= C2[i];
          }
Martin Reinecke's avatar
more    
Martin Reinecke committed
384
        fft->exec(y.data(), fct, true, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
385
386
        for(size_t i=0, ic=n2-1; i<n2; ++i, --ic)
          {
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
387
388
          c[2*i  ] = T0( 2)*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i);
          c[2*i+1] = T0(-2)*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i);
Martin Reinecke's avatar
Martin Reinecke committed
389
390
391
392
393
          }
        }
      if (!cosine)
        for (size_t k=1; k<N; k+=2)
          c[k] = -c[k];
Martin Reinecke's avatar
Martin Reinecke committed
394
395
396
      return c;
      }

Martin Reinecke's avatar
Martin Reinecke committed
397
    template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct,
Martin Reinecke's avatar
more    
Martin Reinecke committed
398
      bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const
Martin Reinecke's avatar
Martin Reinecke committed
399
      {
Martin Reinecke's avatar
more    
Martin Reinecke committed
400
      exec(c, nullptr, fct, true, 4, cosine, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
401
402
403
      }

    size_t length() const { return N; }
Martin Reinecke's avatar
Martin Reinecke committed
404
    size_t bufsize() const { return 0; }
Martin Reinecke's avatar
Martin Reinecke committed
405
406
407
408
409
410
411
412
413
414
  };


//
// multi-D infrastructure
//

template<size_t N> class multi_iter
  {
  private:
Martin Reinecke's avatar
Martin Reinecke committed
415
416
417
418
419
    shape_t shp, pos;
    stride_t str_i, str_o;
    size_t cshp_i, cshp_o, rem;
    ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N];
    bool uni_i, uni_o;
Martin Reinecke's avatar
Martin Reinecke committed
420
421
422

    void advance_i()
      {
Martin Reinecke's avatar
Martin Reinecke committed
423
      for (size_t i=0; i<pos.size(); ++i)
Martin Reinecke's avatar
Martin Reinecke committed
424
        {
Martin Reinecke's avatar
Martin Reinecke committed
425
426
427
        p_ii += str_i[i];
        p_oi += str_o[i];
        if (++pos[i] < shp[i])
Martin Reinecke's avatar
Martin Reinecke committed
428
429
          return;
        pos[i] = 0;
Martin Reinecke's avatar
Martin Reinecke committed
430
431
        p_ii -= ptrdiff_t(shp[i])*str_i[i];
        p_oi -= ptrdiff_t(shp[i])*str_o[i];
Martin Reinecke's avatar
Martin Reinecke committed
432
433
434
435
        }
      }

  public:
Martin Reinecke's avatar
Martin Reinecke committed
436
437
438
    multi_iter(const fmav_info &iarr, const fmav_info &oarr, size_t idim,
      size_t nshares, size_t myshare)
      : rem(iarr.size()/iarr.shape(idim)), sstr_i(0), sstr_o(0), p_ii(0), p_oi(0)
Martin Reinecke's avatar
Martin Reinecke committed
439
      {
Martin Reinecke's avatar
Martin Reinecke committed
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
      MR_assert(oarr.ndim()==iarr.ndim(), "dimension mismatch");
      MR_assert(iarr.ndim()>=1, "not enough dimensions");
      // Sort the extraneous dimensions in order of ascending output stride;
      // this should improve overall cache re-use and avoid clashes between
      // threads as much as possible.
      shape_t idx(iarr.ndim());
      std::iota(idx.begin(), idx.end(), 0);
      sort(idx.begin(), idx.end(),
        [&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);});
      for (auto i: idx)
        if (i!=idim)
          {
          pos.push_back(0);
          MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch");
          shp.push_back(iarr.shape(i));
          str_i.push_back(iarr.stride(i));
          str_o.push_back(oarr.stride(i));
          }
      MR_assert(idim<iarr.ndim(), "bad active dimension");
      cstr_i = iarr.stride(idim);
      cstr_o = oarr.stride(idim);
      cshp_i = iarr.shape(idim);
      cshp_o = oarr.shape(idim);

// collapse unneeded dimensions
      bool done = false;
      while(!done)
        {
        done=true;
        for (size_t i=1; i<shp.size(); ++i)
          if ((str_i[i] == str_i[i-1]*ptrdiff_t(shp[i-1]))
           && (str_o[i] == str_o[i-1]*ptrdiff_t(shp[i-1])))
            {
            shp[i-1] *= shp[i];
            str_i.erase(str_i.begin()+ptrdiff_t(i));
            str_o.erase(str_o.begin()+ptrdiff_t(i));
            shp.erase(shp.begin()+ptrdiff_t(i));
            pos.pop_back();
            done=false;
            }
        }
      if (pos.size()>0)
        {
        sstr_i = str_i[0];
        sstr_o = str_o[0];
        }

Martin Reinecke's avatar
Martin Reinecke committed
487
      if (nshares==1) return;
Martin Reinecke's avatar
Martin Reinecke committed
488
489
      if (nshares==0) throw std::runtime_error("can't run with zero threads");
      if (myshare>=nshares) throw std::runtime_error("impossible share requested");
Martin Reinecke's avatar
Martin Reinecke committed
490
      auto [lo, hi] = calcShare(nshares, myshare, rem);
Martin Reinecke's avatar
Martin Reinecke committed
491
492
493
      size_t todo = hi-lo;

      size_t chunk = rem;
Martin Reinecke's avatar
Martin Reinecke committed
494
      for (size_t i2=0, i=pos.size()-1; i2<pos.size(); ++i2,--i)
Martin Reinecke's avatar
Martin Reinecke committed
495
        {
Martin Reinecke's avatar
Martin Reinecke committed
496
        chunk /= shp[i];
Martin Reinecke's avatar
Martin Reinecke committed
497
498
        size_t n_advance = lo/chunk;
        pos[i] += n_advance;
Martin Reinecke's avatar
Martin Reinecke committed
499
500
        p_ii += ptrdiff_t(n_advance)*str_i[i];
        p_oi += ptrdiff_t(n_advance)*str_o[i];
Martin Reinecke's avatar
Martin Reinecke committed
501
502
        lo -= n_advance*chunk;
        }
Martin Reinecke's avatar
Martin Reinecke committed
503
      MR_assert(lo==0, "must not happen");
Martin Reinecke's avatar
Martin Reinecke committed
504
505
506
507
      rem = todo;
      }
    void advance(size_t n)
      {
Martin Reinecke's avatar
Martin Reinecke committed
508
      if (rem<n) throw std::runtime_error("underrun");
Martin Reinecke's avatar
Martin Reinecke committed
509
510
511
512
513
514
      for (size_t i=0; i<n; ++i)
        {
        p_i[i] = p_ii;
        p_o[i] = p_oi;
        advance_i();
        }
Martin Reinecke's avatar
Martin Reinecke committed
515
516
517
518
519
520
      uni_i = uni_o = true;
      for (size_t i=1; i<n; ++i)
        {
        uni_i = uni_i && (p_i[i]-p_i[i-1] == sstr_i);
        uni_o = uni_o && (p_o[i]-p_o[i-1] == sstr_o);
        }
Martin Reinecke's avatar
Martin Reinecke committed
521
522
      rem -= n;
      }
Martin Reinecke's avatar
Martin Reinecke committed
523
524
525
526
527
528
529
530
531
532
533
534
535
536
    ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*cstr_i; }
    ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*cstr_i; }
    ptrdiff_t iofs_uni(size_t j, size_t i) const { return p_i[0] + ptrdiff_t(j)*sstr_i + ptrdiff_t(i)*cstr_i; }
    ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*cstr_o; }
    ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*cstr_o; }
    ptrdiff_t oofs_uni(size_t j, size_t i) const { return p_o[0] + ptrdiff_t(j)*sstr_o + ptrdiff_t(i)*cstr_o; }
    bool uniform_i() const { return uni_i; } 
    ptrdiff_t unistride_i() const { return sstr_i; } 
    bool uniform_o() const { return uni_o; } 
    ptrdiff_t unistride_o() const { return sstr_o; } 
    size_t length_in() const { return cshp_i; }
    size_t length_out() const { return cshp_o; }
    ptrdiff_t stride_in() const { return cstr_i; }
    ptrdiff_t stride_out() const { return cstr_o; }
Martin Reinecke's avatar
Martin Reinecke committed
537
538
539
540
541
542
543
    size_t remaining() const { return rem; }
  };

class rev_iter
  {
  private:
    shape_t pos;
Martin Reinecke's avatar
Martin Reinecke committed
544
    fmav_info arr;
Martin Reinecke's avatar
Martin Reinecke committed
545
546
    std::vector<char> rev_axis;
    std::vector<char> rev_jump;
Martin Reinecke's avatar
Martin Reinecke committed
547
548
549
550
551
552
    size_t last_axis, last_size;
    shape_t shp;
    ptrdiff_t p, rp;
    size_t rem;

  public:
Martin Reinecke's avatar
Martin Reinecke committed
553
    rev_iter(const fmav_info &arr_, const shape_t &axes)
Martin Reinecke's avatar
Martin Reinecke committed
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
      : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0),
        rev_jump(arr_.ndim(), 1), p(0), rp(0)
      {
      for (auto ax: axes)
        rev_axis[ax]=1;
      last_axis = axes.back();
      last_size = arr.shape(last_axis)/2 + 1;
      shp = arr.shape();
      shp[last_axis] = last_size;
      rem=1;
      for (auto i: shp)
        rem *= i;
      }
    void advance()
      {
      --rem;
      for (int i_=int(pos.size())-1; i_>=0; --i_)
        {
        auto i = size_t(i_);
        p += arr.stride(i);
        if (!rev_axis[i])
          rp += arr.stride(i);
        else
          {
          rp -= arr.stride(i);
          if (rev_jump[i])
            {
            rp += ptrdiff_t(arr.shape(i))*arr.stride(i);
            rev_jump[i] = 0;
            }
          }
        if (++pos[i] < shp[i])
          return;
        pos[i] = 0;
        p -= ptrdiff_t(shp[i])*arr.stride(i);
        if (rev_axis[i])
          {
          rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i);
          rev_jump[i] = 1;
          }
        else
          rp -= ptrdiff_t(shp[i])*arr.stride(i);
        }
      }
    ptrdiff_t ofs() const { return p; }
    ptrdiff_t rev_ofs() const { return rp; }
    size_t remaining() const { return rem; }
  };

Martin Reinecke's avatar
Martin Reinecke committed
603
template<typename T, typename T0> DUCC0_NOINLINE aligned_array<T> alloc_tmp
Martin Reinecke's avatar
update    
Martin Reinecke committed
604
  (const fmav_info &info, size_t axsize)
Martin Reinecke's avatar
Martin Reinecke committed
605
  {
Martin Reinecke's avatar
update    
Martin Reinecke committed
606
  auto othersize = info.size()/axsize;
Martin Reinecke's avatar
Martin Reinecke committed
607
  constexpr auto vlen = native_simd<T0>::size();
Martin Reinecke's avatar
Martin Reinecke committed
608
  // FIXME: when switching to C++20, use bit_floor(othersize)
609
  return aligned_array<T>(axsize*std::min(vlen, othersize));
Martin Reinecke's avatar
Martin Reinecke committed
610
611
  }

Martin Reinecke's avatar
Martin Reinecke committed
612
template<typename T, typename T0> DUCC0_NOINLINE aligned_array<T> alloc_tmp
613
  (const fmav_info &info, size_t axsize, size_t bufsize, bool inplace=false)
Martin Reinecke's avatar
Martin Reinecke committed
614
  {
615
616
  if (inplace)
    return aligned_array<T>(bufsize);
Martin Reinecke's avatar
Martin Reinecke committed
617
618
619
620
621
622
  auto othersize = info.size()/axsize;
  constexpr auto vlen = native_simd<T0>::size();
  // FIXME: when switching to C++20, use bit_floor(othersize)
  return aligned_array<T>((axsize+bufsize)*std::min(vlen, othersize));
  }

623
template <typename Tsimd, typename Titer> DUCC0_NOINLINE void copy_input(const Titer &it,
624
  const fmav<Cmplx<typename Tsimd::value_type>> &src, Cmplx<Tsimd> *DUCC0_RESTRICT dst)
Martin Reinecke's avatar
Martin Reinecke committed
625
  {
626
  constexpr auto vlen=Tsimd::size();
Martin Reinecke's avatar
Martin Reinecke committed
627
628
  if (it.uniform_i())
    {
Martin Reinecke's avatar
Martin Reinecke committed
629
    auto ptr = &src.craw(it.iofs_uni(0,0));
Martin Reinecke's avatar
Martin Reinecke committed
630
631
632
    auto jstr = it.unistride_i();
    auto istr = it.stride_in();
    if (istr==1)
633
      for (size_t i=0; i<it.length_in(); ++i)
634
        {
635
        Cmplx<Tsimd> stmp;
636
        for (size_t j=0; j<vlen; ++j)
637
          {
638
          auto tmp = ptr[ptrdiff_t(j)*jstr+ptrdiff_t(i)];
639
640
641
642
643
          stmp.r[j] = tmp.r;
          stmp.i[j] = tmp.i;
          }
        dst[i] = stmp;
        }
Martin Reinecke's avatar
Martin Reinecke committed
644
    else if (jstr==1)
645
      for (size_t i=0; i<it.length_in(); ++i)
646
        {
647
        Cmplx<Tsimd> stmp;
648
        for (size_t j=0; j<vlen; ++j)
649
          {
650
          auto tmp = ptr[ptrdiff_t(j)+ptrdiff_t(i)*istr];
651
652
653
654
655
          stmp.r[j] = tmp.r;
          stmp.i[j] = tmp.i;
          }
        dst[i] = stmp;
        }
Martin Reinecke's avatar
Martin Reinecke committed
656
657
658
    else
      for (size_t i=0; i<it.length_in(); ++i)
        {
659
        Cmplx<Tsimd> stmp;
Martin Reinecke's avatar
Martin Reinecke committed
660
661
        for (size_t j=0; j<vlen; ++j)
          {
Martin Reinecke's avatar
Martin Reinecke committed
662
          auto tmp = src.craw(it.iofs_uni(j,i));
Martin Reinecke's avatar
Martin Reinecke committed
663
664
665
666
667
668
669
670
          stmp.r[j] = tmp.r;
          stmp.i[j] = tmp.i;
          }
        dst[i] = stmp;
        }
    }
  else
    for (size_t i=0; i<it.length_in(); ++i)
671
      {
672
      Cmplx<Tsimd> stmp;
Martin Reinecke's avatar
Martin Reinecke committed
673
674
      for (size_t j=0; j<vlen; ++j)
        {
Martin Reinecke's avatar
Martin Reinecke committed
675
        auto tmp = src.craw(it.iofs(j,i));
Martin Reinecke's avatar
Martin Reinecke committed
676
677
678
679
        stmp.r[j] = tmp.r;
        stmp.i[j] = tmp.i;
        }
      dst[i] = stmp;
680
      }
Martin Reinecke's avatar
Martin Reinecke committed
681
682
  }

683
template <typename Tsimd, typename Titer> DUCC0_NOINLINE void copy_input(const Titer &it,
684
  const fmav<typename Tsimd::value_type> &src, Tsimd *DUCC0_RESTRICT dst)
Martin Reinecke's avatar
Martin Reinecke committed
685
  {
686
  constexpr auto vlen=Tsimd::size();
Martin Reinecke's avatar
Martin Reinecke committed
687
  if (it.uniform_i())
688
    {
Martin Reinecke's avatar
Martin Reinecke committed
689
    auto ptr = &src.craw(it.iofs_uni(0,0));
690
691
    auto jstr = it.unistride_i();
    auto istr = it.stride_in();
Martin Reinecke's avatar
Martin Reinecke committed
692
// FIXME: flip loops to avoid critical strides?
693
694
695
696
697
698
699
700
701
702
703
    if (istr==1)
      for (size_t i=0; i<it.length_in(); ++i)
        for (size_t j=0; j<vlen; ++j)
          dst[i][j] = ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i)];
    else if (jstr==1)
      for (size_t i=0; i<it.length_in(); ++i)
        for (size_t j=0; j<vlen; ++j)
          dst[i][j] = ptr[ptrdiff_t(j) + ptrdiff_t(i)*istr];
    else
      for (size_t i=0; i<it.length_in(); ++i)
        for (size_t j=0; j<vlen; ++j)
Martin Reinecke's avatar
Martin Reinecke committed
704
          dst[i][j] = src.craw(it.iofs_uni(j,i));
705
    }
Martin Reinecke's avatar
Martin Reinecke committed
706
  else
707
    for (size_t i=0; i<it.length_in(); ++i)
Martin Reinecke's avatar
Martin Reinecke committed
708
      for (size_t j=0; j<vlen; ++j)
Martin Reinecke's avatar
Martin Reinecke committed
709
        dst[i][j] = src.craw(it.iofs(j,i));
Martin Reinecke's avatar
Martin Reinecke committed
710
711
  }

Martin Reinecke's avatar
Martin Reinecke committed
712
713
template <typename T, size_t vlen> DUCC0_NOINLINE void copy_input(const multi_iter<vlen> &it,
  const fmav<T> &src, T *DUCC0_RESTRICT dst)
Martin Reinecke's avatar
Martin Reinecke committed
714
  {
Martin Reinecke's avatar
Martin Reinecke committed
715
  if (dst == &src.craw(it.iofs(0))) return;  // in-place
Martin Reinecke's avatar
Martin Reinecke committed
716
  for (size_t i=0; i<it.length_in(); ++i)
Martin Reinecke's avatar
Martin Reinecke committed
717
    dst[i] = src.craw(it.iofs(i));
Martin Reinecke's avatar
Martin Reinecke committed
718
719
  }

720
template<typename Tsimd, typename Titer> DUCC0_NOINLINE void copy_output(const Titer &it,
721
  const Cmplx<Tsimd> *DUCC0_RESTRICT src, fmav<Cmplx<typename Tsimd::value_type>> &dst)
Martin Reinecke's avatar
Martin Reinecke committed
722
  {
723
  constexpr auto vlen=Tsimd::size();
Martin Reinecke's avatar
Martin Reinecke committed
724
  if (it.uniform_o())
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
    {
    auto ptr = &dst.vraw(it.oofs_uni(0,0));
    auto jstr = it.unistride_o();
    auto istr = it.stride_out();
    if (istr==1)
      for (size_t i=0; i<it.length_out(); ++i)
        for (size_t j=0; j<vlen; ++j)
          ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i)].Set(src[i].r[j],src[i].i[j]);
    else if (jstr==1)
      for (size_t i=0; i<it.length_out(); ++i)
        for (size_t j=0; j<vlen; ++j)
          ptr[ptrdiff_t(j) + ptrdiff_t(i)*istr].Set(src[i].r[j],src[i].i[j]);
    else
      for (size_t i=0; i<it.length_out(); ++i)
        for (size_t j=0; j<vlen; ++j)
          dst.vraw(it.oofs_uni(j,i)).Set(src[i].r[j],src[i].i[j]);
    }
Martin Reinecke's avatar
Martin Reinecke committed
742
  else
743
744
    {
    auto ptr = dst.vdata();
745
    for (size_t i=0; i<it.length_out(); ++i)
Martin Reinecke's avatar
Martin Reinecke committed
746
747
      for (size_t j=0; j<vlen; ++j)
        ptr[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
748
    }
Martin Reinecke's avatar
Martin Reinecke committed
749
750
  }

751
template<typename Tsimd, typename Titer> DUCC0_NOINLINE void copy_output(const Titer &it,
752
  const Tsimd *DUCC0_RESTRICT src, fmav<typename Tsimd::value_type> &dst)
Martin Reinecke's avatar
Martin Reinecke committed
753
  {
754
  constexpr auto vlen=Tsimd::size();
Martin Reinecke's avatar
Martin Reinecke committed
755
  if (it.uniform_o())
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
    {
    auto ptr = &dst.vraw(it.oofs_uni(0,0));
    auto jstr = it.unistride_o();
    auto istr = it.stride_out();
    if (istr==1)
      for (size_t i=0; i<it.length_out(); ++i)
        for (size_t j=0; j<vlen; ++j)
          ptr[ptrdiff_t(j)*jstr + ptrdiff_t(i)] = src[i][j];
    else if (jstr==1)
      for (size_t i=0; i<it.length_out(); ++i)
        for (size_t j=0; j<vlen; ++j)
          ptr[ptrdiff_t(j) + ptrdiff_t(i)*istr] = src[i][j];
    else
      for (size_t i=0; i<it.length_out(); ++i)
        for (size_t j=0; j<vlen; ++j)
          dst.vraw(it.oofs_uni(j,i)) = src[i][j];
    }
Martin Reinecke's avatar
Martin Reinecke committed
773
  else
774
775
    {
    auto ptr=dst.vdata();
776
    for (size_t i=0; i<it.length_out(); ++i)
Martin Reinecke's avatar
Martin Reinecke committed
777
778
      for (size_t j=0; j<vlen; ++j)
        ptr[it.oofs(j,i)] = src[i][j];
779
    }
Martin Reinecke's avatar
Martin Reinecke committed
780
781
  }

Martin Reinecke's avatar
Martin Reinecke committed
782
783
template<typename T, size_t vlen> DUCC0_NOINLINE void copy_output(const multi_iter<vlen> &it,
  const T *DUCC0_RESTRICT src, fmav<T> &dst)
Martin Reinecke's avatar
Martin Reinecke committed
784
  {
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
785
  auto ptr=dst.vdata();
Martin Reinecke's avatar
Martin Reinecke committed
786
  if (src == &dst.craw(it.oofs(0))) return;  // in-place
Martin Reinecke's avatar
Martin Reinecke committed
787
  for (size_t i=0; i<it.length_out(); ++i)
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
788
    ptr[it.oofs(i)] = src[i];
Martin Reinecke's avatar
Martin Reinecke committed
789
790
  }

Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
791
792
template <typename T, size_t vlen> struct add_vec
  { using type = typename simd_select<T, vlen>::type; };
793
template <typename T, size_t vlen> struct add_vec<Cmplx<T>, vlen>
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
794
  { using type = Cmplx<typename simd_select<T, vlen>::type>; };
795
template <typename T, size_t vlen> using add_vec_t = typename add_vec<T, vlen>::type;
Martin Reinecke's avatar
Martin Reinecke committed
796
797

template<typename Tplan, typename T, typename T0, typename Exec>
Martin Reinecke's avatar
Martin Reinecke committed
798
DUCC0_NOINLINE void general_nd(const fmav<T> &in, fmav<T> &out,
Martin Reinecke's avatar
Martin Reinecke committed
799
800
  const shape_t &axes, T0 fct, size_t nthreads, const Exec &exec,
  const bool /*allow_inplace*/=true)
Martin Reinecke's avatar
Martin Reinecke committed
801
  {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
802
  std::unique_ptr<Tplan> plan;
Martin Reinecke's avatar
Martin Reinecke committed
803
  size_t nth1d = (in.ndim()==1) ? nthreads : 1;
804
  bool inplace = (out.ndim()==1)&&(out.stride(0)==1);
Martin Reinecke's avatar
Martin Reinecke committed
805
806
807
808
809

  for (size_t iax=0; iax<axes.size(); ++iax)
    {
    size_t len=in.shape(axes[iax]);
    if ((!plan) || (len!=plan->length()))
Martin Reinecke's avatar
Martin Reinecke committed
810
      plan = std::make_unique<Tplan>(len, in.ndim()==1);
Martin Reinecke's avatar
Martin Reinecke committed
811
812

    execParallel(
Martin Reinecke's avatar
Martin Reinecke committed
813
      util::thread_count(nthreads, in, axes[iax], native_simd<T0>::size()),
Martin Reinecke's avatar
rework    
Martin Reinecke committed
814
      [&](Scheduler &sched) {
Martin Reinecke's avatar
Martin Reinecke committed
815
        constexpr auto vlen = native_simd<T0>::size();
816
        auto storage = alloc_tmp<T,T0>(in, len, plan->bufsize(), inplace);
Martin Reinecke's avatar
Martin Reinecke committed
817
        const auto &tin(iax==0? in : out);
Martin Reinecke's avatar
rework    
Martin Reinecke committed
818
        multi_iter<vlen> it(tin, out, axes[iax], sched.num_threads(), sched.thread_num());
Martin Reinecke's avatar
Martin Reinecke committed
819
#ifndef DUCC0_NO_SIMD
820
        if constexpr (vlen>1)
Martin Reinecke's avatar
Martin Reinecke committed
821
822
823
          while (it.remaining()>=vlen)
            {
            it.advance(vlen);
824
            auto tdatav = reinterpret_cast<add_vec_t<T, vlen> *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
825
            exec(it, tin, out, tdatav, *plan, fct, nth1d);
826
            }
Martin Reinecke's avatar
Martin Reinecke committed
827
828
829
830
831
832
        if constexpr (vlen>2)
          if constexpr (simd_exists<T0,vlen/2>)
            if (it.remaining()>=vlen/2)
              {
              it.advance(vlen/2);
              auto tdatav = reinterpret_cast<add_vec_t<T, vlen/2> *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
833
              exec(it, tin, out, tdatav, *plan, fct, nth1d);
Martin Reinecke's avatar
Martin Reinecke committed
834
835
836
837
838
839
840
              }
        if constexpr (vlen>4)
          if constexpr (simd_exists<T0,vlen/4>)
            if (it.remaining()>=vlen/4)
              {
              it.advance(vlen/4);
              auto tdatav = reinterpret_cast<add_vec_t<T, vlen/4> *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
841
              exec(it, tin, out, tdatav, *plan, fct, nth1d);
Martin Reinecke's avatar
Martin Reinecke committed
842
              }
Martin Reinecke's avatar
Martin Reinecke committed
843
844
845
846
#endif
        while (it.remaining()>0)
          {
          it.advance(1);
847
          exec(it, tin, out, storage.data(), *plan, fct, nth1d, inplace);
Martin Reinecke's avatar
Martin Reinecke committed
848
849
850
851
852
853
854
855
856
857
          }
      });  // end of parallel region
    fct = T0(1); // factor has been applied, use 1 for remaining axes
    }
  }

struct ExecC2C
  {
  bool forward;

858
859
  template <typename T0, typename T, typename Titer> DUCC0_NOINLINE void operator() (
    const Titer &it, const fmav<Cmplx<T0>> &in,
860
861
    fmav<Cmplx<T0>> &out, T *buf, const pocketfft_c<T0> &plan, T0 fct,
    size_t nthreads, bool inplace=false) const
Martin Reinecke's avatar
Martin Reinecke committed
862
    {
863
864
865
    if constexpr(is_same<Cmplx<T0>, T>::value)
      if (inplace)
        {
Martin Reinecke's avatar
more    
Martin Reinecke committed
866
867
        if (in.cdata()!=out.vdata())
          copy_input(it, in, out.vdata());
868
869
870
        plan.exec_copyback(out.vdata(), buf, fct, forward, nthreads);
        return;
        }
Martin Reinecke's avatar
Martin Reinecke committed
871
    T *buf1=buf, *buf2=buf+plan.bufsize();
Martin Reinecke's avatar
Martin Reinecke committed
872
    copy_input(it, in, buf2);
Martin Reinecke's avatar
Martin Reinecke committed
873
    auto res = plan.exec(buf2, buf1, fct, forward, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
874
    copy_output(it, res, out);
Martin Reinecke's avatar
Martin Reinecke committed
875
876
877
878
879
    }
  };

struct ExecHartley
  {
880
881
  template <typename T0, typename T, typename Titer> DUCC0_NOINLINE void operator () (
    const Titer &it, const fmav<T0> &in, fmav<T0> &out,
882
883
    T *buf, const pocketfft_hartley<T0> &plan, T0 fct, size_t nthreads,
    bool inplace=false) const
Martin Reinecke's avatar
Martin Reinecke committed
884
    {
885
886
887
    if constexpr(is_same<T0, T>::value)
      if (inplace)
        {
Martin Reinecke's avatar
more    
Martin Reinecke committed
888
889
        if (in.cdata()!=out.vdata())
          copy_input(it, in, out.vdata());
890
891
892
        plan.exec_copyback(out.vdata(), buf, fct, nthreads);
        return;
        }
Martin Reinecke's avatar
Martin Reinecke committed
893
894
    T *buf1=buf, *buf2=buf+plan.bufsize(); 
    copy_input(it, in, buf2);
Martin Reinecke's avatar
Martin Reinecke committed
895
896
    auto res = plan.exec(buf2, buf1, fct, nthreads);
    copy_output(it, res, out);
Martin Reinecke's avatar
Martin Reinecke committed
897
898
899
900
901
902
903
904
905
    }
  };

struct ExecDcst
  {
  bool ortho;
  int type;
  bool cosine;

906
907
  template <typename T0, typename T, typename Tplan, typename Titer>
  DUCC0_NOINLINE void operator () (const Titer &it, const fmav<T0> &in,
908
909
    fmav <T0> &out, T * buf, const Tplan &plan, T0 fct, size_t nthreads,
    bool inplace=false) const
Martin Reinecke's avatar
Martin Reinecke committed
910
    {
911
912
913
    if constexpr(is_same<T0, T>::value)
      if (inplace)
        {
Martin Reinecke's avatar
more    
Martin Reinecke committed
914
915
        if (in.cdata()!=out.vdata())
          copy_input(it, in, out.vdata());
916
917
918
919
920
        auto res = plan.exec(out.vdata(), buf, fct, ortho, type, cosine, nthreads);
        if (res!=out.vdata())
          copy_n(res, plan.length(), out.vdata());
        return;
        }
Martin Reinecke's avatar
Martin Reinecke committed
921
922
    T *buf1=buf, *buf2=buf+plan.bufsize(); 
    copy_input(it, in, buf2);
Martin Reinecke's avatar
more    
Martin Reinecke committed
923
    auto res = plan.exec(buf2, buf1, fct, ortho, type, cosine, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
924
    copy_output(it, res, out);
Martin Reinecke's avatar
Martin Reinecke committed
925
926
927
    }
  };

Martin Reinecke's avatar
Martin Reinecke committed
928
template<typename T> DUCC0_NOINLINE void general_r2c(
Martin Reinecke's avatar
stage 1    
Martin Reinecke committed
929
  const fmav<T> &in, fmav<Cmplx<T>> &out, size_t axis, bool forward, T fct,
Martin Reinecke's avatar
Martin Reinecke committed
930
931
  size_t nthreads)
  {
Martin Reinecke's avatar
Martin Reinecke committed
932
  size_t nth1d = (in.ndim()==1) ? nthreads : 1;
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
933
  auto plan = std::make_unique<pocketfft_r<T>>(in.shape(axis));
Martin Reinecke's avatar
Martin Reinecke committed
934
935
  size_t len=in.shape(axis);
  execParallel(
Martin Reinecke's avatar
Martin Reinecke committed
936
    util::thread_count(nthreads, in, axis, native_simd<T>::size()),
Martin Reinecke's avatar
rework    
Martin Reinecke committed
937
    [&](Scheduler &sched) {
Martin Reinecke's avatar
Martin Reinecke committed
938
    constexpr auto vlen = native_simd<T>::size();
Martin Reinecke's avatar
Martin Reinecke committed
939
    auto storage = alloc_tmp<T,T>(in, len, plan->bufsize());
Martin Reinecke's avatar
rework    
Martin Reinecke committed
940
    multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
Martin Reinecke's avatar
Martin Reinecke committed
941
#ifndef DUCC0_NO_SIMD
942
    if constexpr (vlen>1)
Martin Reinecke's avatar
Martin Reinecke committed
943
944
945
      while (it.remaining()>=vlen)
        {
        it.advance(vlen);
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
946
        auto tdatav = reinterpret_cast<native_simd<T> *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
947
        copy_input(it, in, tdatav);
Martin Reinecke's avatar
Martin Reinecke committed
948
        plan->exec(tdatav, fct, true, nth1d);
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
949
        auto vout = out.vdata();
Martin Reinecke's avatar
Martin Reinecke committed
950
        for (size_t j=0; j<vlen; ++j)
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
951
          vout[it.oofs(j,0)].Set(tdatav[0][j]);
Martin Reinecke's avatar
Martin Reinecke committed
952
953
954
955
        size_t i=1, ii=1;
        if (forward)
          for (; i<len-1; i+=2, ++ii)
            for (size_t j=0; j<vlen; ++j)
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
956
              vout[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
Martin Reinecke's avatar
Martin Reinecke committed
957
958
959
        else
          for (; i<len-1; i+=2, ++ii)
            for (size_t j=0; j<vlen; ++j)
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
960
              vout[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
Martin Reinecke's avatar
Martin Reinecke committed
961
962
        if (i<len)
          for (size_t j=0; j<vlen; ++j)
Martin Reinecke's avatar
stage 2    
Martin Reinecke committed
963
            vout[it.oofs(j,ii)].Set(tdatav[i][j]);
Martin Reinecke's avatar
Martin Reinecke committed
964
        }
Martin Reinecke's avatar
Martin Reinecke committed
965
966
967
968
969
    if constexpr (vlen>2)
      if constexpr (simd_exists<T,vlen/2>)
        if (it.remaining()>=vlen/2)
          {
          it.advance(vlen/2);
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
970
          auto tdatav = reinterpret_cast<typename simd_select<T,vlen/2>::type *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
971
          copy_input(it, in, tdatav);
Martin Reinecke's avatar
Martin Reinecke committed
972
          plan->exec(tdatav, fct, true, nth1d);
Martin Reinecke's avatar
Martin Reinecke committed
973
          auto vout = out.vdata();
974
          for (size_t j=0; j<vlen/2; ++j)
Martin Reinecke's avatar
Martin Reinecke committed
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
            vout[it.oofs(j,0)].Set(tdatav[0][j]);
          size_t i=1, ii=1;
          if (forward)
            for (; i<len-1; i+=2, ++ii)
              for (size_t j=0; j<vlen/2; ++j)
                vout[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
          else
            for (; i<len-1; i+=2, ++ii)
              for (size_t j=0; j<vlen/2; ++j)
                vout[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
          if (i<len)
            for (size_t j=0; j<vlen/2; ++j)
              vout[it.oofs(j,ii)].Set(tdatav[i][j]);
          }
    if constexpr (vlen>4)
      if constexpr( simd_exists<T,vlen/4>)
        if (it.remaining()>=vlen/4)
          {
          it.advance(vlen/4);
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
994
          auto tdatav = reinterpret_cast<typename simd_select<T,vlen/4>::type *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
995
          copy_input(it, in, tdatav);
Martin Reinecke's avatar
Martin Reinecke committed
996
          plan->exec(tdatav, fct, true, nth1d);
Martin Reinecke's avatar
Martin Reinecke committed
997
          auto vout = out.vdata();
998
          for (size_t j=0; j<vlen/4; ++j)
Martin Reinecke's avatar
Martin Reinecke committed
999
1000
            vout[it.oofs(j,0)].Set(tdatav[0][j]);
          size_t i=1, ii=1;
For faster browsing, not all history is shown. View entire blame