totalconvolve.h 30.9 KB
Newer Older
Martin Reinecke's avatar
Martin Reinecke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 *  This code is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This code is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this code; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

17
/*
Martin Reinecke's avatar
Martin Reinecke committed
18
 *  Copyright (C) 2020-2021 Max-Planck-Society
19
20
21
 *  Author: Martin Reinecke
 */

Martin Reinecke's avatar
Martin Reinecke committed
22
23
#ifndef DUCC0_TOTALCONVOLVE_H
#define DUCC0_TOTALCONVOLVE_H
24

Martin Reinecke's avatar
Martin Reinecke committed
25
26
27
28
29
30
#include <cstdint>
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory>
#include <type_traits>
31
32
#include <vector>
#include <complex>
Martin Reinecke's avatar
Martin Reinecke committed
33
#include <cmath>
Martin Reinecke's avatar
Martin Reinecke committed
34
35
36
#include <mutex>
#include "ducc0/infra/error_handling.h"
#include "ducc0/infra/threading.h"
Martin Reinecke's avatar
Martin Reinecke committed
37
#include "ducc0/math/constants.h"
Martin Reinecke's avatar
Martin Reinecke committed
38
#include "ducc0/math/gridding_kernel.h"
Martin Reinecke's avatar
Martin Reinecke committed
39
40
#include "ducc0/infra/mav.h"
#include "ducc0/infra/simd.h"
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
41
#include "ducc0/infra/aligned_array.h"
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
42
#include "ducc0/infra/useful_macros.h"
Martin Reinecke's avatar
Martin Reinecke committed
43
#include "ducc0/infra/bucket_sort.h"
44
45
#include "ducc0/sht/sht.h"
#include "ducc0/sht/alm.h"
Martin Reinecke's avatar
Martin Reinecke committed
46
47
#include "ducc0/fft/fft1d.h"
#include "ducc0/fft/fft.h"
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
48
#include "ducc0/math/math_utils.h"
Martin Reinecke's avatar
Martin Reinecke committed
49

Martin Reinecke's avatar
Martin Reinecke committed
50
namespace ducc0 {
51

Martin Reinecke's avatar
Martin Reinecke committed
52
53
54
55
56
namespace detail_fft {

using std::vector;

template<typename T, typename T0> aligned_array<T> alloc_tmp_conv
57
  (const fmav_info &info, size_t axis, size_t len, size_t bufsize)
Martin Reinecke's avatar
Martin Reinecke committed
58
59
60
  {
  auto othersize = info.size()/info.shape(axis);
  constexpr auto vlen = native_simd<T0>::size();
61
  return aligned_array<T>((len+bufsize)*std::min(vlen, othersize));
Martin Reinecke's avatar
Martin Reinecke committed
62
63
64
  }

template<typename Tplan, typename T, typename T0, typename Exec>
Martin Reinecke's avatar
Martin Reinecke committed
65
DUCC0_NOINLINE void general_convolve(const fmav<T> &in, fmav<T> &out,
Martin Reinecke's avatar
Martin Reinecke committed
66
  const size_t axis, const vector<T0> &kernel, size_t nthreads,
Martin Reinecke's avatar
Martin Reinecke committed
67
  const Exec &exec)
Martin Reinecke's avatar
Martin Reinecke committed
68
  {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
69
  std::unique_ptr<Tplan> plan1, plan2;
Martin Reinecke's avatar
Martin Reinecke committed
70
71
72
73

  size_t l_in=in.shape(axis), l_out=out.shape(axis);
  size_t l_min=std::min(l_in, l_out), l_max=std::max(l_in, l_out);
  MR_assert(kernel.size()==l_min/2+1, "bad kernel size");
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
74
75
  plan1 = std::make_unique<Tplan>(l_in);
  plan2 = std::make_unique<Tplan>(l_out);
76
  size_t bufsz = max(plan1->bufsize(), plan2->bufsize());
Martin Reinecke's avatar
Martin Reinecke committed
77
78
79
80
81

  execParallel(
    util::thread_count(nthreads, in, axis, native_simd<T0>::size()),
    [&](Scheduler &sched) {
      constexpr auto vlen = native_simd<T0>::size();
82
      auto storage = alloc_tmp_conv<T,T0>(in, axis, l_max, bufsz);
Martin Reinecke's avatar
Martin Reinecke committed
83
      multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
Martin Reinecke's avatar
Martin Reinecke committed
84
#ifndef DUCC0_NO_SIMD
85
      if constexpr (vlen>1)
Martin Reinecke's avatar
Martin Reinecke committed
86
87
88
        while (it.remaining()>=vlen)
          {
          it.advance(vlen);
89
90
91
          auto tdatav = reinterpret_cast<add_vec_t<T, vlen> *>(storage.data());
          exec(it, in, out, tdatav, *plan1, *plan2, kernel);
          }
Martin Reinecke's avatar
Martin Reinecke committed
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
      if constexpr (vlen>2)
        if constexpr (simd_exists<T,vlen/2>)
          if (it.remaining()>=vlen/2)
            {
            it.advance(vlen/2);
            auto tdatav = reinterpret_cast<add_vec_t<T, vlen/2> *>(storage.data());
            exec(it, in, out, tdatav, *plan1, *plan2, kernel);
            }
      if constexpr (vlen>4)
        if constexpr (simd_exists<T,vlen/4>)
          if (it.remaining()>=vlen/4)
            {
            it.advance(vlen/4);
            auto tdatav = reinterpret_cast<add_vec_t<T, vlen/4> *>(storage.data());
            exec(it, in, out, tdatav, *plan1, *plan2, kernel);
            }
Martin Reinecke's avatar
Martin Reinecke committed
108
109
110
111
#endif
      while (it.remaining()>0)
        {
        it.advance(1);
Martin Reinecke's avatar
Martin Reinecke committed
112
        auto buf = reinterpret_cast<T *>(storage.data());
Martin Reinecke's avatar
Martin Reinecke committed
113
114
115
116
117
118
119
        exec(it, in, out, buf, *plan1, *plan2, kernel);
        }
    });  // end of parallel region
  }

struct ExecConvR1
  {
Martin Reinecke's avatar
update    
Martin Reinecke committed
120
121
  template <typename T0, typename T, typename Titer> void operator() (
    const Titer &it, const fmav<T0> &in, fmav<T0> &out,
Martin Reinecke's avatar
Martin Reinecke committed
122
123
124
125
126
    T * buf, const pocketfft_r<T0> &plan1, const pocketfft_r<T0> &plan2,
    const vector<T0> &kernel) const
    {
    size_t l_in = plan1.length(),
           l_out = plan2.length(),
127
128
129
130
131
132
133
134
135
           l_min = std::min(l_in, l_out),
           bufsz = max(plan1.bufsize(), plan2.bufsize());
    T *buf1=buf, *buf2=buf+bufsz; 
    copy_input(it, in, buf2);
    plan1.exec_copyback(buf2, buf1, T0(1), true);
    for (size_t i=0; i<l_min; ++i) buf2[i]*=kernel[(i+1)/2];
    for (size_t i=l_in; i<l_out; ++i) buf2[i] = T(0);
    auto res = plan2.exec(buf2, buf1, T0(1), false);
    copy_output(it, res, out);
Martin Reinecke's avatar
Martin Reinecke committed
136
137
138
139
140
141
142
143
    }
  };

template<typename T> void convolve_1d(const fmav<T> &in,
  fmav<T> &out, size_t axis, const vector<T> &kernel, size_t nthreads=1)
  {
  MR_assert(axis<in.ndim(), "bad axis number");
  MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
144
  if (in.cdata()==out.cdata())
Martin Reinecke's avatar
Martin Reinecke committed
145
    MR_assert(in.stride()==out.stride(), "strides mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
146
147
148
  for (size_t i=0; i<in.ndim(); ++i)
    if (i!=axis)
      MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
Martin Reinecke's avatar
Martin Reinecke committed
149
150
  MR_assert(!((in.shape(axis)&1) || (out.shape(axis)&1)),
    "input and output axis lengths must be even");
Martin Reinecke's avatar
Martin Reinecke committed
151
152
153
154
155
156
  if (in.size()==0) return;
  general_convolve<pocketfft_r<T>>(in, out, axis, kernel, nthreads,
    ExecConvR1());
  }

}
Martin Reinecke's avatar
Martin Reinecke committed
157
158
159

using detail_fft::convolve_1d;

160
namespace detail_totalconvolve {
161

Martin Reinecke's avatar
Martin Reinecke committed
162
using namespace std;
163

164
template<typename T> class ConvolverPlan
165
166
  {
  protected:
167
    constexpr static auto vlen = min<size_t>(8, native_simd<T>::size());
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
168
    using Tsimd = typename simd_select<T, vlen>::type;
169
170

    size_t nthreads;
Martin Reinecke's avatar
Martin Reinecke committed
171
    size_t lmax, kmax;
172
173
174
    // _s: small grid
    // _b: oversampled grid
    // no suffix: grid with borders
175
    size_t nphi_s, ntheta_s, npsi_s, nphi_b, ntheta_b, npsi_b;
176
    double dphi, dtheta, dpsi, xdphi, xdtheta, xdpsi;
177

178
    shared_ptr<HornerKernel> kernel;
179
180
    size_t nbphi, nbtheta;
    size_t nphi, ntheta;
181
    double phi0, theta0;
182
183

    void correct(mav<T,2> &arr, int spin) const
184
      {
185
      T sfct = (spin&1) ? -1 : 1;
186
      mav<T,2> tmp({nphi_b,nphi_s});
Martin Reinecke's avatar
Martin Reinecke committed
187
      // copy and extend to second half
188
      for (size_t j=0; j<nphi_s; ++j)
Martin Reinecke's avatar
Martin Reinecke committed
189
        tmp.v(0,j) = arr(0,j);
190
191
      for (size_t i=1, i2=nphi_s-1; i+1<ntheta_s; ++i,--i2)
        for (size_t j=0,j2=nphi_s/2; j<nphi_s; ++j,++j2)
192
          {
193
          if (j2>=nphi_s) j2-=nphi_s;
Martin Reinecke's avatar
Martin Reinecke committed
194
195
          tmp.v(i,j2) = arr(i,j2);
          tmp.v(i2,j) = sfct*tmp(i,j2);
196
          }
197
198
199
      for (size_t j=0; j<nphi_s; ++j)
        tmp.v(ntheta_s-1,j) = arr(ntheta_s-1,j);
      auto fct = kernel->corfunc(nphi_s/2+1, 1./nphi_b, nthreads);
200
      vector<T> k2(fct.size());
201
      for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi_s);
Martin Reinecke's avatar
Martin Reinecke committed
202
      fmav<T> ftmp(tmp);
Martin Reinecke's avatar
Martin Reinecke committed
203
      fmav<T> ftmp0(subarray<2>(tmp, {0,0},{nphi_s, nphi_s}));
204
      convolve_1d(ftmp0, ftmp, 0, k2, nthreads);
Martin Reinecke's avatar
Martin Reinecke committed
205
      fmav<T> ftmp2(subarray<2>(tmp, {0,0},{ntheta_b, nphi_s}));
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
206
      fmav<T> farr(arr);
207
      convolve_1d(ftmp2, farr, 1, k2, nthreads);
208
      }
209
    void decorrect(mav<T,2> &arr, int spin) const
210
      {
211
      T sfct = (spin&1) ? -1 : 1;
212
213
      mav<T,2> tmp({nphi_b,nphi_s});
      auto fct = kernel->corfunc(nphi_s/2+1, 1./nphi_b, nthreads);
214
      vector<T> k2(fct.size());
215
      for (size_t i=0; i<fct.size(); ++i) k2[i] = T(fct[i]/nphi_s);
Martin Reinecke's avatar
Martin Reinecke committed
216
      fmav<T> farr(arr);
Martin Reinecke's avatar
Martin Reinecke committed
217
      fmav<T> ftmp2(subarray<2>(tmp, {0,0},{ntheta_b, nphi_s}));
218
      convolve_1d(farr, ftmp2, 1, k2, nthreads);
219
      // extend to second half
220
221
      for (size_t i=1, i2=nphi_b-1; i+1<ntheta_b; ++i,--i2)
        for (size_t j=0,j2=nphi_s/2; j<nphi_s; ++j,++j2)
222
          {
223
          if (j2>=nphi_s) j2-=nphi_s;
224
225
          tmp.v(i2,j) = sfct*tmp(i,j2);
          }
Martin Reinecke's avatar
Martin Reinecke committed
226
      fmav<T> ftmp(tmp);
Martin Reinecke's avatar
Martin Reinecke committed
227
      fmav<T> ftmp0(subarray<2>(tmp, {0,0},{nphi_s, nphi_s}));
228
      convolve_1d(ftmp, ftmp0, 0, k2, nthreads);
229
      for (size_t j=0; j<nphi_s; ++j)
230
        arr.v(0,j) = T(0.5)*tmp(0,j);
231
232
      for (size_t i=1; i+1<ntheta_s; ++i)
        for (size_t j=0; j<nphi_s; ++j)
Martin Reinecke's avatar
Martin Reinecke committed
233
          arr.v(i,j) = tmp(i,j);
234
235
      for (size_t j=0; j<nphi_s; ++j)
        arr.v(ntheta_s-1,j) = T(0.5)*tmp(ntheta_s-1,j);
236
      }
237

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
238
    aligned_array<uint32_t> getIdx(const mav<T,1> &theta, const mav<T,1> &phi, const mav<T,1> &psi,
239
      size_t patch_ntheta, size_t patch_nphi, size_t itheta0, size_t iphi0, size_t supp) const
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
240
      {
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
241
242
      size_t nptg = theta.shape(0);
      constexpr size_t cellsize=8;
243
      size_t nct = patch_ntheta/cellsize+1,
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
244
             ncp = patch_nphi/cellsize+1,
Martin Reinecke's avatar
fix    
Martin Reinecke committed
245
             ncpsi = npsi_b/cellsize+1;
246
247
      double theta0 = (int(itheta0)-int(nbtheta))*dtheta,
             phi0 = (int(iphi0)-int(nbphi))*dphi;
Martin Reinecke's avatar
Martin Reinecke committed
248
249
      double theta_lo=theta0, theta_hi=theta_lo+(patch_ntheta+1)*dtheta;
      double phi_lo=phi0, phi_hi=phi_lo+(patch_nphi+1)*dphi;
Martin Reinecke's avatar
Martin Reinecke committed
250
      MR_assert(nct*ncp*ncpsi<(size_t(1)<<32), "key space too large");
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
251

Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
252
      aligned_array<uint32_t> key(nptg);
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
253
      execParallel(nptg, nthreads, [&](size_t lo, size_t hi)
Martin Reinecke's avatar
Martin Reinecke committed
254
        {
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
255
256
257
258
        for (size_t i=lo; i<hi; ++i)
          {
          MR_assert((theta(i)>=theta_lo) && (theta(i)<=theta_hi), "theta out of range: ", theta(i));
          MR_assert((phi(i)>=phi_lo) && (phi(i)<=phi_hi), "phi out of range: ", phi(i));
Martin Reinecke's avatar
Martin Reinecke committed
259
          auto ftheta = (theta(i)-theta0)*xdtheta-supp*0.5;
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
260
          auto itheta = size_t(ftheta+1);
Martin Reinecke's avatar
Martin Reinecke committed
261
          auto fphi = (phi(i)-phi0)*xdphi-supp*0.5;
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
262
          auto iphi = size_t(fphi+1);
263
264
          auto fpsi = psi(i)*xdpsi;
          fpsi = fmodulo(fpsi, double(npsi_b));
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
265
266
267
268
269
270
271
272
273
          size_t ipsi = size_t(fpsi);
          ipsi /= cellsize;
          itheta /= cellsize;
          iphi /= cellsize;
          MR_assert(itheta<nct, "bad itheta");
          MR_assert(iphi<ncp, "bad iphi");
          key[i] = (itheta*ncp+iphi)*ncpsi+ipsi;
          }
        });
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
274
275
276
      aligned_array<uint32_t> res(key.size());
      bucket_sort(&key[0], &res[0], key.size(), ncp*nct*ncpsi, nthreads);
      return res;
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
277
278
      }

279
    template<size_t supp> class WeightHelper
Martin Reinecke's avatar
Martin Reinecke committed
280
281
282
283
284
285
286
287
      {
      public:
        static constexpr size_t vlen = Tsimd::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;
        const ConvolverPlan &plan;
        union kbuf {
          T scalar[3*nvec*vlen];
          Tsimd simd[3*nvec];
288
289
290
#if defined(_MSC_VER)
          kbuf() {}
#endif
Martin Reinecke's avatar
Martin Reinecke committed
291
292
293
294
295
          };
        kbuf buf;

      private:
        TemplateKernel<supp, Tsimd> tkrn;
296
        double mytheta0, myphi0;
Martin Reinecke's avatar
Martin Reinecke committed
297
298

      public:
Martin Reinecke's avatar
Martin Reinecke committed
299
        WeightHelper(const ConvolverPlan &plan_, const mav_info<3> &info, size_t itheta0, size_t iphi0)
Martin Reinecke's avatar
Martin Reinecke committed
300
301
302
303
304
305
306
307
308
309
310
          : plan(plan_),
            tkrn(*plan.kernel),
            mytheta0(plan.theta0+itheta0*plan.dtheta),
            myphi0(plan.phi0+iphi0*plan.dphi),
            wpsi(&buf.scalar[0]),
            wtheta(&buf.scalar[nvec*vlen]),
            wphi(&buf.simd[2*nvec]),
            jumptheta(info.stride(1))
          {
          MR_assert(info.stride(2)==1, "last axis of cube must be contiguous");
          }
Martin Reinecke's avatar
Martin Reinecke committed
311
        void prep(double theta, double phi, double psi)
Martin Reinecke's avatar
Martin Reinecke committed
312
          {
313
          auto ftheta = (theta-mytheta0)*plan.xdtheta-supp*0.5;
Martin Reinecke's avatar
Martin Reinecke committed
314
315
          itheta = size_t(ftheta+1);
          ftheta = -1+(itheta-ftheta)*2;
316
          auto fphi = (phi-myphi0)*plan.xdphi-supp*0.5;
Martin Reinecke's avatar
Martin Reinecke committed
317
318
          iphi = size_t(fphi+1);
          fphi = -1+(iphi-fphi)*2;
319
320
          auto fpsi = psi*plan.xdpsi-supp*0.5;
          fpsi = fmodulo(fpsi, double(plan.npsi_b));
Martin Reinecke's avatar
Martin Reinecke committed
321
322
323
          ipsi = size_t(fpsi+1);
          fpsi = -1+(ipsi-fpsi)*2;
          if (ipsi>=plan.npsi_b) ipsi-=plan.npsi_b;
324
          tkrn.eval3(T(fpsi), T(ftheta), T(fphi), &buf.simd[0]);
Martin Reinecke's avatar
Martin Reinecke committed
325
326
327
328
329
330
331
332
          }
        size_t itheta, iphi, ipsi;
        const T * DUCC0_RESTRICT wpsi;
        const T * DUCC0_RESTRICT wtheta;
        const Tsimd * DUCC0_RESTRICT wphi;
        ptrdiff_t jumptheta;
      };

Martin Reinecke's avatar
Martin Reinecke committed
333
334
335
    // prefetching distance
    static constexpr size_t pfdist=2;

Martin Reinecke's avatar
Martin Reinecke committed
336
    template<size_t supp> void interpolx(const mav<T,3> &cube,
Martin Reinecke's avatar
Martin Reinecke committed
337
338
339
340
341
342
343
344
345
346
      size_t itheta0, size_t iphi0, const mav<T,1> &theta, const mav<T,1> &phi,
      const mav<T,1> &psi, mav<T,1> &signal) const
      {
      MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous");
      MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch");
      MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch");
      MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch");
      static constexpr size_t vlen = Tsimd::size();
      static constexpr size_t nvec = (supp+vlen-1)/vlen;
      MR_assert(cube.shape(0)==npsi_b, "bad psi dimension");
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
347
      auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp);
Martin Reinecke's avatar
Martin Reinecke committed
348

Martin Reinecke's avatar
Martin Reinecke committed
349
350
      execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
        {
Martin Reinecke's avatar
Martin Reinecke committed
351
        WeightHelper<supp> hlp(*this, cube, itheta0, iphi0);
Martin Reinecke's avatar
Martin Reinecke committed
352
353
        while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind)
          {
Martin Reinecke's avatar
Martin Reinecke committed
354
          if (ind+pfdist<rng.hi)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
355
            {
Martin Reinecke's avatar
Martin Reinecke committed
356
            size_t i=idx[ind+pfdist];
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
357
358
359
360
361
362
            DUCC0_PREFETCH_R(&theta(i));
            DUCC0_PREFETCH_R(&phi(i))
            DUCC0_PREFETCH_R(&psi(i));
            DUCC0_PREFETCH_R(&signal.v(i));
            DUCC0_PREFETCH_W(&signal.v(i));
            }
Martin Reinecke's avatar
Martin Reinecke committed
363
364
365
366
          size_t i=idx[ind];
          hlp.prep(theta(i), phi(i), psi(i));
          auto ipsi = hlp.ipsi;
          const T * DUCC0_RESTRICT ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
367
368
          Tsimd res=0;
          if constexpr(nvec==1)
Martin Reinecke's avatar
Martin Reinecke committed
369
370
371
372
            {
            for (size_t ipsic=0; ipsic<supp; ++ipsic)
              {
              const T * DUCC0_RESTRICT ptr2 = ptr;
373
374
              Tsimd tres=0;
              for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta)
375
                tres += hlp.wtheta[itheta]*Tsimd(ptr2, element_aligned_tag());
376
377
378
379
380
381
382
383
384
385
386
387
388
              res += tres*hlp.wpsi[ipsic];
              if (++ipsi>=npsi_b) ipsi=0;
              ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
              }
            res *= hlp.wphi[0];
            }
          else
            {
            for (size_t ipsic=0; ipsic<supp; ++ipsic)
              {
              const T * DUCC0_RESTRICT ptr2 = ptr;
              Tsimd tres=0;
              for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta)
Martin Reinecke's avatar
Martin Reinecke committed
389
                for (size_t iphi=0; iphi<nvec; ++iphi)
390
                  tres += hlp.wtheta[itheta]*hlp.wphi[iphi]*Tsimd(ptr2+iphi*vlen,element_aligned_tag());
391
              res += tres*hlp.wpsi[ipsic];
Martin Reinecke's avatar
Martin Reinecke committed
392
393
394
395
              if (++ipsi>=npsi_b) ipsi=0;
              ptr = &cube(ipsi,hlp.itheta,hlp.iphi);
              }
            }
396
          signal.v(i) = reduce(res, std::plus<>());
Martin Reinecke's avatar
Martin Reinecke committed
397
398
399
          }
        });
      }
Martin Reinecke's avatar
Martin Reinecke committed
400
    template<size_t supp> void deinterpolx(mav<T,3> &cube,
401
402
      size_t itheta0, size_t iphi0, const mav<T,1> &theta, const mav<T,1> &phi,
      const mav<T,1> &psi, const mav<T,1> &signal) const
403
      {
404
405
406
407
408
409
      MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous");
      MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch");
      MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch");
      MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch");
      static constexpr size_t vlen = Tsimd::size();
      static constexpr size_t nvec = (supp+vlen-1)/vlen;
Martin Reinecke's avatar
Martin Reinecke committed
410
      MR_assert(cube.shape(0)==npsi_b, "bad psi dimension");
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
411
      auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp);
412
413

      constexpr size_t cellsize=16;
414
415
      size_t nct = cube.shape(1)/cellsize+10,
             ncp = cube.shape(2)/cellsize+10;
416
417
418
      mav<std::mutex,2> locks({nct,ncp});

      execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched)
419
        {
420
        size_t b_theta=99999999999999, b_phi=9999999999999999;
421
        WeightHelper<supp> hlp(*this, cube, itheta0, iphi0);
422
423
        while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind<rng.hi; ++ind)
          {
Martin Reinecke's avatar
Martin Reinecke committed
424
          if (ind+pfdist<rng.hi)
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
425
            {
Martin Reinecke's avatar
Martin Reinecke committed
426
            size_t i=idx[ind+pfdist];
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
427
428
429
430
431
            DUCC0_PREFETCH_R(&theta(i));
            DUCC0_PREFETCH_R(&phi(i))
            DUCC0_PREFETCH_R(&psi(i));
            DUCC0_PREFETCH_R(&signal(i));
            }
432
          size_t i=idx[ind];
433
          hlp.prep(theta(i), phi(i), psi(i));
Martin Reinecke's avatar
Martin Reinecke committed
434
435
          auto ipsi = hlp.ipsi;
          T * DUCC0_RESTRICT ptr = &cube.v(ipsi,hlp.itheta,hlp.iphi);
436
437
438

          size_t b_theta_new = hlp.itheta/cellsize,
                 b_phi_new = hlp.iphi/cellsize;
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
          if ((b_theta_new!=b_theta) || (b_phi_new!=b_phi))
            {
            if (b_theta<locks.shape(0))  // unlock
              {
              locks.v(b_theta,b_phi).unlock();
              locks.v(b_theta,b_phi+1).unlock();
              locks.v(b_theta+1,b_phi).unlock();
              locks.v(b_theta+1,b_phi+1).unlock();
              }
            b_theta = b_theta_new;
            b_phi = b_phi_new;
            locks.v(b_theta,b_phi).lock();
            locks.v(b_theta,b_phi+1).lock();
            locks.v(b_theta+1,b_phi).lock();
            locks.v(b_theta+1,b_phi+1).lock();
            }
Martin Reinecke's avatar
Martin Reinecke committed
455

456
457
            {
            Tsimd tmp=signal(i);
458
            if constexpr (nvec==1)
459
              {
460
461
              tmp *= hlp.wphi[0];
              for (size_t ipsic=0; ipsic<supp; ++ipsic)
462
                {
463
464
465
                auto ttmp=tmp*hlp.wpsi[ipsic];
                T * DUCC0_RESTRICT ptr2 = ptr;
                for (size_t itheta=0; itheta<supp; ++itheta, ptr2+=hlp.jumptheta)
466
                  {
467
                  Tsimd var=Tsimd(ptr2,element_aligned_tag());
468
                  var += ttmp*hlp.wtheta[itheta];
469
                  var.copy_to(ptr2,element_aligned_tag());
470
                  }
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
                if (++ipsi>=npsi_b) ipsi=0;
                ptr = &cube.v(ipsi,hlp.itheta,hlp.iphi);
                }
              }
            else
              {
              for (size_t ipsic=0; ipsic<supp; ++ipsic)
                {
                auto ttmp=tmp*hlp.wpsi[ipsic];
                T * DUCC0_RESTRICT ptr2 = ptr;
                for (size_t itheta=0; itheta<supp; ++itheta)
                  {
                  auto tttmp=ttmp*hlp.wtheta[itheta];
                  for (size_t iphi=0; iphi<nvec; ++iphi)
                    {
486
                    Tsimd var=Tsimd(ptr2+iphi*vlen, element_aligned_tag());
487
                    var += tttmp*hlp.wphi[iphi];
488
                    var.copy_to(ptr2+iphi*vlen, element_aligned_tag());
489
490
491
492
493
                    }
                  ptr2 += hlp.jumptheta;
                  }
                if (++ipsi>=npsi_b) ipsi=0;
                ptr = &cube.v(ipsi,hlp.itheta,hlp.iphi);
494
                }
495
              }
496
            }
497
          }
498
499
500
501
502
503
504
        if (b_theta<locks.shape(0))  // unlock
          {
          locks.v(b_theta,b_phi).unlock();
          locks.v(b_theta,b_phi+1).unlock();
          locks.v(b_theta+1,b_phi).unlock();
          locks.v(b_theta+1,b_phi+1).unlock();
          }
505
506
        });
      }
Martin Reinecke's avatar
Martin Reinecke committed
507
    double realsigma() const
Martin Reinecke's avatar
Martin Reinecke committed
508
      {
Martin Reinecke's avatar
Martin Reinecke committed
509
510
      return min(double(npsi_b)/(2*kmax+1),
                 min(double(nphi_b)/(2*lmax+1), double(ntheta_b)/(lmax+1)));
Martin Reinecke's avatar
Martin Reinecke committed
511
      }
512
513

  public:
Martin Reinecke's avatar
Martin Reinecke committed
514
    ConvolverPlan(size_t lmax_, size_t kmax_, double sigma, double epsilon,
515
      size_t nthreads_)
Martin Reinecke's avatar
Martin Reinecke committed
516
      : nthreads((nthreads_==0) ? get_default_nthreads() : nthreads_),
517
        lmax(lmax_),
Martin Reinecke's avatar
Martin Reinecke committed
518
        kmax(kmax_),
519
520
        nphi_s(2*good_size_real(lmax+1)),
        ntheta_s(nphi_s/2+1),
Martin Reinecke's avatar
Martin Reinecke committed
521
        npsi_s(kmax*2+1),
522
523
        nphi_b(std::max<size_t>(20,2*good_size_real(size_t((2*lmax+1)*sigma/2.)))),
        ntheta_b(nphi_b/2+1),
Martin Reinecke's avatar
Martin Reinecke committed
524
        npsi_b(size_t(npsi_s*sigma+0.99999)),
525
526
527
528
529
530
        dphi(2*pi/nphi_b),
        dtheta(pi/(ntheta_b-1)),
        dpsi(2*pi/npsi_b),
        xdphi(1./dphi),
        xdtheta(1./dtheta),
        xdpsi(1./dpsi),
Martin Reinecke's avatar
Martin Reinecke committed
531
        kernel(selectKernel<T>(realsigma(), epsilon/3.)),
Martin Reinecke's avatar
tweaks    
Martin Reinecke committed
532
533
534
        nbphi((kernel->support()+1)/2),
        nbtheta((kernel->support()+1)/2),
        nphi(nphi_b+2*nbphi+vlen),
535
536
537
        ntheta(ntheta_b+2*nbtheta),
        phi0(nbphi*(-dphi)),
        theta0(nbtheta*(-dtheta))
538
539
540
541
542
      {
      auto supp = kernel->support();
      MR_assert((supp<=ntheta) && (supp<=nphi_b), "kernel support too large!");
      }

543
544
    size_t Lmax() const { return lmax; }
    size_t Kmax() const { return kmax; }
545
546
    size_t Ntheta() const { return ntheta; }
    size_t Nphi() const { return nphi; }
Martin Reinecke's avatar
Martin Reinecke committed
547
548
    size_t Npsi() const { return npsi_b; }

549
    vector<size_t> getPatchInfo(double theta_lo, double theta_hi, double phi_lo, double phi_hi) const
Martin Reinecke's avatar
Martin Reinecke committed
550
551
      {
      vector<size_t> res(4);
552
      auto tmp = (theta_lo-theta0)*xdtheta-nbtheta;
553
554
555
      res[0] = min(size_t(max(0., tmp)), ntheta);
      tmp = (theta_hi-theta0)*xdtheta+nbtheta+1.;
      res[1] = min(size_t(max(0., tmp)), ntheta);
556
      tmp = (phi_lo-phi0)*xdphi-nbphi;
557
558
559
      res[2] = min(size_t(max(0., tmp)), nphi);
      tmp = (phi_hi-phi0)*xdphi+nbphi+1.+vlen;
      res[3] = min(size_t(max(0., tmp)), nphi);
Martin Reinecke's avatar
Martin Reinecke committed
560
561
      return res;
      }
562

Martin Reinecke's avatar
Martin Reinecke committed
563
    void getPlane(const mav<complex<T>,2> &vslm, const mav<complex<T>,2> &vblm,
564
      size_t mbeam, mav<T,3> &planes) const
565
      {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
566
      size_t nplanes=1+(mbeam>0);
567
      auto ncomp = vslm.shape(0);
568
      MR_assert(ncomp>0, "need at least one component");
569
      MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors");
Martin Reinecke's avatar
Martin Reinecke committed
570
      Alm_Base islm(lmax, lmax), iblm(lmax, kmax);
571
572
      MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension");
      MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension");
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
573
      MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape");
Martin Reinecke's avatar
Martin Reinecke committed
574
      MR_assert(mbeam <= kmax, "mbeam too high");
575
576
577
578
579

      vector<T> lnorm(lmax+1);
      for (size_t i=0; i<=lmax; ++i)
        lnorm[i]=T(std::sqrt(4*pi/(2*i+1.)));

Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
580
581
582
583
584
585
586
587
588
      Alm_Base base(lmax, lmax);
      mav<complex<T>,2> aarr({nplanes,base.Num_Alms()});
      for (size_t m=0; m<=lmax; ++m)
        for (size_t l=m; l<=lmax; ++l)
          {
          aarr.v(0, base.index(l,m))=0.;
          if (mbeam>0)
            aarr.v(1, base.index(l,m))=0.;
          if (l>=mbeam)
589
            {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
590
591
592
            auto norm = (mbeam>0) ? -lnorm[l] : lnorm[l];
            for (size_t i=0; i<ncomp; ++i)
              {
593
594
              auto tmp = vblm(i,iblm.index(l,mbeam))*norm;
              aarr.v(0,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.real();
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
595
              if (mbeam>0)
596
                aarr.v(1,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.imag();
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
597
              }
598
            }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
599
600
          }
      auto subplanes=subarray<3>(planes,{0, nbtheta,nbphi}, {nplanes, ntheta_s, nphi_s});
601
      synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads);
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
602
      for (size_t iplane=0; iplane<nplanes; ++iplane)
603
        {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
604
605
        auto m = subarray<2>(planes, {iplane,nbtheta,nbphi},{0,ntheta_b,nphi_b});
        correct(m,mbeam);
606
        }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
607

608
609
      // fill border regions
      T fct = (mbeam&1) ? -1 : 1;
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
610
611
612
613
      for (size_t iplane=0; iplane<nplanes; ++iplane)
        {
        for (size_t i=0; i<nbtheta; ++i)
          for (size_t j=0, j2=nphi_b/2; j<nphi_b; ++j,++j2)
614
            {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
615
616
617
            if (j2>=nphi_b) j2-=nphi_b;
            planes.v(iplane,nbtheta-1-i,j2+nbphi) = fct*planes(iplane,nbtheta+1+i,j+nbphi);
            planes.v(iplane,nbtheta+ntheta_b+i,j2+nbphi) = fct*planes(iplane,nbtheta+ntheta_b-2-i,j+nbphi);
618
            }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
619
        for (size_t i=0; i<ntheta; ++i)
620
          {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
621
          for (size_t j=0; j<nbphi; ++j)
622
            {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
623
624
            planes.v(iplane,i,j) = planes(iplane,i,j+nphi_b);
            planes.v(iplane,i,j+nphi_b+nbphi) = planes(iplane,i,j+nbphi);
625
            }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
626
627
628
          // SIMD buffer
          for (size_t j=0; j<vlen; ++j)
            planes.v(iplane, i, nphi-vlen+j) = T(0);
Martin Reinecke's avatar
fix    
Martin Reinecke committed
629
630
          }
        }
631
      }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
632

Martin Reinecke's avatar
Martin Reinecke committed
633
    void getPlane(const mav<complex<T>,1> &slm, const mav<complex<T>,1> &blm,
634
      size_t mbeam, mav<T,3> &planes) const
635
      {
636
637
      mav<complex<T>,2> vslm(&slm(0), {1,slm.shape(0)}, {0,slm.stride(0)});
      mav<complex<T>,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)});
638
      getPlane(vslm, vblm, mbeam, planes);
639
      }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
640

641
642
643
644
    void interpol(const mav<T,3> &cube, size_t itheta0,
      size_t iphi0, const mav<T,1> &theta, const mav<T,1> &phi,
      const mav<T,1> &psi, mav<T,1> &signal) const
      {
645
646
647
648
649
650
651
652
653
654
655
656
      if constexpr(is_same<T,double>::value)
        switch(kernel->support())
          {
          case  9: interpolx< 9>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 10: interpolx<10>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 11: interpolx<11>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 12: interpolx<12>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 13: interpolx<13>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 14: interpolx<14>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 15: interpolx<15>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 16: interpolx<16>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          }
657
658
      switch(kernel->support())
        {
Martin Reinecke's avatar
Martin Reinecke committed
659
660
661
662
663
        case 4: interpolx<4>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 5: interpolx<5>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 6: interpolx<6>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 7: interpolx<7>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 8: interpolx<8>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
Martin Reinecke's avatar
Martin Reinecke committed
664
665
666
        default: MR_fail("must not happen");
        }
      }
667
668
669
670
671

    void deinterpol(mav<T,3> &cube, size_t itheta0,
      size_t iphi0, const mav<T,1> &theta, const mav<T,1> &phi,
      const mav<T,1> &psi, const mav<T,1> &signal) const
      {
672
673
674
675
676
677
678
679
680
681
682
683
      if constexpr(is_same<T,double>::value)
        switch(kernel->support())
          {
          case  9: deinterpolx< 9>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 10: deinterpolx<10>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 11: deinterpolx<11>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 12: deinterpolx<12>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 13: deinterpolx<13>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 14: deinterpolx<14>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 15: deinterpolx<15>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          case 16: deinterpolx<16>(cube, itheta0, iphi0, theta, phi, psi, signal); return;
          }
684
685
      switch(kernel->support())
        {
Martin Reinecke's avatar
Martin Reinecke committed
686
687
688
689
690
        case 4: deinterpolx<4>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 5: deinterpolx<5>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 6: deinterpolx<6>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 7: deinterpolx<7>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
        case 8: deinterpolx<8>(cube, itheta0, iphi0, theta, phi, psi, signal); break;
691
692
693
694
        default: MR_fail("must not happen");
        }
      }

Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
695
    void updateSlm(mav<complex<T>,2> &vslm, const mav<complex<T>,2> &vblm,
696
      size_t mbeam, mav<T,3> &planes) const
697
      {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
698
      size_t nplanes=1+(mbeam>0);
699
      auto ncomp = vslm.shape(0);
700
      MR_assert(ncomp>0, "need at least one component");
701
      MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors");
Martin Reinecke's avatar
Martin Reinecke committed
702
      Alm_Base islm(lmax, lmax), iblm(lmax, kmax);
703
704
      MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension");
      MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension");
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
705
      MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape");
Martin Reinecke's avatar
Martin Reinecke committed
706
      MR_assert(mbeam <= kmax, "mbeam too high");
707
708

      // move stuff from border regions onto the main grid
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
709
710
711
712
      for (size_t iplane=0; iplane<nplanes; ++iplane)
        {
        for (size_t i=0; i<ntheta; ++i)
          for (size_t j=0; j<nbphi; ++j)
713
            {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
714
715
            planes.v(iplane,i,j+nphi_b) += planes(iplane,i,j);
            planes.v(iplane,i,j+nbphi) += planes(iplane,i,j+nphi_b+nbphi);
716
            }
717

Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
718
719
        for (size_t i=0; i<nbtheta; ++i)
          for (size_t j=0, j2=nphi_b/2; j<nphi_b; ++j,++j2)
720
            {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
721
722
723
724
            T fct = (mbeam&1) ? -1 : 1;
            if (j2>=nphi_b) j2-=nphi_b;
            planes.v(iplane,nbtheta+1+i,j+nbphi) += fct*planes(iplane,nbtheta-1-i,j2+nbphi);
            planes.v(iplane,nbtheta+ntheta_b-2-i, j+nbphi) += fct*planes(iplane,nbtheta+ntheta_b+i,j2+nbphi);
725
726
            }

Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
727
728
        // special treatment for poles
        for (size_t j=0,j2=nphi_b/2; j<nphi_b/2; ++j,++j2)
729
          {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
730
731
732
733
734
735
736
737
          T fct = (mbeam&1) ? -1 : 1;
          if (j2>=nphi_b) j2-=nphi_b;
          T tval = planes(iplane,nbtheta,j+nbphi) + fct*planes(iplane,nbtheta,j2+nbphi);
          planes.v(iplane,nbtheta,j+nbphi) = tval;
          planes.v(iplane,nbtheta,j2+nbphi) = fct*tval;
          tval = planes(iplane,nbtheta+ntheta_b-1,j+nbphi) + fct*planes(iplane,nbtheta+ntheta_b-1,j2+nbphi);
          planes.v(iplane,nbtheta+ntheta_b-1,j+nbphi) = tval;
          planes.v(iplane,nbtheta+ntheta_b-1,j2+nbphi) = fct*tval;
738
739
          }
        }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
740

741
      vector<T>lnorm(lmax+1);
742
      for (size_t i=0; i<=lmax; ++i)
743
        lnorm[i]=T(std::sqrt(4*pi/(2*i+1.)));
744

Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
745
      Alm_Base base(lmax,lmax);
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
746
747

      for (size_t iplane=0; iplane<nplanes; ++iplane)
748
        {
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
749
750
        auto m = subarray<2>(planes, {iplane,nbtheta,nbphi},{0,ntheta_b,nphi_b});
        decorrect(m,mbeam);
751
        }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
752
753
754
      auto subplanes=subarray<3>(planes,{0, nbtheta, nbphi}, {nplanes, ntheta_s, nphi_s});

      mav<complex<T>,2> aarr({nplanes, base.Num_Alms()});
755
      adjoint_synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads);
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
756
757
758
759
760
      for (size_t m=0; m<=lmax; ++m)
        for (size_t l=m; l<=lmax; ++l)
          if (l>=mbeam)
            for (size_t i=0; i<ncomp; ++i)
              {
761
              auto tmp = vblm(i,iblm.index(l,mbeam))*lnorm[l] * T((mbeam==0) ? 1 : (-2));
762
              vslm.v(i,islm.index(l,m)) += aarr(0,base.index(l,m))*tmp.real();
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
763
              if (mbeam>0)
764
                vslm.v(i,islm.index(l,m)) += aarr(1,base.index(l,m))*tmp.imag();
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
765
              }
766
      }
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
767

Martin Reinecke's avatar
Martin Reinecke committed
768
    void updateSlm(mav<complex<T>,1> &slm, const mav<complex<T>,1> &blm,
769
      size_t mbeam, mav<T,3> &planes) const
770
      {
771
772
      mav<complex<T>,2> vslm(&slm.v(0), {1,slm.shape(0)}, {0,slm.stride(0)}, true);
      mav<complex<T>,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)});
773
      updateSlm(vslm, vblm, mbeam, planes);
774
      }
Martin Reinecke's avatar
Martin Reinecke committed
775
776
777
778

    void prepPsi(mav<T,3> &subcube) const
      {
      MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension");
Martin Reinecke's avatar
Martin Reinecke committed
779
      auto newpart = subarray<3>(subcube, {npsi_s,0,0},{MAXIDX,MAXIDX,MAXIDX});
Martin Reinecke's avatar
Martin Reinecke committed
780
781
782
783
784
785
786
787
788
789
      newpart.fill(T(0));
      auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads);
      for (size_t k=0; k<npsi_s; ++k)
        {
        auto factor = T(fct[(k+1)/2]);
        for (size_t i=0; i<subcube.shape(1); ++i)
          for (size_t j=0; j<subcube.shape(2); ++j)
            subcube.v(k,i,j) *= factor;
        }
      fmav<T> fsubcube(subcube);
Martin Reinecke's avatar
Martin Reinecke committed
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
      r2r_fftpack(fsubcube, fsubcube, {0}, false, true, T(1), nthreads);
      }

    void deprepPsi(mav<T,3> &subcube) const
      {
      MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension");
      fmav<T> fsubcube(subcube);
      r2r_fftpack(fsubcube, fsubcube, {0}, true, false, T(1), nthreads);
      auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads);
      for (size_t k=0; k<npsi_s; ++k)
        {
        auto factor = T(fct[(k+1)/2]);
        for (size_t i=0; i<subcube.shape(1); ++i)
          for (size_t j=0; j<subcube.shape(2); ++j)
            subcube.v(k,i,j) *= factor;
        }
Martin Reinecke's avatar
Martin Reinecke committed
806
      }
807
808
  };

Martin Reinecke's avatar
Martin Reinecke committed
809
}
Martin Reinecke's avatar
cleanup    
Martin Reinecke committed
810

811
812
using detail_totalconvolve::ConvolverPlan;

813

Martin Reinecke's avatar
Martin Reinecke committed
814
}
815

Martin Reinecke's avatar
Martin Reinecke committed
816
#endif