sharp_core_inc.cc 37.2 KB
Newer Older
Martin Reinecke's avatar
Martin Reinecke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 *  This file is part of libsharp2.
 *
 *  libsharp2 is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libsharp2 is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libsharp2; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/* libsharp2 is being developed at the Max-Planck-Institut fuer Astrophysik */

Martin Reinecke's avatar
Martin Reinecke committed
21
/*! \file sharp_core_inc.cc
Martin Reinecke's avatar
Martin Reinecke committed
22
23
 *  Computational core
 *
Martin Reinecke's avatar
Martin Reinecke committed
24
 *  Copyright (C) 2012-2020 Max-Planck-Society
Martin Reinecke's avatar
Martin Reinecke committed
25
26
27
28
29
30
31
32
33
34
35
36
 *  \author Martin Reinecke
 */

// FIXME: special ugly workaround for problems on OSX
#if (!defined(__APPLE__)) || (!defined(__AVX512F__))

#if (defined(MULTIARCH) || defined(GENERIC_ARCH))

#define XCONCATX(a,b) a##_##b
#define XCONCATX2(a,b) XCONCATX(a,b)
#define XARCH(a) XCONCATX2(a,ARCH)

Martin Reinecke's avatar
Martin Reinecke committed
37
#include <numeric>
Martin Reinecke's avatar
Martin Reinecke committed
38
#include <complex>
Martin Reinecke's avatar
Martin Reinecke committed
39
40
#include <cmath>
#include <cstring>
Martin Reinecke's avatar
Martin Reinecke committed
41
#include <vector>
42
43
44
45
46
47
48
49
50
#include "mr_util/sharp/sharp.h"
#include "mr_util/sharp/sharp_internal.h"
#include "mr_util/infra/error_handling.h"
#include "mr_util/infra/useful_macros.h"
#include "mr_util/infra/simd.h"

namespace mr {

namespace detail_sharp {
Martin Reinecke's avatar
Martin Reinecke committed
51
52
53

#pragma GCC visibility push(hidden)

Martin Reinecke's avatar
Martin Reinecke committed
54
using namespace std;
55

Martin Reinecke's avatar
Martin Reinecke committed
56
using Tv=native_simd<double>;
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
57
58
static constexpr size_t VLEN=Tv::size();

59
#if ((!defined(MRUTIL_NO_SIMD)) && defined(__AVX__) && (!defined(__AVX512F__)))
60
61
62
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex<double> * MRUTIL_RESTRICT cc)
  {
Martin Reinecke's avatar
Martin Reinecke committed
63
  auto tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
64
65
66
67
68
69
70
71
72
73
74
75
76
77
  auto tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
       tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
  tmp1=tmp3+tmp4;
  union U
    {
    decltype(tmp1) v;
    complex<double> c[2];
    U() {}
    };
  U u;
  u.v=tmp1;
  cc[0]+=u.c[0]; cc[1]+=u.c[1];
  }
#else
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
78
79
80
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex<double> * MRUTIL_RESTRICT cc)
  {
Martin Reinecke's avatar
Martin Reinecke committed
81
82
  cc[0] += complex<double>(accumulate(a,std::plus<>()),accumulate(b,std::plus<>()));
  cc[1] += complex<double>(accumulate(c,std::plus<>()),accumulate(d,std::plus<>()));
Martin Reinecke's avatar
cleanup  
Martin Reinecke committed
83
  }
84
#endif
Martin Reinecke's avatar
Martin Reinecke committed
85

Martin Reinecke's avatar
Martin Reinecke committed
86
using dcmplx = complex<double>;
Martin Reinecke's avatar
Martin Reinecke committed
87

Martin Reinecke's avatar
Martin Reinecke committed
88
89
constexpr size_t nv0 = 128/VLEN;
constexpr size_t nvx = 64/VLEN;
Martin Reinecke's avatar
Martin Reinecke committed
90

Martin Reinecke's avatar
Martin Reinecke committed
91
92
using Tbv0 = Tv[nv0];
using Tbs0 = double[nv0*VLEN];
Martin Reinecke's avatar
Martin Reinecke committed
93

Martin Reinecke's avatar
Martin Reinecke committed
94
struct s0data_v
Martin Reinecke's avatar
Martin Reinecke committed
95
96
  {
  Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
Martin Reinecke's avatar
Martin Reinecke committed
97
  };
Martin Reinecke's avatar
Martin Reinecke committed
98

Martin Reinecke's avatar
Martin Reinecke committed
99
struct s0data_s
Martin Reinecke's avatar
Martin Reinecke committed
100
101
  {
  Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i;
Martin Reinecke's avatar
Martin Reinecke committed
102
  };
Martin Reinecke's avatar
Martin Reinecke committed
103

Martin Reinecke's avatar
Martin Reinecke committed
104
union s0data_u
Martin Reinecke's avatar
Martin Reinecke committed
105
106
107
  {
  s0data_v v;
  s0data_s s;
Martin Reinecke's avatar
Martin Reinecke committed
108
  };
Martin Reinecke's avatar
Martin Reinecke committed
109

Martin Reinecke's avatar
Martin Reinecke committed
110
111
using Tbvx = Tv[nvx];
using Tbsx = double[nvx*VLEN];
Martin Reinecke's avatar
Martin Reinecke committed
112

Martin Reinecke's avatar
Martin Reinecke committed
113
struct sxdata_v
Martin Reinecke's avatar
Martin Reinecke committed
114
115
116
  {
  Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
Martin Reinecke's avatar
Martin Reinecke committed
117
  };
Martin Reinecke's avatar
Martin Reinecke committed
118

Martin Reinecke's avatar
Martin Reinecke committed
119
struct sxdata_s
Martin Reinecke's avatar
Martin Reinecke committed
120
121
122
  {
  Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth,
       p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi;
Martin Reinecke's avatar
Martin Reinecke committed
123
  };
Martin Reinecke's avatar
Martin Reinecke committed
124

Martin Reinecke's avatar
Martin Reinecke committed
125
union sxdata_u
Martin Reinecke's avatar
Martin Reinecke committed
126
127
128
  {
  sxdata_v v;
  sxdata_s s;
Martin Reinecke's avatar
Martin Reinecke committed
129
  };
Martin Reinecke's avatar
Martin Reinecke committed
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

static inline void Tvnormalize (Tv * MRUTIL_RESTRICT val, Tv * MRUTIL_RESTRICT scale,
  double maxval)
  {
  const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval;
  const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig;
  auto mask = abs(*val)>vfmax;
  while (any_of(mask))
    {
    where(mask,*val)*=vfsmall;
    where(mask,*scale)+=1;
    mask = abs(*val)>vfmax;
    }
  mask = (abs(*val)<vfmin) & (*val!=0);
  while (any_of(mask))
    {
    where(mask,*val)*=vfbig;
    where(mask,*scale)-=1;
    mask = (abs(*val)<vfmin) & (*val!=0);
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
152
static void mypow(Tv val, size_t npow, const vector<double> &powlimit,
Martin Reinecke's avatar
Martin Reinecke committed
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
  Tv * MRUTIL_RESTRICT resd, Tv * MRUTIL_RESTRICT ress)
  {
  Tv vminv=powlimit[npow];
  auto mask = abs(val)<vminv;
  if (none_of(mask)) // no underflows possible, use quick algoritm
    {
    Tv res=1;
    do
      {
      if (npow&1)
        res*=val;
      val*=val;
      }
    while(npow>>=1);
    *resd=res;
    *ress=0;
    }
  else
    {
    Tv scale=0, scaleint=0, res=1;
    Tvnormalize(&val,&scaleint,sharp_fbighalf);
    do
      {
      if (npow&1)
        {
        res*=val;
        scale+=scaleint;
        Tvnormalize(&res,&scale,sharp_fbighalf);
        }
      val*=val;
      scaleint+=scaleint;
      Tvnormalize(&val,&scaleint,sharp_fbighalf);
      }
    while(npow>>=1);
    *resd=res;
    *ress=scale;
    }
  }

static inline void getCorfac(Tv scale, Tv * MRUTIL_RESTRICT corfac,
Martin Reinecke's avatar
Martin Reinecke committed
193
  const vector<double> &cf)
Martin Reinecke's avatar
Martin Reinecke committed
194
  {
Martin Reinecke's avatar
Martin Reinecke committed
195
196
  union Tvu
    { Tv v; double s[VLEN]; };
Martin Reinecke's avatar
Martin Reinecke committed
197
198
199

  Tvu sc, corf;
  sc.v=scale;
Martin Reinecke's avatar
Martin Reinecke committed
200
  for (size_t i=0; i<VLEN; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
201
    corf.s[i] = (sc.s[i]<sharp_minscale) ?
202
      0. : cf[size_t(int(sc.s[i])-sharp_minscale)];
Martin Reinecke's avatar
Martin Reinecke committed
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
  *corfac=corf.v;
  }

static inline bool rescale(Tv * MRUTIL_RESTRICT v1, Tv * MRUTIL_RESTRICT v2, Tv * MRUTIL_RESTRICT s, Tv eps)
  {
  auto mask = abs(*v2)>eps;
  if (any_of(mask))
    {
    where(mask,*v1)*=sharp_fsmall;
    where(mask,*v2)*=sharp_fsmall;
    where(mask,*s)+=1;
    return true;
    }
  return false;
  }

Martin Reinecke's avatar
Martin Reinecke committed
219
MRUTIL_NOINLINE static void iter_to_ieee(const sharp_Ylmgen &gen,
Martin Reinecke's avatar
Martin Reinecke committed
220
  s0data_v & MRUTIL_RESTRICT d, size_t & MRUTIL_RESTRICT l_, size_t & MRUTIL_RESTRICT il_, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
221
  {
Martin Reinecke's avatar
Martin Reinecke committed
222
  size_t l=gen.m, il=0;
Martin Reinecke's avatar
Martin Reinecke committed
223
  Tv mfac = (gen.m&1) ? -gen.mfac[gen.m]:gen.mfac[gen.m];
Martin Reinecke's avatar
Martin Reinecke committed
224
  Tv limscale=sharp_limscale;
Martin Reinecke's avatar
Martin Reinecke committed
225
226
  bool below_limit = true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
227
    {
Martin Reinecke's avatar
Martin Reinecke committed
228
229
230
231
232
    d.lam1[i]=0;
    mypow(d.sth[i],gen.m,gen.powlimit,&d.lam2[i],&d.scale[i]);
    d.lam2[i] *= mfac;
    Tvnormalize(&d.lam2[i],&d.scale[i],sharp_ftol);
    below_limit &= all_of(d.scale[i]<limscale);
Martin Reinecke's avatar
Martin Reinecke committed
233
234
235
236
    }

  while (below_limit)
    {
Martin Reinecke's avatar
Martin Reinecke committed
237
    if (l+4>gen.lmax) {l_=gen.lmax+1;return;}
Martin Reinecke's avatar
Martin Reinecke committed
238
    below_limit=1;
Martin Reinecke's avatar
Martin Reinecke committed
239
240
    Tv a1=gen.coef[il  ].a, b1=gen.coef[il  ].b;
    Tv a2=gen.coef[il+1].a, b2=gen.coef[il+1].b;
Martin Reinecke's avatar
Martin Reinecke committed
241
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
242
      {
Martin Reinecke's avatar
Martin Reinecke committed
243
244
245
246
      d.lam1[i] = (a1*d.csq[i] + b1)*d.lam2[i] + d.lam1[i];
      d.lam2[i] = (a2*d.csq[i] + b2)*d.lam1[i] + d.lam2[i];
      if (rescale(&d.lam1[i], &d.lam2[i], &d.scale[i], sharp_ftol))
        below_limit &= all_of(d.scale[i]<sharp_limscale);
Martin Reinecke's avatar
Martin Reinecke committed
247
248
249
      }
    l+=4; il+=2;
    }
Martin Reinecke's avatar
Martin Reinecke committed
250
  l_=l; il_=il;
Martin Reinecke's avatar
Martin Reinecke committed
251
252
  }

Martin Reinecke's avatar
Martin Reinecke committed
253
MRUTIL_NOINLINE static void alm2map_kernel(s0data_v & MRUTIL_RESTRICT d,
Martin Reinecke's avatar
Martin Reinecke committed
254
  const vector<sharp_Ylmgen::dbl2> &coef, const dcmplx * MRUTIL_RESTRICT alm,
Martin Reinecke's avatar
Martin Reinecke committed
255
  size_t l, size_t il, size_t lmax, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
256
  {
Martin Reinecke's avatar
Martin Reinecke committed
257
  for (; l+6<=lmax; il+=4, l+=8)
Martin Reinecke's avatar
Martin Reinecke committed
258
    {
Martin Reinecke's avatar
Martin Reinecke committed
259
260
261
262
263
264
265
266
267
268
269
270
271
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
    Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
    Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
    Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
    Tv ar5=alm[l+4].real(), ai5=alm[l+4].imag();
    Tv ar6=alm[l+5].real(), ai6=alm[l+5].imag();
    Tv ar7=alm[l+6].real(), ai7=alm[l+6].imag();
    Tv ar8=alm[l+7].real(), ai8=alm[l+7].imag();
    Tv a1=coef[il  ].a, b1=coef[il  ].b;
    Tv a2=coef[il+1].a, b2=coef[il+1].b;
    Tv a3=coef[il+2].a, b3=coef[il+2].b;
    Tv a4=coef[il+3].a, b4=coef[il+3].b;
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
272
      {
Martin Reinecke's avatar
Martin Reinecke committed
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
      d.p1r[i] += d.lam2[i]*ar1;
      d.p1i[i] += d.lam2[i]*ai1;
      d.p2r[i] += d.lam2[i]*ar2;
      d.p2i[i] += d.lam2[i]*ai2;
      d.lam1[i] = (a1*d.csq[i] + b1)*d.lam2[i] + d.lam1[i];
      d.p1r[i] += d.lam1[i]*ar3;
      d.p1i[i] += d.lam1[i]*ai3;
      d.p2r[i] += d.lam1[i]*ar4;
      d.p2i[i] += d.lam1[i]*ai4;
      d.lam2[i] = (a2*d.csq[i] + b2)*d.lam1[i] + d.lam2[i];
      d.p1r[i] += d.lam2[i]*ar5;
      d.p1i[i] += d.lam2[i]*ai5;
      d.p2r[i] += d.lam2[i]*ar6;
      d.p2i[i] += d.lam2[i]*ai6;
      d.lam1[i] = (a3*d.csq[i] + b3)*d.lam2[i] + d.lam1[i];
      d.p1r[i] += d.lam1[i]*ar7;
      d.p1i[i] += d.lam1[i]*ai7;
      d.p2r[i] += d.lam1[i]*ar8;
      d.p2i[i] += d.lam1[i]*ai8;
      d.lam2[i] = (a4*d.csq[i] + b4)*d.lam1[i] + d.lam2[i];
Martin Reinecke's avatar
Martin Reinecke committed
293
294
      }
    }
Martin Reinecke's avatar
Martin Reinecke committed
295
  for (; l+2<=lmax; il+=2, l+=4)
Martin Reinecke's avatar
Martin Reinecke committed
296
    {
Martin Reinecke's avatar
Martin Reinecke committed
297
298
299
300
301
302
303
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
    Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
    Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag();
    Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag();
    Tv a1=coef[il  ].a, b1=coef[il  ].b;
    Tv a2=coef[il+1].a, b2=coef[il+1].b;
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
304
      {
Martin Reinecke's avatar
Martin Reinecke committed
305
306
307
308
309
310
311
312
313
314
      d.p1r[i] += d.lam2[i]*ar1;
      d.p1i[i] += d.lam2[i]*ai1;
      d.p2r[i] += d.lam2[i]*ar2;
      d.p2i[i] += d.lam2[i]*ai2;
      d.lam1[i] = (a1*d.csq[i] + b1)*d.lam2[i] + d.lam1[i];
      d.p1r[i] += d.lam1[i]*ar3;
      d.p1i[i] += d.lam1[i]*ai3;
      d.p2r[i] += d.lam1[i]*ar4;
      d.p2i[i] += d.lam1[i]*ai4;
      d.lam2[i] = (a2*d.csq[i] + b2)*d.lam1[i] + d.lam2[i];
Martin Reinecke's avatar
Martin Reinecke committed
315
316
317
318
319
320
321
      }
    }
  for (; l<=lmax; ++il, l+=2)
    {
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
    Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
    Tv a=coef[il].a, b=coef[il].b;
Martin Reinecke's avatar
Martin Reinecke committed
322
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
323
      {
Martin Reinecke's avatar
Martin Reinecke committed
324
325
326
327
328
329
330
      d.p1r[i] += d.lam2[i]*ar1;
      d.p1i[i] += d.lam2[i]*ai1;
      d.p2r[i] += d.lam2[i]*ar2;
      d.p2i[i] += d.lam2[i]*ai2;
      Tv tmp = (a*d.csq[i] + b)*d.lam2[i] + d.lam1[i];
      d.lam1[i] = d.lam2[i];
      d.lam2[i] = tmp;
Martin Reinecke's avatar
Martin Reinecke committed
331
332
333
334
      }
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
335
MRUTIL_NOINLINE static void calc_alm2map (sharp_job & MRUTIL_RESTRICT job,
Martin Reinecke's avatar
Martin Reinecke committed
336
  const sharp_Ylmgen &gen, s0data_v & MRUTIL_RESTRICT d, size_t nth)
Martin Reinecke's avatar
Martin Reinecke committed
337
  {
Martin Reinecke's avatar
fixes  
Martin Reinecke committed
338
  size_t l,il=0,lmax=gen.lmax;
Martin Reinecke's avatar
Martin Reinecke committed
339
340
  size_t nv2 = (nth+VLEN-1)/VLEN;
  iter_to_ieee(gen, d, l, il, nv2);
Martin Reinecke's avatar
Martin Reinecke committed
341
  job.opcnt += il * 4*nth;
Martin Reinecke's avatar
Martin Reinecke committed
342
  if (l>lmax) return;
Martin Reinecke's avatar
Martin Reinecke committed
343
  job.opcnt += (lmax+1-l) * 6*nth;
Martin Reinecke's avatar
Martin Reinecke committed
344

Martin Reinecke's avatar
Martin Reinecke committed
345
  auto &coef = gen.coef;
Martin Reinecke's avatar
Martin Reinecke committed
346
  const dcmplx * MRUTIL_RESTRICT alm=job.almtmp;
Martin Reinecke's avatar
Martin Reinecke committed
347
348
  bool full_ieee=true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
349
    {
Martin Reinecke's avatar
Martin Reinecke committed
350
351
    getCorfac(d.scale[i], &d.corfac[i], gen.cf);
    full_ieee &= all_of(d.scale[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
352
353
354
355
356
357
358
359
    }

  while((!full_ieee) && (l<=lmax))
    {
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag();
    Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag();
    Tv a=coef[il].a, b=coef[il].b;
    full_ieee=1;
Martin Reinecke's avatar
Martin Reinecke committed
360
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
361
      {
Martin Reinecke's avatar
Martin Reinecke committed
362
363
364
365
366
367
368
369
370
371
      d.p1r[i] += d.lam2[i]*d.corfac[i]*ar1;
      d.p1i[i] += d.lam2[i]*d.corfac[i]*ai1;
      d.p2r[i] += d.lam2[i]*d.corfac[i]*ar2;
      d.p2i[i] += d.lam2[i]*d.corfac[i]*ai2;
      Tv tmp = (a*d.csq[i] + b)*d.lam2[i] + d.lam1[i];
      d.lam1[i] = d.lam2[i];
      d.lam2[i] = tmp;
      if (rescale(&d.lam1[i], &d.lam2[i], &d.scale[i], sharp_ftol))
        getCorfac(d.scale[i], &d.corfac[i], gen.cf);
      full_ieee &= all_of(d.scale[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
372
373
374
375
376
      }
    l+=2; ++il;
    }
  if (l>lmax) return;

Martin Reinecke's avatar
Martin Reinecke committed
377
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
378
    {
Martin Reinecke's avatar
Martin Reinecke committed
379
380
    d.lam1[i] *= d.corfac[i];
    d.lam2[i] *= d.corfac[i];
Martin Reinecke's avatar
Martin Reinecke committed
381
382
383
384
    }
  alm2map_kernel(d, coef, alm, l, il, lmax, nv2);
  }

Martin Reinecke's avatar
Martin Reinecke committed
385
MRUTIL_NOINLINE static void map2alm_kernel(s0data_v & MRUTIL_RESTRICT d,
Martin Reinecke's avatar
Martin Reinecke committed
386
  const vector<sharp_Ylmgen::dbl2> &coef, dcmplx * MRUTIL_RESTRICT alm, size_t l,
Martin Reinecke's avatar
Martin Reinecke committed
387
  size_t il, size_t lmax, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
388
  {
Martin Reinecke's avatar
Martin Reinecke committed
389
  for (; l+2<=lmax; il+=2, l+=4)
Martin Reinecke's avatar
Martin Reinecke committed
390
391
392
393
394
    {
    Tv a1=coef[il  ].a, b1=coef[il  ].b;
    Tv a2=coef[il+1].a, b2=coef[il+1].b;
    Tv atmp1[4] = {0,0,0,0};
    Tv atmp2[4] = {0,0,0,0};
Martin Reinecke's avatar
Martin Reinecke committed
395
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
396
      {
Martin Reinecke's avatar
Martin Reinecke committed
397
398
399
400
401
402
403
404
405
406
      atmp1[0] += d.lam2[i]*d.p1r[i];
      atmp1[1] += d.lam2[i]*d.p1i[i];
      atmp1[2] += d.lam2[i]*d.p2r[i];
      atmp1[3] += d.lam2[i]*d.p2i[i];
      d.lam1[i] = (a1*d.csq[i] + b1)*d.lam2[i] + d.lam1[i];
      atmp2[0] += d.lam1[i]*d.p1r[i];
      atmp2[1] += d.lam1[i]*d.p1i[i];
      atmp2[2] += d.lam1[i]*d.p2r[i];
      atmp2[3] += d.lam1[i]*d.p2i[i];
      d.lam2[i] = (a2*d.csq[i] + b2)*d.lam1[i] + d.lam2[i];
Martin Reinecke's avatar
Martin Reinecke committed
407
408
409
410
411
412
413
414
      }
    vhsum_cmplx_special (atmp1[0], atmp1[1], atmp1[2], atmp1[3], &alm[l  ]);
    vhsum_cmplx_special (atmp2[0], atmp2[1], atmp2[2], atmp2[3], &alm[l+2]);
    }
  for (; l<=lmax; ++il, l+=2)
    {
    Tv a=coef[il].a, b=coef[il].b;
    Tv atmp[4] = {0,0,0,0};
Martin Reinecke's avatar
Martin Reinecke committed
415
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
416
      {
Martin Reinecke's avatar
Martin Reinecke committed
417
418
419
420
421
422
423
      atmp[0] += d.lam2[i]*d.p1r[i];
      atmp[1] += d.lam2[i]*d.p1i[i];
      atmp[2] += d.lam2[i]*d.p2r[i];
      atmp[3] += d.lam2[i]*d.p2i[i];
      Tv tmp = (a*d.csq[i] + b)*d.lam2[i] + d.lam1[i];
      d.lam1[i] = d.lam2[i];
      d.lam2[i] = tmp;
Martin Reinecke's avatar
Martin Reinecke committed
424
425
426
427
428
      }
    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
429
MRUTIL_NOINLINE static void calc_map2alm (sharp_job & MRUTIL_RESTRICT job,
Martin Reinecke's avatar
Martin Reinecke committed
430
  const sharp_Ylmgen &gen, s0data_v & MRUTIL_RESTRICT d, size_t nth)
Martin Reinecke's avatar
Martin Reinecke committed
431
  {
Martin Reinecke's avatar
fixes  
Martin Reinecke committed
432
  size_t l,il=0,lmax=gen.lmax;
Martin Reinecke's avatar
Martin Reinecke committed
433
434
  size_t nv2 = (nth+VLEN-1)/VLEN;
  iter_to_ieee(gen, d, l, il, nv2);
Martin Reinecke's avatar
Martin Reinecke committed
435
  job.opcnt += il * 4*nth;
Martin Reinecke's avatar
Martin Reinecke committed
436
  if (l>lmax) return;
Martin Reinecke's avatar
Martin Reinecke committed
437
  job.opcnt += (lmax+1-l) * 6*nth;
Martin Reinecke's avatar
Martin Reinecke committed
438

Martin Reinecke's avatar
Martin Reinecke committed
439
  auto &coef = gen.coef;
Martin Reinecke's avatar
Martin Reinecke committed
440
  dcmplx * MRUTIL_RESTRICT alm=job.almtmp;
Martin Reinecke's avatar
Martin Reinecke committed
441
442
  bool full_ieee=true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
443
    {
Martin Reinecke's avatar
Martin Reinecke committed
444
445
    getCorfac(d.scale[i], &d.corfac[i], gen.cf);
    full_ieee &= all_of(d.scale[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
446
447
448
449
450
451
452
    }

  while((!full_ieee) && (l<=lmax))
    {
    Tv a=coef[il].a, b=coef[il].b;
    Tv atmp[4] = {0,0,0,0};
    full_ieee=1;
Martin Reinecke's avatar
Martin Reinecke committed
453
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
454
      {
Martin Reinecke's avatar
Martin Reinecke committed
455
456
457
458
459
460
461
462
463
464
      atmp[0] += d.lam2[i]*d.corfac[i]*d.p1r[i];
      atmp[1] += d.lam2[i]*d.corfac[i]*d.p1i[i];
      atmp[2] += d.lam2[i]*d.corfac[i]*d.p2r[i];
      atmp[3] += d.lam2[i]*d.corfac[i]*d.p2i[i];
      Tv tmp = (a*d.csq[i] + b)*d.lam2[i] + d.lam1[i];
      d.lam1[i] = d.lam2[i];
      d.lam2[i] = tmp;
      if (rescale(&d.lam1[i], &d.lam2[i], &d.scale[i], sharp_ftol))
        getCorfac(d.scale[i], &d.corfac[i], gen.cf);
      full_ieee &= all_of(d.scale[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
465
466
467
468
469
470
      }
    vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]);
    l+=2; ++il;
    }
  if (l>lmax) return;

Martin Reinecke's avatar
Martin Reinecke committed
471
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
472
    {
Martin Reinecke's avatar
Martin Reinecke committed
473
474
    d.lam1[i] *= d.corfac[i];
    d.lam2[i] *= d.corfac[i];
Martin Reinecke's avatar
Martin Reinecke committed
475
476
477
478
    }
  map2alm_kernel(d, coef, alm, l, il, lmax, nv2);
  }

Martin Reinecke's avatar
Martin Reinecke committed
479
MRUTIL_NOINLINE static void iter_to_ieee_spin (const sharp_Ylmgen &gen,
Martin Reinecke's avatar
Martin Reinecke committed
480
  sxdata_v & MRUTIL_RESTRICT d, size_t & MRUTIL_RESTRICT l_, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
481
  {
Martin Reinecke's avatar
Martin Reinecke committed
482
483
484
  const auto &fx = gen.coef;
  Tv prefac=gen.prefac[gen.m],
     prescale=gen.fscale[gen.m];
Martin Reinecke's avatar
Martin Reinecke committed
485
  Tv limscale=sharp_limscale;
Martin Reinecke's avatar
Martin Reinecke committed
486
487
  bool below_limit=true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
488
    {
Martin Reinecke's avatar
Martin Reinecke committed
489
490
491
492
493
    Tv cth2=max(Tv(1e-15),sqrt((1.+d.cth[i])*0.5));
    Tv sth2=max(Tv(1e-15),sqrt((1.-d.cth[i])*0.5));
    auto mask=d.sth[i]<0;
    where(mask&(d.cth[i]<0),cth2)*=-1.;
    where(mask&(d.cth[i]<0),sth2)*=-1.;
Martin Reinecke's avatar
Martin Reinecke committed
494
495

    Tv ccp, ccps, ssp, ssps, csp, csps, scp, scps;
Martin Reinecke's avatar
Martin Reinecke committed
496
497
498
499
    mypow(cth2,gen.cosPow,gen.powlimit,&ccp,&ccps);
    mypow(sth2,gen.sinPow,gen.powlimit,&ssp,&ssps);
    mypow(cth2,gen.sinPow,gen.powlimit,&csp,&csps);
    mypow(sth2,gen.cosPow,gen.powlimit,&scp,&scps);
Martin Reinecke's avatar
Martin Reinecke committed
500

Martin Reinecke's avatar
Martin Reinecke committed
501
502
503
504
505
506
507
508
509
510
511
512
    d.l1p[i] = 0;
    d.l1m[i] = 0;
    d.l2p[i] = prefac*ccp;
    d.scp[i] = prescale+ccps;
    d.l2m[i] = prefac*csp;
    d.scm[i] = prescale+csps;
    Tvnormalize(&d.l2m[i],&d.scm[i],sharp_fbighalf);
    Tvnormalize(&d.l2p[i],&d.scp[i],sharp_fbighalf);
    d.l2p[i] *= ssp;
    d.scp[i] += ssps;
    d.l2m[i] *= scp;
    d.scm[i] += scps;
Martin Reinecke's avatar
Martin Reinecke committed
513
    if (gen.preMinus_p)
Martin Reinecke's avatar
Martin Reinecke committed
514
      d.l2p[i] = -d.l2p[i];
Martin Reinecke's avatar
Martin Reinecke committed
515
    if (gen.preMinus_m)
Martin Reinecke's avatar
Martin Reinecke committed
516
      d.l2m[i] = -d.l2m[i];
Martin Reinecke's avatar
Martin Reinecke committed
517
    if (gen.s&1)
Martin Reinecke's avatar
Martin Reinecke committed
518
      d.l2p[i] = -d.l2p[i];
Martin Reinecke's avatar
Martin Reinecke committed
519

Martin Reinecke's avatar
Martin Reinecke committed
520
521
    Tvnormalize(&d.l2m[i],&d.scm[i],sharp_ftol);
    Tvnormalize(&d.l2p[i],&d.scp[i],sharp_ftol);
Martin Reinecke's avatar
Martin Reinecke committed
522

Martin Reinecke's avatar
Martin Reinecke committed
523
524
    below_limit &= all_of(d.scm[i]<limscale) &&
                   all_of(d.scp[i]<limscale);
Martin Reinecke's avatar
Martin Reinecke committed
525
526
    }

Martin Reinecke's avatar
Martin Reinecke committed
527
  size_t l=gen.mhi;
Martin Reinecke's avatar
Martin Reinecke committed
528
529
530

  while (below_limit)
    {
Martin Reinecke's avatar
Martin Reinecke committed
531
    if (l+2>gen.lmax) {l_=gen.lmax+1;return;}
Martin Reinecke's avatar
Martin Reinecke committed
532
533
534
    below_limit=1;
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
Martin Reinecke's avatar
Martin Reinecke committed
535
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
536
      {
Martin Reinecke's avatar
Martin Reinecke committed
537
538
539
540
541
542
543
544
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];
      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
      if (rescale(&d.l1p[i],&d.l2p[i],&d.scp[i],sharp_ftol) ||
          rescale(&d.l1m[i],&d.l2m[i],&d.scm[i],sharp_ftol))
        below_limit &= all_of(d.scp[i]<limscale) &&
                       all_of(d.scm[i]<limscale);
Martin Reinecke's avatar
Martin Reinecke committed
545
546
547
548
      }
    l+=2;
    }

Martin Reinecke's avatar
Martin Reinecke committed
549
  l_=l;
Martin Reinecke's avatar
Martin Reinecke committed
550
551
  }

Martin Reinecke's avatar
Martin Reinecke committed
552
MRUTIL_NOINLINE static void alm2map_spin_kernel(sxdata_v & MRUTIL_RESTRICT d,
Martin Reinecke's avatar
Martin Reinecke committed
553
  const vector<sharp_Ylmgen::dbl2> &fx, const dcmplx * MRUTIL_RESTRICT alm,
Martin Reinecke's avatar
Martin Reinecke committed
554
  size_t l, size_t lmax, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
555
  {
Martin Reinecke's avatar
Martin Reinecke committed
556
  size_t lsave = l;
Martin Reinecke's avatar
Martin Reinecke committed
557
558
559
560
561
562
563
564
  while (l<=lmax)
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv agr1=alm[2*l  ].real(), agi1=alm[2*l  ].imag(),
       acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
    Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
       acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
Martin Reinecke's avatar
Martin Reinecke committed
565
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
566
      {
Martin Reinecke's avatar
Martin Reinecke committed
567
568
569
570
571
572
573
574
575
576
577
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      d.p1pr[i] += agr1*d.l2p[i];
      d.p1pi[i] += agi1*d.l2p[i];
      d.p1mr[i] += acr1*d.l2p[i];
      d.p1mi[i] += aci1*d.l2p[i];

      d.p1pr[i] += aci2*d.l1p[i];
      d.p1pi[i] -= acr2*d.l1p[i];
      d.p1mr[i] -= agi2*d.l1p[i];
      d.p1mi[i] += agr2*d.l1p[i];
      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
Martin Reinecke's avatar
Martin Reinecke committed
578
579
580
581
582
583
584
585
586
587
588
589
      }
    l+=2;
    }
  l=lsave;
  while (l<=lmax)
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv agr1=alm[2*l  ].real(), agi1=alm[2*l  ].imag(),
       acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
    Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
       acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
Martin Reinecke's avatar
Martin Reinecke committed
590
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
591
      {
Martin Reinecke's avatar
Martin Reinecke committed
592
593
594
595
596
597
598
599
600
601
602
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];
      d.p2pr[i] -= aci1*d.l2m[i];
      d.p2pi[i] += acr1*d.l2m[i];
      d.p2mr[i] += agi1*d.l2m[i];
      d.p2mi[i] -= agr1*d.l2m[i];

      d.p2pr[i] += agr2*d.l1m[i];
      d.p2pi[i] += agi2*d.l1m[i];
      d.p2mr[i] += acr2*d.l1m[i];
      d.p2mi[i] += aci2*d.l1m[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
Martin Reinecke's avatar
Martin Reinecke committed
603
604
605
606
607
      }
    l+=2;
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
608
MRUTIL_NOINLINE static void calc_alm2map_spin (sharp_job & MRUTIL_RESTRICT job,
Martin Reinecke's avatar
Martin Reinecke committed
609
  const sharp_Ylmgen &gen, sxdata_v & MRUTIL_RESTRICT d, size_t nth)
Martin Reinecke's avatar
Martin Reinecke committed
610
  {
Martin Reinecke's avatar
Martin Reinecke committed
611
612
613
  size_t l,lmax=gen.lmax;
  size_t nv2 = (nth+VLEN-1)/VLEN;
  iter_to_ieee_spin(gen, d, l, nv2);
Martin Reinecke's avatar
Martin Reinecke committed
614
  job.opcnt += (l-gen.mhi) * 7*nth;
Martin Reinecke's avatar
Martin Reinecke committed
615
  if (l>lmax) return;
Martin Reinecke's avatar
Martin Reinecke committed
616
  job.opcnt += (lmax+1-l) * 23*nth;
Martin Reinecke's avatar
Martin Reinecke committed
617

Martin Reinecke's avatar
Martin Reinecke committed
618
  const auto &fx = gen.coef;
Martin Reinecke's avatar
Martin Reinecke committed
619
  const dcmplx * MRUTIL_RESTRICT alm=job.almtmp;
Martin Reinecke's avatar
Martin Reinecke committed
620
621
  bool full_ieee=true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
622
    {
Martin Reinecke's avatar
Martin Reinecke committed
623
624
625
626
    getCorfac(d.scp[i], &d.cfp[i], gen.cf);
    getCorfac(d.scm[i], &d.cfm[i], gen.cf);
    full_ieee &= all_of(d.scp[i]>=sharp_minscale) &&
                 all_of(d.scm[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
627
628
629
630
631
632
633
634
635
636
    }

  while((!full_ieee) && (l<=lmax))
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv agr1=alm[2*l  ].real(), agi1=alm[2*l  ].imag(),
       acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag();
    Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(),
       acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag();
Martin Reinecke's avatar
Martin Reinecke committed
637
638
    full_ieee=true;
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
639
      {
Martin Reinecke's avatar
Martin Reinecke committed
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];

      Tv l2p=d.l2p[i]*d.cfp[i], l2m=d.l2m[i]*d.cfm[i];
      Tv l1m=d.l1m[i]*d.cfm[i], l1p=d.l1p[i]*d.cfp[i];

      d.p1pr[i] += agr1*l2p + aci2*l1p;
      d.p1pi[i] += agi1*l2p - acr2*l1p;
      d.p1mr[i] += acr1*l2p - agi2*l1p;
      d.p1mi[i] += aci1*l2p + agr2*l1p;

      d.p2pr[i] += agr2*l1m - aci1*l2m;
      d.p2pi[i] += agi2*l1m + acr1*l2m;
      d.p2mr[i] += acr2*l1m + agi1*l2m;
      d.p2mi[i] += aci2*l1m - agr1*l2m;

      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
      if (rescale(&d.l1p[i], &d.l2p[i], &d.scp[i], sharp_ftol))
        getCorfac(d.scp[i], &d.cfp[i], gen.cf);
      full_ieee &= all_of(d.scp[i]>=sharp_minscale);
      if (rescale(&d.l1m[i], &d.l2m[i], &d.scm[i], sharp_ftol))
        getCorfac(d.scm[i], &d.cfm[i], gen.cf);
      full_ieee &= all_of(d.scm[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
664
665
666
667
668
      }
    l+=2;
    }
//  if (l>lmax) return;

Martin Reinecke's avatar
Martin Reinecke committed
669
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
670
    {
Martin Reinecke's avatar
Martin Reinecke committed
671
672
673
674
    d.l1p[i] *= d.cfp[i];
    d.l2p[i] *= d.cfp[i];
    d.l1m[i] *= d.cfm[i];
    d.l2m[i] *= d.cfm[i];
Martin Reinecke's avatar
Martin Reinecke committed
675
676
677
    }
  alm2map_spin_kernel(d, fx, alm, l, lmax, nv2);

Martin Reinecke's avatar
Martin Reinecke committed
678
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
679
680
    {
    Tv tmp;
Martin Reinecke's avatar
Martin Reinecke committed
681
682
683
684
    tmp = d.p1pr[i]; d.p1pr[i] -= d.p2mi[i]; d.p2mi[i] += tmp;
    tmp = d.p1pi[i]; d.p1pi[i] += d.p2mr[i]; d.p2mr[i] -= tmp;
    tmp = d.p1mr[i]; d.p1mr[i] += d.p2pi[i]; d.p2pi[i] -= tmp;
    tmp = d.p1mi[i]; d.p1mi[i] -= d.p2pr[i]; d.p2pr[i] += tmp;
Martin Reinecke's avatar
Martin Reinecke committed
685
686
687
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
688
MRUTIL_NOINLINE static void map2alm_spin_kernel(sxdata_v & MRUTIL_RESTRICT d,
Martin Reinecke's avatar
Martin Reinecke committed
689
  const vector<sharp_Ylmgen::dbl2> &fx, dcmplx * MRUTIL_RESTRICT alm,
Martin Reinecke's avatar
Martin Reinecke committed
690
  size_t l, size_t lmax, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
691
  {
Martin Reinecke's avatar
Martin Reinecke committed
692
  size_t lsave=l;
Martin Reinecke's avatar
Martin Reinecke committed
693
694
695
696
697
698
  while (l<=lmax)
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv agr1=0, agi1=0, acr1=0, aci1=0;
    Tv agr2=0, agi2=0, acr2=0, aci2=0;
Martin Reinecke's avatar
Martin Reinecke committed
699
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
700
      {
Martin Reinecke's avatar
Martin Reinecke committed
701
702
703
704
705
706
707
708
709
710
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      agr1 += d.p2mi[i]*d.l2p[i];
      agi1 -= d.p2mr[i]*d.l2p[i];
      acr1 -= d.p2pi[i]*d.l2p[i];
      aci1 += d.p2pr[i]*d.l2p[i];
      agr2 += d.p2pr[i]*d.l1p[i];
      agi2 += d.p2pi[i]*d.l1p[i];
      acr2 += d.p2mr[i]*d.l1p[i];
      aci2 += d.p2mi[i]*d.l1p[i];
      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
Martin Reinecke's avatar
Martin Reinecke committed
711
712
713
714
715
716
717
718
719
720
721
722
      }
    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
    l+=2;
    }
  l=lsave;
  while (l<=lmax)
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv agr1=0, agi1=0, acr1=0, aci1=0;
    Tv agr2=0, agi2=0, acr2=0, aci2=0;
Martin Reinecke's avatar
Martin Reinecke committed
723
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
724
      {
Martin Reinecke's avatar
Martin Reinecke committed
725
726
727
728
729
730
731
732
733
734
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];
      agr1 += d.p1pr[i]*d.l2m[i];
      agi1 += d.p1pi[i]*d.l2m[i];
      acr1 += d.p1mr[i]*d.l2m[i];
      aci1 += d.p1mi[i]*d.l2m[i];
      agr2 -= d.p1mi[i]*d.l1m[i];
      agi2 += d.p1mr[i]*d.l1m[i];
      acr2 += d.p1pi[i]*d.l1m[i];
      aci2 -= d.p1pr[i]*d.l1m[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
Martin Reinecke's avatar
Martin Reinecke committed
735
736
737
738
739
740
741
      }
    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
    l+=2;
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
742
MRUTIL_NOINLINE static void calc_map2alm_spin (sharp_job & MRUTIL_RESTRICT job,
Martin Reinecke's avatar
Martin Reinecke committed
743
  const sharp_Ylmgen &gen, sxdata_v & MRUTIL_RESTRICT d, size_t nth)
Martin Reinecke's avatar
Martin Reinecke committed
744
  {
Martin Reinecke's avatar
Martin Reinecke committed
745
746
747
  size_t l,lmax=gen.lmax;
  size_t nv2 = (nth+VLEN-1)/VLEN;
  iter_to_ieee_spin(gen, d, l, nv2);
Martin Reinecke's avatar
Martin Reinecke committed
748
  job.opcnt += (l-gen.mhi) * 7*nth;
Martin Reinecke's avatar
Martin Reinecke committed
749
  if (l>lmax) return;
Martin Reinecke's avatar
Martin Reinecke committed
750
  job.opcnt += (lmax+1-l) * 23*nth;
Martin Reinecke's avatar
Martin Reinecke committed
751

Martin Reinecke's avatar
Martin Reinecke committed
752
  const auto &fx = gen.coef;
Martin Reinecke's avatar
Martin Reinecke committed
753
  dcmplx * MRUTIL_RESTRICT alm=job.almtmp;
Martin Reinecke's avatar
Martin Reinecke committed
754
755
  bool full_ieee=true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
756
    {
Martin Reinecke's avatar
Martin Reinecke committed
757
758
759
760
    getCorfac(d.scp[i], &d.cfp[i], gen.cf);
    getCorfac(d.scm[i], &d.cfm[i], gen.cf);
    full_ieee &= all_of(d.scp[i]>=sharp_minscale) &&
                 all_of(d.scm[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
761
    }
Martin Reinecke's avatar
Martin Reinecke committed
762
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
763
764
    {
    Tv tmp;
Martin Reinecke's avatar
Martin Reinecke committed
765
766
767
768
    tmp = d.p1pr[i]; d.p1pr[i] -= d.p2mi[i]; d.p2mi[i] += tmp;
    tmp = d.p1pi[i]; d.p1pi[i] += d.p2mr[i]; d.p2mr[i] -= tmp;
    tmp = d.p1mr[i]; d.p1mr[i] += d.p2pi[i]; d.p2pi[i] -= tmp;
    tmp = d.p1mi[i]; d.p1mi[i] -= d.p2pr[i]; d.p2pr[i] += tmp;
Martin Reinecke's avatar
Martin Reinecke committed
769
770
771
772
773
774
775
776
777
    }

  while((!full_ieee) && (l<=lmax))
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv agr1=0, agi1=0, acr1=0, aci1=0;
    Tv agr2=0, agi2=0, acr2=0, aci2=0;
    full_ieee=1;
Martin Reinecke's avatar
Martin Reinecke committed
778
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
779
      {
Martin Reinecke's avatar
Martin Reinecke committed
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];
      Tv l2p = d.l2p[i]*d.cfp[i], l2m = d.l2m[i]*d.cfm[i];
      Tv l1p = d.l1p[i]*d.cfp[i], l1m = d.l1m[i]*d.cfm[i];
      agr1 += d.p1pr[i]*l2m + d.p2mi[i]*l2p;
      agi1 += d.p1pi[i]*l2m - d.p2mr[i]*l2p;
      acr1 += d.p1mr[i]*l2m - d.p2pi[i]*l2p;
      aci1 += d.p1mi[i]*l2m + d.p2pr[i]*l2p;
      agr2 += d.p2pr[i]*l1p - d.p1mi[i]*l1m;
      agi2 += d.p2pi[i]*l1p + d.p1mr[i]*l1m;
      acr2 += d.p2mr[i]*l1p + d.p1pi[i]*l1m;
      aci2 += d.p2mi[i]*l1p - d.p1pr[i]*l1m;

      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
      if (rescale(&d.l1p[i], &d.l2p[i], &d.scp[i], sharp_ftol))
        getCorfac(d.scp[i], &d.cfp[i], gen.cf);
      full_ieee &= all_of(d.scp[i]>=sharp_minscale);
      if (rescale(&d.l1m[i], &d.l2m[i], &d.scm[i], sharp_ftol))
        getCorfac(d.scm[i], &d.cfm[i], gen.cf);
      full_ieee &= all_of(d.scm[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
801
802
803
804
805
806
807
      }
    vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]);
    vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]);
    l+=2;
    }
  if (l>lmax) return;

Martin Reinecke's avatar
Martin Reinecke committed
808
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
809
    {
Martin Reinecke's avatar
Martin Reinecke committed
810
811
812
813
    d.l1p[i] *= d.cfp[i];
    d.l2p[i] *= d.cfp[i];
    d.l1m[i] *= d.cfm[i];
    d.l2m[i] *= d.cfm[i];
Martin Reinecke's avatar
Martin Reinecke committed
814
815
816
817
818
    }
  map2alm_spin_kernel(d, fx, alm, l, lmax, nv2);
  }


Martin Reinecke's avatar
Martin Reinecke committed
819
MRUTIL_NOINLINE static void alm2map_deriv1_kernel(sxdata_v & MRUTIL_RESTRICT d,
Martin Reinecke's avatar
Martin Reinecke committed
820
  const vector<sharp_Ylmgen::dbl2> &fx, const dcmplx * MRUTIL_RESTRICT alm,
Martin Reinecke's avatar
Martin Reinecke committed
821
  size_t l, size_t lmax, size_t nv2)
Martin Reinecke's avatar
Martin Reinecke committed
822
  {
Martin Reinecke's avatar
Martin Reinecke committed
823
  size_t lsave=l;
Martin Reinecke's avatar
Martin Reinecke committed
824
825
826
827
828
829
  while (l<=lmax)
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag(),
       ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Martin Reinecke's avatar
Martin Reinecke committed
830
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
831
      {
Martin Reinecke's avatar
Martin Reinecke committed
832
833
834
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      d.p1pr[i] += ar1*d.l2p[i];
      d.p1pi[i] += ai1*d.l2p[i];
Martin Reinecke's avatar
Martin Reinecke committed
835

Martin Reinecke's avatar
Martin Reinecke committed
836
837
838
      d.p1mr[i] -= ai2*d.l1p[i];
      d.p1mi[i] += ar2*d.l1p[i];
      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
Martin Reinecke's avatar
Martin Reinecke committed
839
840
841
842
843
844
845
846
847
848
      }
    l+=2;
    }
  l=lsave;
  while (l<=lmax)
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag(),
       ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Martin Reinecke's avatar
Martin Reinecke committed
849
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
850
      {
Martin Reinecke's avatar
Martin Reinecke committed
851
852
853
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];
      d.p2mr[i] += ai1*d.l2m[i];
      d.p2mi[i] -= ar1*d.l2m[i];
Martin Reinecke's avatar
Martin Reinecke committed
854

Martin Reinecke's avatar
Martin Reinecke committed
855
856
857
      d.p2pr[i] += ar2*d.l1m[i];
      d.p2pi[i] += ai2*d.l1m[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
Martin Reinecke's avatar
Martin Reinecke committed
858
859
860
861
862
      }
    l+=2;
    }
  }

Martin Reinecke's avatar
Martin Reinecke committed
863
MRUTIL_NOINLINE static void calc_alm2map_deriv1(sharp_job & MRUTIL_RESTRICT job,
Martin Reinecke's avatar
Martin Reinecke committed
864
  const sharp_Ylmgen &gen, sxdata_v & MRUTIL_RESTRICT d, size_t nth)
Martin Reinecke's avatar
Martin Reinecke committed
865
  {
Martin Reinecke's avatar
Martin Reinecke committed
866
867
868
  size_t l,lmax=gen.lmax;
  size_t nv2 = (nth+VLEN-1)/VLEN;
  iter_to_ieee_spin(gen, d, l, nv2);
Martin Reinecke's avatar
Martin Reinecke committed
869
  job.opcnt += (l-gen.mhi) * 7*nth;
Martin Reinecke's avatar
Martin Reinecke committed
870
  if (l>lmax) return;
Martin Reinecke's avatar
Martin Reinecke committed
871
  job.opcnt += (lmax+1-l) * 15*nth;
Martin Reinecke's avatar
Martin Reinecke committed
872

Martin Reinecke's avatar
Martin Reinecke committed
873
  const auto &fx = gen.coef;
Martin Reinecke's avatar
Martin Reinecke committed
874
  const dcmplx * MRUTIL_RESTRICT alm=job.almtmp;
Martin Reinecke's avatar
Martin Reinecke committed
875
876
  bool full_ieee=true;
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
877
    {
Martin Reinecke's avatar
Martin Reinecke committed
878
879
880
881
    getCorfac(d.scp[i], &d.cfp[i], gen.cf);
    getCorfac(d.scm[i], &d.cfm[i], gen.cf);
    full_ieee &= all_of(d.scp[i]>=sharp_minscale) &&
                 all_of(d.scm[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
882
883
884
885
886
887
888
889
    }

  while((!full_ieee) && (l<=lmax))
    {
    Tv fx10=fx[l+1].a,fx11=fx[l+1].b;
    Tv fx20=fx[l+2].a,fx21=fx[l+2].b;
    Tv ar1=alm[l  ].real(), ai1=alm[l  ].imag(),
       ar2=alm[l+1].real(), ai2=alm[l+1].imag();
Martin Reinecke's avatar
Martin Reinecke committed
890
891
    full_ieee=true;
    for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
892
      {
Martin Reinecke's avatar
Martin Reinecke committed
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
      d.l1p[i] = (d.cth[i]*fx10 - fx11)*d.l2p[i] - d.l1p[i];
      d.l1m[i] = (d.cth[i]*fx10 + fx11)*d.l2m[i] - d.l1m[i];

      Tv l2p=d.l2p[i]*d.cfp[i], l2m=d.l2m[i]*d.cfm[i];
      Tv l1m=d.l1m[i]*d.cfm[i], l1p=d.l1p[i]*d.cfp[i];

      d.p1pr[i] += ar1*l2p;
      d.p1pi[i] += ai1*l2p;
      d.p1mr[i] -= ai2*l1p;
      d.p1mi[i] += ar2*l1p;

      d.p2pr[i] += ar2*l1m;
      d.p2pi[i] += ai2*l1m;
      d.p2mr[i] += ai1*l2m;
      d.p2mi[i] -= ar1*l2m;

      d.l2p[i] = (d.cth[i]*fx20 - fx21)*d.l1p[i] - d.l2p[i];
      d.l2m[i] = (d.cth[i]*fx20 + fx21)*d.l1m[i] - d.l2m[i];
      if (rescale(&d.l1p[i], &d.l2p[i], &d.scp[i], sharp_ftol))
        getCorfac(d.scp[i], &d.cfp[i], gen.cf);
      full_ieee &= all_of(d.scp[i]>=sharp_minscale);
      if (rescale(&d.l1m[i], &d.l2m[i], &d.scm[i], sharp_ftol))
        getCorfac(d.scm[i], &d.cfm[i], gen.cf);
      full_ieee &= all_of(d.scm[i]>=sharp_minscale);
Martin Reinecke's avatar
Martin Reinecke committed
917
918
919
920
921
      }
    l+=2;
    }
//  if (l>lmax) return;

Martin Reinecke's avatar
Martin Reinecke committed
922
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
923
    {
Martin Reinecke's avatar
Martin Reinecke committed
924
925
926
927
    d.l1p[i] *= d.cfp[i];
    d.l2p[i] *= d.cfp[i];
    d.l1m[i] *= d.cfm[i];
    d.l2m[i] *= d.cfm[i];
Martin Reinecke's avatar
Martin Reinecke committed
928
929
930
    }
  alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2);

Martin Reinecke's avatar
Martin Reinecke committed
931
  for (size_t i=0; i<nv2; ++i)
Martin Reinecke's avatar
Martin Reinecke committed
932
933
    {
    Tv tmp;
Martin Reinecke's avatar
Martin Reinecke committed
934
935
936
937
    tmp = d.p1pr[i]; d.p1pr[i] -= d.p2mi[i]; d.p2mi[i] += tmp;
    tmp = d.p1pi[i]; d.p1pi[i] += d.p2mr[i]; d.p2mr[i] -= tmp;
    tmp = d.p1mr[i]; d.p1mr[i] += d.p2pi[i]; d.p2pi[i] -= tmp;
    tmp = d.p1mi[i]; d.p1mi[i] -= d.p2pr[i]; d.p2pr[i] += tmp;
Martin Reinecke's avatar
Martin Reinecke committed
938
939
940
941
942
943
    }
  }


#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)

Martin Reinecke's avatar
Martin Reinecke committed
944
MRUTIL_NOINLINE static void inner_loop_a2m(sharp_job &job, const vector<bool> & ispair,
Martin Reinecke's avatar
Martin Reinecke committed
945
946
  const vector<double> &cth_, const vector<double> &sth_, size_t llim, size_t ulim,
  sharp_Ylmgen &gen, size_t mi, const vector<size_t> &mlim)
Martin Reinecke's avatar
Martin Reinecke committed
947
  {
Martin Reinecke's avatar
Martin Reinecke committed
948
  const size_t m = job.ainfo.mval(mi);
Martin Reinecke's avatar
Martin Reinecke committed
949
  gen.prepare(m);
Martin Reinecke's avatar
Martin Reinecke committed
950

Martin Reinecke's avatar
Martin Reinecke committed
951
  switch (job.type)
Martin Reinecke's avatar
Martin Reinecke committed
952
953
954
955
    {
    case SHARP_ALM2MAP:
    case SHARP_ALM2MAP_DERIV1:
      {
Martin Reinecke's avatar
Martin Reinecke committed
956
      if (job.spin==0)
Martin Reinecke's avatar
Martin Reinecke committed
957
958
        {
        //adjust the a_lm for the new algorithm
Martin Reinecke's avatar
Martin Reinecke committed
959
        dcmplx * MRUTIL_RESTRICT alm=job.almtmp;
Martin Reinecke's avatar
Martin Reinecke committed
960
        for (size_t il=0, l=gen.m; l<=gen.lmax; ++il,l+=2)
Martin Reinecke's avatar
Martin Reinecke committed
961
962
          {
          dcmplx al = alm[l];
Martin Reinecke's avatar
Martin Reinecke committed
963
964
965
966
          dcmplx al1 = (l+1>gen.lmax) ? 0. : alm[l+1];
          dcmplx al2 = (l+2>gen.lmax) ? 0. : alm[l+2];
          alm[l  ] = gen.alpha[il]*(gen.eps[l+1]*al + gen.eps[l+2]*al2);
          alm[l+1] = gen.