Commit 269309d9 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

Merge branch 'ducc0' into vdot_experiments

parents 8eee3030 4948d494
Pipeline #104822 passed with stages
in 17 minutes and 37 seconds
......@@ -42,6 +42,7 @@ def test(lmax, geometry, spin, nthreads=1):
alm2 = ducc0.sht.experimental.analysis_2d(alm=alm2, map=map, lmax=lmax, spin=spin, geometry=geometry, nthreads=nthreads)
print(time()-t0)
print("L2 error after full round-trip:", ducc0.misc.l2error(alm2,alm))
print("L_inf error after full round-trip:", np.max(np.abs(alm2-alm)))
nthr=8
for l0 in [4095]:
......
......@@ -133,6 +133,13 @@ template<typename T1> py::object Py2_vdot(const py::array &a, const py::array &b
}
py::object Py_vdot(const py::object &a, const py::object &b)
{
if ((!isPyarr(a)) || (py::array(a).ndim()==0)) // scalars
{
auto xa = py::cast<complex<long double>>(a),
xb = py::cast<complex<long double>>(b);
auto res = conj(xa)*xb;
return (res.imag()==0) ? py::cast(res.real()) : py::cast(res);
}
if (isPyarr<float>(a))
return Py2_vdot<float>(a,b);
if (isPyarr<double>(a))
......@@ -145,11 +152,7 @@ py::object Py_vdot(const py::object &a, const py::object &b)
return Py2_vdot<complex<double>>(a,b);
if (isPyarr<complex<long double>>(a))
return Py2_vdot<complex<long double>>(a,b);
// no arrays, so we assume a and b are scalars
auto xa = py::cast<complex<long double>>(a),
xb = py::cast<complex<long double>>(b);
auto res = conj(xa)*xb;
return (res.imag()==0) ? py::cast(res.real()) : py::cast(res);
MR_fail("type matching failed");
}
constexpr const char *Py_l2error_DS = R"""(
......@@ -238,6 +241,13 @@ template<typename T1> double Py2_l2error(const py::array &a, const py::array &b)
}
double Py_l2error(const py::object &a, const py::object &b)
{
if ((!isPyarr(a)) || (py::array(a).ndim()==0)) // scalars
{
auto xa = py::cast<complex<long double>>(a),
xb = py::cast<complex<long double>>(b);
auto res = abs(xa-xb)/max(abs(xa), abs(xb));
return double(res);
}
if (isPyarr<float>(a))
return Py2_l2error<float>(a,b);
if (isPyarr<double>(a))
......@@ -250,11 +260,7 @@ double Py_l2error(const py::object &a, const py::object &b)
return Py2_l2error<complex<double>>(a,b);
if (isPyarr<complex<long double>>(a))
return Py2_l2error<complex<long double>>(a,b);
// no arrays, so we assume a and b are scalars
auto xa = py::cast<complex<long double>>(a),
xb = py::cast<complex<long double>>(b);
auto res = abs(xa-xb)/max(abs(xa), abs(xb));
return double(res);
MR_fail("type matching failed");
}
py::array Py_GL_weights(size_t nlat, size_t nlon)
......
......@@ -38,6 +38,9 @@ using stride_t=fmav_info::stride_t;
namespace py = pybind11;
bool isPyarr(const py::object &obj)
{ return py::isinstance<py::array>(obj); }
template<typename T> bool isPyarr(const py::object &obj)
{ return py::isinstance<py::array_t<T>>(obj); }
......
......@@ -688,6 +688,7 @@ template <typename Tsimd, typename Titer> DUCC0_NOINLINE void copy_input(const T
auto ptr = &src.craw(it.iofs_uni(0,0));
auto jstr = it.unistride_i();
auto istr = it.stride_in();
// FIXME: flip loops to avoid critical strides?
if (istr==1)
for (size_t i=0; i<it.length_in(); ++i)
for (size_t j=0; j<vlen; ++j)
......
......@@ -1245,16 +1245,15 @@ template <typename Tfs> class cfft_multipass: public cfftpass<Tfs>
{
private:
using typename cfftpass<Tfs>::Tcs;
static constexpr size_t bunchsize=8;
const size_t l1, ido;
size_t ip;
vector<Tcpass<Tfs>> passes;
size_t bufsz;
bool need_cpy;
aligned_array<Tcs> wa;
auto WA(size_t x, size_t i) const
{ return wa[(i-1)*(ip-1)+x]; }
size_t rfct;
Troots<Tfs> myroots;
template<bool fwd, typename T> Cmplx<T> *exec_(Cmplx<T> *cc, Cmplx<T> *ch,
Cmplx<T> *buf) const
......@@ -1288,121 +1287,282 @@ template <typename Tfs> class cfft_multipass: public cfftpass<Tfs>
auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc&
{ return cc[a+ido*(b+ip*c)]; };
for (size_t itrans=0; itrans<nvtrans; ++itrans)
if (ido==1)
{
size_t k0=(itrans*vlen)/ido;
if (k0==(itrans*vlen+vlen-1)/ido) // k is constant for all vlen transforms
for (size_t itrans=0; itrans<nvtrans; ++itrans)
{
size_t i0 = (itrans*vlen)%ido;
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<vlen; ++n)
{
cc2[m].r[n] = CC(i0+n,m,k0).r;
cc2[m].i[n] = CC(i0+n,m,k0).i;
size_t k = min(l1-1, itrans*vlen+n);
cc2[m].r[n] = CC(0,m,k).r;
cc2[m].i[n] = CC(0,m,k).i;
}
Tcv *p1=cc2, *p2=ch2;
for(const auto &pass: passes)
{
auto res = any_cast<Tcv *>(pass->exec(p1, p2, buf2, fwd));
if (res==p2) swap (p1,p2);
}
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<vlen; ++n)
{
auto k = min(l1-1, itrans*vlen+n);
CH(0,k,m) = { p1[m].r[n], p1[m].i[n] };
}
}
else
return ch;
}
if (l1==1)
{
for (size_t itrans=0; itrans<nvtrans; ++itrans)
{
for (size_t n=0; n<vlen; ++n)
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<vlen; ++n)
{
size_t i = min(ido-1, itrans*vlen+n);
cc2[m].r[n] = CC(i,m,0).r;
cc2[m].i[n] = CC(i,m,0).i;
}
Tcv *p1=cc2, *p2=ch2;
for(const auto &pass: passes)
{
auto i = (itrans*vlen+n)%ido;
auto k = min(l1-1,(itrans*vlen+n)/ido);
for (size_t m=0; m<ip; ++m)
auto res = any_cast<Tcv *>(pass->exec(p1, p2, buf2, fwd));
if (res==p2) swap (p1,p2);
}
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<vlen; ++n)
{
cc2[m].r[n] = CC(i,m,k).r;
cc2[m].i[n] = CC(i,m,k).i;
auto i = itrans*vlen+n;
if (i >= ido) break;
if (i==0)
CC(0,m,0) = { p1[m].r[n], p1[m].i[n] };
else
{
if (m==0)
CC(i,0,0) = { p1[0].r[n], p1[0].i[n] } ;
else
CC(i,m,0) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>((*myroots)[rfct*m*i]);
}
}
}
return cc;
}
for (size_t itrans=0; itrans<nvtrans; ++itrans)
{
array<size_t, vlen> ix, kx;
size_t ixcur = (itrans*vlen)%ido;
size_t kxcur = (itrans*vlen)/ido;
for (size_t n=0; n<vlen; ++n)
{
ix[n] = ixcur;
kx[n] = min(l1-1,kxcur);
if (++ixcur==ido)
{
ixcur=0;
++kxcur;
}
}
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<vlen; ++n)
{
cc2[m].r[n] = CC(ix[n],m,kx[n]).r;
cc2[m].i[n] = CC(ix[n],m,kx[n]).i;
}
Tcv *p1=cc2, *p2=ch2;
for(const auto &pass: passes)
{
auto res = any_cast<Tcv *>(pass->exec(p1, p2, buf2, fwd));
if (res==p2) swap (p1,p2);
}
for (size_t n=0; n<vlen; ++n)
{
auto i = (itrans*vlen+n)%ido;
auto k = (itrans*vlen+n)/ido;
if (k>=l1) break;
if (l1>1)
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<vlen; ++n)
{
auto i = ix[n];
auto k = kx[n];
if (itrans*vlen+n >= l1*ido) break;
if (i==0)
for (size_t m=0; m<ip; ++m)
CH(0,k,m) = { p1[m].r[n], p1[m].i[n] };
CH(0,k,m) = { p1[m].r[n], p1[m].i[n] };
else
{
CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ;
for (size_t m=1; m<ip; ++m)
CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>(WA(m-1,i));
if (m==0)
CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ;
else
CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>((*myroots)[rfct*l1*m*i]);
}
}
else
}
return ch;
}
else
{
if (ido==1)
{
// parallelize here!
for (size_t n=0; n<l1; ++n)
{
Cmplx<T> *p1=&cc[n*ip], *p2=ch;
Cmplx<T> *res = nullptr;
for(const auto &pass: passes)
{
if (i==0)
for (size_t m=0; m<ip; ++m)
CC(0,m,0) = {p1[m].r[n], p1[m].i[n]};
res = any_cast<Cmplx<T> *>(pass->exec(p1, p2, buf, fwd));
if (res==p2) swap (p1,p2);
}
if (res != &cc[n*ip])
// FIXME: use std::copy()
for (size_t m=0; m<ip; ++m)
cc[n*ip+m] = res[m];
}
// transpose
size_t nbunch = (l1*ido + bunchsize-1)/bunchsize;
// parallelize here!
for (size_t ibunch=0; ibunch<nbunch; ++ibunch)
{
size_t ntrans = min(bunchsize, l1-ibunch*bunchsize);
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<ntrans; ++n)
{
size_t itrans = ibunch*bunchsize + n;
ch[itrans+m*l1] = cc[m+itrans*ip];
}
}
return ch;
}
if (l1==1)
{
auto cc2 = &buf[0];
auto ch2 = &buf[bunchsize*ip];
auto buf2 = &buf[(bunchsize+1)*ip];
size_t nbunch = (ido + bunchsize-1)/bunchsize;
auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc&
{ return ch[a+ido*(b+l1*c)]; };
auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc&
{ return cc[a+ido*(b+ip*c)]; };
// parallelize here!
for (size_t ibunch=0; ibunch<nbunch; ++ibunch)
{
size_t ntrans = min(bunchsize, ido-ibunch*bunchsize);
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<ntrans; ++n)
cc2[m+n*ip] = CC(n+ibunch*bunchsize,m,0);
for (size_t n=0; n<ntrans; ++n)
{
auto i = n+ibunch*bunchsize;
Cmplx<T> *p1=&cc2[n*ip], *p2=ch2;
Cmplx<T> *res = nullptr;
for(const auto &pass: passes)
{
res = any_cast<Cmplx<T> *>(pass->exec(p1, p2, buf2, fwd));
if (res==p2) swap (p1,p2);
}
if (res==&cc2[n*ip]) // no copying necessary
{
if (i!=0)
{
for (size_t m=1; m<ip; ++m)
cc2[n*ip+m] = cc2[n*ip+m].template special_mul<fwd>((*myroots)[rfct*m*i]);
}
}
else
{
CC(i,0,0) = Tcs(p1[0].r[n], p1[0].i[n]);
for (size_t m=1; m<ip; ++m)
CC(i,m,0) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul<fwd>(WA(m-1,i));
if (i==0)
for (size_t m=0; m<ip; ++m)
cc2[n*ip+m] = res[m];
else
{
cc2[n*ip] = res[0];
for (size_t m=1; m<ip; ++m)
cc2[n*ip+m] = res[m].template special_mul<fwd>((*myroots)[rfct*m*i]);
}
}
}
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<ntrans; ++n)
CC(n+ibunch*bunchsize, m, 0) = cc2[m+n*ip];
}
return cc;
}
return (l1>1) ? ch : cc;
}
else
{
auto cc2 = &buf[0];
auto ch2 = &buf[ip];
auto buf2 = &buf[2*ip];
auto ch2 = &buf[bunchsize*ip];
auto buf2 = &buf[(bunchsize+1)*ip];
size_t nbunch = (l1*ido + bunchsize-1)/bunchsize;
auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc&
{ return ch[a+ido*(b+l1*c)]; };
auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc&
{ return cc[a+ido*(b+ip*c)]; };
for (size_t k=0; k<l1; ++k)
for (size_t i=0; i<ido; ++i)
// parallelize here!
for (size_t ibunch=0; ibunch<nbunch; ++ibunch)
{
size_t ntrans = min(bunchsize, l1*ido-ibunch*bunchsize);
array<size_t, bunchsize> ix, kx;
size_t ixcur = (ibunch*bunchsize)%ido;
size_t kxcur = (ibunch*bunchsize)/ido;
for (size_t n=0; n<bunchsize; ++n)
{
for (size_t m=0; m<ip; ++m)
cc2[m] = CC(i,m,k);
ix[n] = ixcur;
kx[n] = min(l1-1,kxcur);
if (++ixcur==ido)
{
ixcur=0;
++kxcur;
}
}
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<ntrans; ++n)
cc2[m+n*ip] = CC(ix[n],m,kx[n]);
Cmplx<T> *p1=cc2, *p2=ch2;
for (size_t n=0; n<ntrans; ++n)
{
auto i = ix[n];
Cmplx<T> *p1=&cc2[n*ip], *p2=ch2;
Cmplx<T> *res = nullptr;
for(const auto &pass: passes)
{
auto res = any_cast<Cmplx<T> *>(pass->exec(p1, p2, buf2, fwd));
res = any_cast<Cmplx<T> *>(pass->exec(p1, p2, buf2, fwd));
if (res==p2) swap (p1,p2);
}
if (l1>1)
if (res==&cc2[n*ip]) // no copying necessary
{
if (i==0)
for (size_t m=0; m<ip; ++m)
CH(0,k,m) = p1[m];
else
if (i!=0)
{
CH(i,k,0) = p1[0];
for (size_t m=1; m<ip; ++m)
CH(i,k,m) = p1[m].template special_mul<fwd>(WA(m-1,i));
cc2[n*ip+m] = cc2[n*ip+m].template special_mul<fwd>((*myroots)[rfct*l1*m*i]);
}
}
else
{
if (i==0)
for (size_t m=0; m<ip; ++m)
CC(0,m,0) = p1[m];
cc2[n*ip+m] = res[m];
else
{
CC(i,0,0) = p1[0];
cc2[n*ip] = res[0];
for (size_t m=1; m<ip; ++m)
CC(i,m,0) = p1[m].template special_mul<fwd>(WA(m-1,i));
cc2[n*ip+m] = res[m].template special_mul<fwd>((*myroots)[rfct*l1*m*i]);
}
}
}
return (l1>1) ? ch : cc;
for (size_t m=0; m<ip; ++m)
for (size_t n=0; n<ntrans; ++n)
CH(ix[n], kx[n], m) = cc2[m+n*ip];
}
return ch;
}
}
}
......@@ -1411,24 +1571,21 @@ template <typename Tfs> class cfft_multipass: public cfftpass<Tfs>
cfft_multipass(size_t l1_, size_t ido_, size_t ip_,
const Troots<Tfs> &roots, bool vectorize=false)
: l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false),
wa((ip-1)*(ido-1))
myroots(roots)
{
size_t N=ip*l1*ido;
auto rfct = roots->size()/N;
rfct = roots->size()/N;
MR_assert(roots->size()==N*rfct, "mismatch");
for (size_t j=1; j<ip; ++j)
for (size_t i=1; i<ido; ++i)
wa[(j-1)+(i-1)*(ip-1)] = (*roots)[rfct*j*l1*i];
// FIXME TBD
size_t lim = vectorize ? 1000 : ~size_t(0);
size_t lim = vectorize ? 1000 : 10000; //~size_t(0);
if (ip<=lim)
{
auto factors = cfftpass<Tfs>::factorize(ip);
size_t l1l=1;
for (auto fct: factors)
{
passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(fct*l1l), fct, roots, vectorize));
passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(fct*l1l), fct, roots, false));
l1l*=fct;
}
}
......@@ -1454,7 +1611,7 @@ template <typename Tfs> class cfft_multipass: public cfftpass<Tfs>
if ((l1!=1)||(ido!=1))
{
need_cpy=true;
bufsz += 2*ip;
bufsz += (bunchsize+1)*ip;
}
}
......@@ -1490,6 +1647,7 @@ template <size_t vlen, typename Tfs> class cfftp_vecpass: public cfftpass<Tfs>
auto res = any_cast<Tcs *>(spass->exec(cc, reinterpret_cast<Tcs *>(ch2),
reinterpret_cast<Tcs *>(buf2), fwd));
// arrange input in SIMD-friendly way
// FIXME: swap loops?
for (size_t i=0; i<ip/vlen; ++i)
for (size_t j=0; j<vlen; ++j)
{
......
......@@ -169,18 +169,18 @@ struct ft_partial_sph_isometry_plan
static double Y_index_j_eq_i(int l, int i) // l>=0, i>=0, j>=i
{
auto r1 = abs(Gy_index3_a1(l, 2*l-i , i));
auto r1 = abs(Gy_index3_a1(l, 2*l-i, i));
auto r3 = abs(Gy_index3_a2(l, 2*l-i+2));
return (r1+r3)*0.25/double((2*l+1)*(2*l+3));
return (r1+r3)*0.25;
}
static double Y_index_j_eq_i_plus_2(int l, int i) // l>=0, i>=0, j>=i
{
auto r1 = Gy_index3_a1(l, 2*l-i , i)*Gy_index3_a2(l, 2*l-i);
return copysign(sqrt(abs(r1)),r1)*0.25/double((2*l+1)*(2*l+3));
auto r1 = Gy_index3_a1(l, 2*l-i, i)*Gy_index3_a2(l, 2*l-i);
return copysign(sqrt(abs(r1)),r1)*0.25;
}
static double Z_index(int l, int i, int j)
{
return (i==j) ? (j+1)*(2*l+1-j)/double((2*l+1)*(2*l+3)) : 0.0;
return (i==j) ? (j+1)*(2*l+1-j) : 0.0;
}
struct ft_symmetric_tridiagonal
......@@ -198,7 +198,7 @@ struct ft_partial_sph_isometry_plan
}
};
class ft_symmetric_tridiagonal_symmetric_eigen
template<bool high_accuracy> class ft_symmetric_tridiagonal_symmetric_eigen
{
private:
vector<double> A, B, C;
......@@ -242,9 +242,19 @@ struct ft_partial_sph_isometry_plan
Tv maxnrm(0);
for (size_t i=0; i<N; ++i)
{
auto vkm1 = (A[k]*X[i]+B[k])*vk[i] - C[k]*vkp1[i];
auto vkm2 = (A[k-1]*X[i]+B[k-1])*vkm1 - C[k-1]*vk[i];
auto vkm3 = (A[k-2]*X[i]+B[k-2])*vkm2 - C[k-2]*vkm1;
Tv vkm1, vkm2, vkm3;
if constexpr(high_accuracy)
{
vkm1 = A[k ]*((X[i]+B[k ])*vk[i] - C[k ]*vkp1[i]);
vkm2 = A[k-1]*((X[i]+B[k-1])*vkm1 - C[k-1]*vk[i]);
vkm3 = A[k-2]*((X[i]+B[k-2])*vkm2 - C[k-2]*vkm1);
}
else
{
vkm1 = (A[k ]*X[i]+B[k ])*vk[i] - C[k ]*vkp1[i];
vkm2 = (A[k-1]*X[i]+B[k-1])*vkm1 - C[k-1]*vk[i];
vkm3 = (A[k-2]*X[i]+B[k-2])*vkm2 - C[k-2]*vkm1;
}
vkp1[i] = vkm2;
vk[i] = vkm3;
nrm[i] += vkm1*vkm1 + vkm2*vkm2 + vkm3*vkm3;
......@@ -266,7 +276,11 @@ struct ft_partial_sph_isometry_plan
Tv maxnrm(0);
for (size_t i=0; i<N; ++i)
{
auto vkm1 = (A[k]*X[i]+B[k])*vk[i] - C[k]*vkp1[i];
Tv vkm1;
if constexpr(high_accuracy)
vkm1 = A[k]*((X[i]+B[k])*vk[i] - C[k]*vkp1[i]);
else
vkm1 = (A[k]*X[i]+B[k])*vk[i] - C[k]*vkp1[i];
vkp1[i] = vk[i];
vk[i] = vkm1;
nrm[i] += vkm1*vkm1;
......@@ -312,13 +326,24 @@ struct ft_partial_sph_isometry_plan
if (n>1)
{
A[n-1] = 1/T.b[n-2];
B[n-1] = -T.a[n-1]/T.b[n-2];
if constexpr(high_accuracy)
B[n-1] = -T.a[n-1];
else
B[n-1] = -T.a[n-1]/T.b[n-2];
}
for (int i=n-2; i>0; i--)
{
A[i] = 1/T.b[i-1];
B[i] = -T.a[i]/T.b[i-1];
C[i] = T.b[i]/T.b[i-1];
if constexpr(high_accuracy)
{
B[i] = -T.a[i];
C<