Commit 4b36a29f authored by Martin Reinecke's avatar Martin Reinecke
Browse files

Merge branch 'convolution_work' into 'master'

Convolution work

See merge request mtr/cxxbase!7
parents 474ffe65 ed1b7cf7
......@@ -42,11 +42,13 @@ def convolve(alm1, alm2, lmax):
# NOTE(review): this span is a merged diff fragment — it contains both the
# removed and the added version of several lines (`separate`, `ofactor`,
# `nthreads` and `ncomp2` are each assigned twice).  As plain Python the
# later assignment simply wins, but the duplicates should be cleaned up
# before this is treated as a real script.
lmax=60
kmax=13
ncomp=1
separate=True
ncomp2 = ncomp if separate else 1
separate=False
# number of pointings to interpolate at
nptg = 1000000
# requested interpolation accuracy
epsilon = 1e-4
ofactor = 2
nthreads = 0
# grid oversampling factor — presumably passed to PyInterpolator below; TODO confirm
ofactor = 1.5
nthreads = 0 # use as many threads as available
# number of output components: all combined into one unless `separate` is set
ncomp2 = ncomp if separate else 1
# get random sky a_lm
# the a_lm arrays follow the same conventions as those in healpy
......@@ -93,13 +95,14 @@ plt.imshow(bar2-bar[:,:,0])
plt.show()
# NOTE(review): diff fragment — both the old hard-coded-size line and the new
# nptg-based line are present; the second assignment to `ptg` wins.
ptg=np.random.uniform(0.,1.,3*1000000).reshape(1000000,3)
ptg=np.random.uniform(0.,1.,3*nptg).reshape(nptg,3)
# scale the random triples to angle ranges: column 0 to [0, pi),
# columns 1 and 2 to [0, 2*pi) — presumably (theta, phi, psi); TODO confirm
ptg[:,0]*=np.pi
ptg[:,1]*=2*np.pi
ptg[:,2]*=2*np.pi
#foo = pyinterpol_ng.PyInterpolator(slm,blm,separate,lmax, kmax, epsilon=1e-6, nthreads=2)
t0=time.time()
# NOTE(review): the construction of `foo` above is commented out, so this
# fragment cannot run as-is — the full file must define `foo` elsewhere.
bar=foo.interpol(ptg)
del foo
print("interpolation time: ", time.time()-t0)
# random adjoint-test data, one row per pointing
fake = np.random.uniform(0.,1., (ptg.shape[0],ncomp2))
foo2 = pyinterpol_ng.PyInterpolator(lmax, kmax, ncomp2, epsilon=epsilon, ofactor=ofactor, nthreads=nthreads)
......
......@@ -22,6 +22,96 @@
namespace mr {
namespace detail_fft {
using std::vector;
/// Allocates aligned scratch storage for one pass of the 1D convolution
/// driver.  The buffer holds `len` elements per SIMD lane when there are at
/// least `vlen` independent 1D lines to process along `axis`, otherwise a
/// single scalar line of `len` elements.
template<typename T, typename T0> aligned_array<T> alloc_tmp_conv
  (const fmav_info &info, size_t axis, size_t len)
  {
  constexpr auto vlen = native_simd<T0>::size();
  // number of independent 1D lines along the transform axis
  const size_t nlines = info.size()/info.shape(axis);
  // vectorize only when enough lines exist to fill all SIMD lanes
  const size_t nelem = (nlines>=vlen) ? len*vlen : len;
  return aligned_array<T>(nelem);
  }
/// Shared driver for FFT-based 1D convolution/resampling of `in` along
/// `axis` into `out`, parallelized over the remaining axes.
///
/// `kernel` supplies one real factor per frequency of the *shorter* of the
/// two transforms; the actual per-line work (forward FFT, kernel multiply,
/// pad/truncate, backward FFT) is delegated to `exec`.
template<typename Tplan, typename T, typename T0, typename Exec>
MRUTIL_NOINLINE void general_convolve(const fmav<T> &in, fmav<T> &out,
  const size_t axis, const vector<T0> &kernel, size_t nthreads,
  const Exec &exec)
  {
  std::shared_ptr<Tplan> plan1, plan2;
  size_t l_in=in.shape(axis), l_out=out.shape(axis);
  size_t l_min=std::min(l_in, l_out), l_max=std::max(l_in, l_out);
  // one kernel coefficient per frequency of the shorter transform
  MR_assert(kernel.size()==l_min/2+1, "bad kernel size");
  plan1 = get_plan<Tplan>(l_in);
  plan2 = get_plan<Tplan>(l_out);
  execParallel(
    util::thread_count(nthreads, in, axis, native_simd<T0>::size()),
    [&](Scheduler &sched) {
      constexpr auto vlen = native_simd<T0>::size();
      // scratch sized for the longer of the two transform lengths, so one
      // buffer serves both the forward and the backward transform
      auto storage = alloc_tmp_conv<T,T0>(in, axis, l_max);
      multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
#ifndef MRUTIL_NO_SIMD
      // vectorized path: process vlen independent 1D lines per iteration
      if (vlen>1)
        while (it.remaining()>=vlen)
          {
          it.advance(vlen);
          auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
          exec(it, in, out, tdatav, *plan1, *plan2, kernel);
          }
#endif
      // scalar path for the remaining (or all, without SIMD) lines
      while (it.remaining()>0)
        {
        it.advance(1);
        auto buf = reinterpret_cast<T *>(storage.data());
        exec(it, in, out, buf, *plan1, *plan2, kernel);
        }
      }); // end of parallel region
  }
/// Per-line executor for real-valued 1D convolution with resampling:
/// forward real FFT at the input length, kernel multiply in the frequency
/// domain, zero-pad (upsampling) or implicit truncation (downsampling),
/// then backward real FFT at the output length.
struct ExecConvR1
  {
  template <typename T0, typename T, size_t vlen> void operator() (
    const multi_iter<vlen> &it, const fmav<T0> &in, fmav<T0> &out,
    T * buf, const pocketfft_r<T0> &plan1, const pocketfft_r<T0> &plan2,
    const vector<T0> &kernel) const
    {
    size_t l_in = plan1.length(),
           l_out = plan2.length(),
           l_min = std::min(l_in, l_out);
    copy_input(it, in, buf);
    plan1.exec(buf, T0(1), true);   // forward real FFT (halfcomplex output)
    // (i+1)/2 maps the halfcomplex layout (r0, r1,i1, r2,i2, ...) to its
    // frequency index, so each real/imag pair gets the same kernel factor
    for (size_t i=0; i<l_min; ++i) buf[i]*=kernel[(i+1)/2];
    // zero-pad the spectrum when upsampling (loop is empty when l_out<=l_in;
    // downsampling simply ignores buf[l_out..l_in) in the backward transform)
    for (size_t i=l_in; i<l_out; ++i) buf[i] = T(0);
    plan2.exec(buf, T0(1), false);  // backward real FFT at output length
    copy_output(it, buf, out);
    }
  };
template<typename T> void convolve_1d(const fmav<T> &in,
fmav<T> &out, size_t axis, const vector<T> &kernel, size_t nthreads=1)
{
MR_assert(axis<in.ndim(), "bad axis number");
MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");
if (in.data()==out.data())
MR_assert(in.stride()==out.stride(), "strides mismatch");
for (size_t i=0; i<in.ndim(); ++i)
if (i!=axis)
MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
MR_assert(!((in.shape(axis)&1) || (out.shape(axis)&1)),
"input and output axis lengths must be even");
if (in.size()==0) return;
general_convolve<pocketfft_r<T>>(in, out, axis, kernel, nthreads,
ExecConvR1());
}
}
using detail_fft::convolve_1d;
namespace detail_interpol_ng {
using namespace std;
......@@ -41,78 +131,54 @@ template<typename T> class Interpolator
// NOTE(review): this span is a merged diff fragment — it interleaves the
// removed implementation (full-grid r2r_fftpack transforms with explicit
// correction-factor multiplication) with the added one (convolve_1d on a
// reduced (nphi, nphi0) scratch array).  `tmp` is declared twice, so this
// text does not compile as-is; only the convolve_1d-based lines belong to
// the new version.
void correct(mav<T,2> &arr, int spin)
{
// odd spins flip sign when a ring is mirrored to the second hemisphere
T sfct = (spin&1) ? -1 : 1;
mav<T,2> tmp({nphi,nphi});
tmp.apply([](T &v){v=0.;});
auto tmp0=tmp.template subarray<2>({0,0},{nphi0, nphi0});
fmav<T> ftmp0(tmp0);
for (size_t i=0; i<ntheta0; ++i)
for (size_t j=0; j<nphi0; ++j)
tmp0.v(i,j) = arr(i,j);
// extend to second half
mav<T,2> tmp({nphi,nphi0});
// copy and extend to second half
for (size_t j=0; j<nphi0; ++j)
tmp.v(0,j) = arr(0,j);
// mirror rings 1..ntheta0-2 to the second theta half, shifted by pi in phi
for (size_t i=1, i2=nphi0-1; i+1<ntheta0; ++i,--i2)
for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2)
{
if (j2>=nphi0) j2-=nphi0;
tmp0.v(i2,j) = sfct*tmp0(i,j2);
tmp.v(i,j2) = arr(i,j2);
tmp.v(i2,j) = sfct*tmp(i,j2);
}
// FFT to frequency domain on minimal grid
r2r_fftpack(ftmp0,ftmp0,{0,1},true,true,T(1./(nphi0*nphi0)),nthreads);
// correct amplitude at Nyquist frequency
for (size_t i=0; i<nphi0; ++i)
{
tmp0.v(i,nphi0-1)*=0.5;
tmp0.v(nphi0-1,i)*=0.5;
}
for (size_t j=0; j<nphi0; ++j)
tmp.v(ntheta0-1,j) = arr(ntheta0-1,j);
// per-frequency gridding-kernel correction factors, normalized by nphi0
auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
for (size_t i=0; i<nphi0; ++i)
for (size_t j=0; j<nphi0; ++j)
tmp0.v(i,j) *= fct[(i+1)/2] * fct[(j+1)/2];
auto tmp1=tmp.template subarray<2>({0,0},{nphi, nphi0});
fmav<T> ftmp1(tmp1);
// zero-padded FFT in theta direction
r2r_fftpack(ftmp1,ftmp1,{0},false,false,T(1),nthreads);
auto tmp2=tmp.template subarray<2>({0,0},{ntheta, nphi});
fmav<T> ftmp2(tmp2);
for (auto &f:fct) f/=nphi0;
fmav<T> ftmp(tmp);
// upsample in theta: nphi0 rings in, nphi rings out
fmav<T> ftmp0(tmp.template subarray<2>({0,0},{nphi0, nphi0}));
convolve_1d(ftmp0, ftmp, 0, fct, nthreads);
fmav<T> ftmp2(tmp.template subarray<2>({0,0},{ntheta, nphi0}));
fmav<T> farr(arr);
// zero-padded FFT in phi direction
// upsample in phi: nphi0 columns in, nphi columns out, written to arr
convolve_1d(ftmp2, farr, 1, fct, nthreads);
}
// NOTE(review): like `correct` above, this span is a merged diff fragment —
// the removed r2r_fftpack-based implementation and the added
// convolve_1d-based one are interleaved (`tmp` and `fct` are each declared
// twice), so the text does not compile as-is.  The new version is the
// adjoint of `correct`: it downsamples from the oversampled (ntheta, nphi)
// grid back to the minimal (ntheta0, nphi0) grid.
void decorrect(mav<T,2> &arr, int spin)
{
// odd spins flip sign when a ring is mirrored to the second hemisphere
T sfct = (spin&1) ? -1 : 1;
mav<T,2> tmp({nphi,nphi});
fmav<T> ftmp(tmp);
for (size_t i=0; i<ntheta; ++i)
for (size_t j=0; j<nphi; ++j)
tmp.v(i,j) = arr(i,j);
mav<T,2> tmp({nphi,nphi0});
// per-frequency gridding-kernel correction factors, normalized by nphi0
auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
for (auto &f:fct) f/=nphi0;
fmav<T> farr(arr);
fmav<T> ftmp2(tmp.template subarray<2>({0,0},{ntheta, nphi0}));
// downsample in phi: nphi columns in, nphi0 columns out
convolve_1d(farr, ftmp2, 1, fct, nthreads);
// extend to second half
for (size_t i=1, i2=nphi-1; i+1<ntheta; ++i,--i2)
for (size_t j=0,j2=nphi/2; j<nphi; ++j,++j2)
for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2)
{
if (j2>=nphi) j2-=nphi;
if (j2>=nphi0) j2-=nphi0;
tmp.v(i2,j) = sfct*tmp(i,j2);
}
r2r_fftpack(ftmp,ftmp,{1},true,true,T(1),nthreads);
auto tmp1=tmp.template subarray<2>({0,0},{nphi, nphi0});
fmav<T> ftmp1(tmp1);
r2r_fftpack(ftmp1,ftmp1,{0},true,true,T(1),nthreads);
auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
auto tmp0=tmp.template subarray<2>({0,0},{nphi0, nphi0});
fmav<T> ftmp0(tmp0);
for (size_t i=0; i<nphi0; ++i)
for (size_t j=0; j<nphi0; ++j)
tmp0.v(i,j) *= fct[(i+1)/2] * fct[(j+1)/2];
// FFT to (theta, phi) domain on minimal grid
r2r_fftpack(ftmp0,ftmp0,{0,1},false, false,T(1./(nphi0*nphi0)),nthreads);
fmav<T> ftmp(tmp);
// downsample in theta: nphi rings in, nphi0 rings out
fmav<T> ftmp0(tmp.template subarray<2>({0,0},{nphi0, nphi0}));
convolve_1d(ftmp, ftmp0, 0, fct, nthreads);
for (size_t j=0; j<nphi0; ++j)
{
tmp0.v(0,j)*=0.5;
tmp0.v(ntheta0-1,j)*=0.5;
}
// copy back to arr; pole rings carry weight 0.5 — presumably because they
// were double-counted by the hemispheric mirroring above; TODO confirm
for (size_t i=0; i<ntheta0; ++i)
arr.v(0,j) = 0.5*tmp(0,j);
for (size_t i=1; i+1<ntheta0; ++i)
for (size_t j=0; j<nphi0; ++j)
arr.v(i,j) = tmp0(i,j);
arr.v(i,j) = tmp(i,j);
for (size_t j=0; j<nphi0; ++j)
arr.v(ntheta0-1,j) = 0.5*tmp(ntheta0-1,j);
}
vector<size_t> getIdx(const mav<T,2> &ptg) const
......
......@@ -242,16 +242,28 @@ PYBIND11_MODULE(pyinterpol_ng, m)
m.doc() = pyinterpol_ng_DS;
py::class_<PyInterpolator<fptype>> (m, "PyInterpolator", pyinterpolator_DS)
using inter_d = PyInterpolator<double>;
py::class_<inter_d> (m, "PyInterpolator", pyinterpolator_DS)
.def(py::init<const py::array &, const py::array &, bool, int64_t, int64_t, fptype, fptype, int>(),
initnormal_DS, "sky"_a, "beam"_a, "separate"_a, "lmax"_a, "kmax"_a, "epsilon"_a, "ofactor"_a=fptype(1.5),
"nthreads"_a=0)
.def(py::init<int64_t, int64_t, int64_t, fptype, fptype, int>(), initadjoint_DS,
"lmax"_a, "kmax"_a, "ncomp"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), "nthreads"_a=0)
.def ("interpol", &PyInterpolator<fptype>::pyinterpol, interpol_DS, "ptg"_a)
.def ("deinterpol", &PyInterpolator<fptype>::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a)
.def ("getSlm", &PyInterpolator<fptype>::pygetSlm, getSlm_DS, "beam"_a)
.def ("support", &PyInterpolator<fptype>::support);
.def ("interpol", &inter_d::pyinterpol, interpol_DS, "ptg"_a)
.def ("deinterpol", &inter_d::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a)
.def ("getSlm", &inter_d::pygetSlm, getSlm_DS, "beam"_a)
.def ("support", &inter_d::support);
// using inter_f = PyInterpolator<float>;
// py::class_<inter_f> (m, "PyInterpolator_f", pyinterpolator_DS)
// .def(py::init<const py::array &, const py::array &, bool, int64_t, int64_t, fptype, fptype, int>(),
// initnormal_DS, "sky"_a, "beam"_a, "separate"_a, "lmax"_a, "kmax"_a, "epsilon"_a, "ofactor"_a=fptype(1.5),
// "nthreads"_a=0)
// .def(py::init<int64_t, int64_t, int64_t, fptype, fptype, int>(), initadjoint_DS,
// "lmax"_a, "kmax"_a, "ncomp"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), "nthreads"_a=0)
// .def ("interpol", &inter_f::pyinterpol, interpol_DS, "ptg"_a)
// .def ("deinterpol", &inter_f::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a)
// .def ("getSlm", &inter_f::pygetSlm, getSlm_DS, "beam"_a)
// .def ("support", &inter_f::support);
#if 1
m.def("rotate_alm", &pyrotate_alm<fptype>, "alm"_a, "lmax"_a, "psi"_a, "theta"_a,
"phi"_a);
......
......@@ -604,21 +604,6 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
auto tmpsize = axsize*((othersize>=vlen) ? vlen : 1);
return aligned_array<T>(tmpsize);
}
/// Allocates aligned scratch storage large enough for a multi-axis FFT pass:
/// for every requested axis it sizes a buffer of one axis length per SIMD
/// lane (scalar when too few lines exist to vectorize) and returns storage
/// for the largest such requirement.
template<typename T, typename T0> aligned_array<T> alloc_tmp
  (const fmav_info &info, const shape_t &axes)
  {
  constexpr auto vlen = native_simd<T0>::size();
  const size_t total = info.size();
  size_t bufelems = 0;
  for (auto ax : axes)
    {
    const size_t axlen = info.shape(ax);
    // number of independent 1D lines along this axis
    const size_t nlines = total/axlen;
    const size_t need = (nlines>=vlen) ? axlen*vlen : axlen;
    if (need>bufelems) bufelems = need;
    }
  return aligned_array<T>(bufelems);
  }
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment