Commit 4b36a29f authored by Martin Reinecke's avatar Martin Reinecke
Browse files

Merge branch 'convolution_work' into 'master'

Convolution work

See merge request mtr/cxxbase!7
parents 474ffe65 ed1b7cf7
...@@ -42,11 +42,13 @@ def convolve(alm1, alm2, lmax): ...@@ -42,11 +42,13 @@ def convolve(alm1, alm2, lmax):
lmax=60 lmax=60
kmax=13 kmax=13
ncomp=1 ncomp=1
separate=True separate=False
ncomp2 = ncomp if separate else 1 nptg = 1000000
epsilon = 1e-4 epsilon = 1e-4
ofactor = 2 ofactor = 1.5
nthreads = 0 nthreads = 0 # use as many threads as available
ncomp2 = ncomp if separate else 1
# get random sky a_lm # get random sky a_lm
# the a_lm arrays follow the same conventions as those in healpy # the a_lm arrays follow the same conventions as those in healpy
...@@ -93,13 +95,14 @@ plt.imshow(bar2-bar[:,:,0]) ...@@ -93,13 +95,14 @@ plt.imshow(bar2-bar[:,:,0])
plt.show() plt.show()
ptg=np.random.uniform(0.,1.,3*1000000).reshape(1000000,3) ptg=np.random.uniform(0.,1.,3*nptg).reshape(nptg,3)
ptg[:,0]*=np.pi ptg[:,0]*=np.pi
ptg[:,1]*=2*np.pi ptg[:,1]*=2*np.pi
ptg[:,2]*=2*np.pi ptg[:,2]*=2*np.pi
#foo = pyinterpol_ng.PyInterpolator(slm,blm,separate,lmax, kmax, epsilon=1e-6, nthreads=2) #foo = pyinterpol_ng.PyInterpolator(slm,blm,separate,lmax, kmax, epsilon=1e-6, nthreads=2)
t0=time.time() t0=time.time()
bar=foo.interpol(ptg) bar=foo.interpol(ptg)
del foo
print("interpolation time: ", time.time()-t0) print("interpolation time: ", time.time()-t0)
fake = np.random.uniform(0.,1., (ptg.shape[0],ncomp2)) fake = np.random.uniform(0.,1., (ptg.shape[0],ncomp2))
foo2 = pyinterpol_ng.PyInterpolator(lmax, kmax, ncomp2, epsilon=epsilon, ofactor=ofactor, nthreads=nthreads) foo2 = pyinterpol_ng.PyInterpolator(lmax, kmax, ncomp2, epsilon=epsilon, ofactor=ofactor, nthreads=nthreads)
......
...@@ -22,6 +22,96 @@ ...@@ -22,6 +22,96 @@
namespace mr { namespace mr {
namespace detail_fft {
using std::vector;
/// Allocates aligned scratch storage for one convolution pass.
/// `len` is the number of elements a single 1-D line needs; when enough
/// independent lines exist to fill a SIMD batch, the buffer is enlarged
/// by the native SIMD width of T0 so whole batches can be processed at once.
template<typename T, typename T0> aligned_array<T> alloc_tmp_conv
  (const fmav_info &info, size_t axis, size_t len)
  {
  constexpr auto vlen = native_simd<T0>::size();
  const size_t nlines = info.size()/info.shape(axis);
  // vectorized scratch only pays off if there are at least vlen lines
  const size_t batch = (nlines>=vlen) ? vlen : 1;
  return aligned_array<T>(len*batch);
  }
// Generic driver for 1-D convolution/resampling along one axis of an fmav.
// Builds FFT plans for the input and output axis lengths, then walks all
// 1-D lines of the array in parallel, handing each line (or SIMD batch of
// lines) to `exec`, which performs forward FFT, kernel multiplication,
// zero-padding/truncation and backward FFT.
template<typename Tplan, typename T, typename T0, typename Exec>
MRUTIL_NOINLINE void general_convolve(const fmav<T> &in, fmav<T> &out,
const size_t axis, const vector<T0> &kernel, size_t nthreads,
const Exec &exec)
{
std::shared_ptr<Tplan> plan1, plan2;
size_t l_in=in.shape(axis), l_out=out.shape(axis);
size_t l_min=std::min(l_in, l_out), l_max=std::max(l_in, l_out);
// kernel holds one factor per nonnegative frequency of the shorter axis
MR_assert(kernel.size()==l_min/2+1, "bad kernel size");
plan1 = get_plan<Tplan>(l_in);
plan2 = get_plan<Tplan>(l_out);
execParallel(
util::thread_count(nthreads, in, axis, native_simd<T0>::size()),
[&](Scheduler &sched) {
constexpr auto vlen = native_simd<T0>::size();
// scratch sized for the longer of the two axis lengths
auto storage = alloc_tmp_conv<T,T0>(in, axis, l_max);
multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
#ifndef MRUTIL_NO_SIMD
// process vlen lines at a time through a SIMD-vector-typed buffer
if (vlen>1)
while (it.remaining()>=vlen)
{
it.advance(vlen);
auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
exec(it, in, out, tdatav, *plan1, *plan2, kernel);
}
#endif
// scalar tail: remaining lines one at a time
while (it.remaining()>0)
{
it.advance(1);
auto buf = reinterpret_cast<T *>(storage.data());
exec(it, in, out, buf, *plan1, *plan2, kernel);
}
}); // end of parallel region
}
struct ExecConvR1
{
template <typename T0, typename T, size_t vlen> void operator() (
const multi_iter<vlen> &it, const fmav<T0> &in, fmav<T0> &out,
T * buf, const pocketfft_r<T0> &plan1, const pocketfft_r<T0> &plan2,
const vector<T0> &kernel) const
{
size_t l_in = plan1.length(),
l_out = plan2.length(),
l_min = std::min(l_in, l_out);
copy_input(it, in, buf);
plan1.exec(buf, T0(1), true);
for (size_t i=0; i<l_min; ++i) buf[i]*=kernel[(i+1)/2];
for (size_t i=l_in; i<l_out; ++i) buf[i] = T(0);
plan2.exec(buf, T0(1), false);
copy_output(it, buf, out);
}
};
template<typename T> void convolve_1d(const fmav<T> &in,
fmav<T> &out, size_t axis, const vector<T> &kernel, size_t nthreads=1)
{
MR_assert(axis<in.ndim(), "bad axis number");
MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");
if (in.data()==out.data())
MR_assert(in.stride()==out.stride(), "strides mismatch");
for (size_t i=0; i<in.ndim(); ++i)
if (i!=axis)
MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
MR_assert(!((in.shape(axis)&1) || (out.shape(axis)&1)),
"input and output axis lengths must be even");
if (in.size()==0) return;
general_convolve<pocketfft_r<T>>(in, out, axis, kernel, nthreads,
ExecConvR1());
}
}
using detail_fft::convolve_1d;
namespace detail_interpol_ng { namespace detail_interpol_ng {
using namespace std; using namespace std;
...@@ -41,78 +131,54 @@ template<typename T> class Interpolator ...@@ -41,78 +131,54 @@ template<typename T> class Interpolator
void correct(mav<T,2> &arr, int spin) void correct(mav<T,2> &arr, int spin)
{ {
T sfct = (spin&1) ? -1 : 1; T sfct = (spin&1) ? -1 : 1;
mav<T,2> tmp({nphi,nphi}); mav<T,2> tmp({nphi,nphi0});
tmp.apply([](T &v){v=0.;}); // copy and extend to second half
auto tmp0=tmp.template subarray<2>({0,0},{nphi0, nphi0}); for (size_t j=0; j<nphi0; ++j)
fmav<T> ftmp0(tmp0); tmp.v(0,j) = arr(0,j);
for (size_t i=0; i<ntheta0; ++i)
for (size_t j=0; j<nphi0; ++j)
tmp0.v(i,j) = arr(i,j);
// extend to second half
for (size_t i=1, i2=nphi0-1; i+1<ntheta0; ++i,--i2) for (size_t i=1, i2=nphi0-1; i+1<ntheta0; ++i,--i2)
for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2) for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2)
{ {
if (j2>=nphi0) j2-=nphi0; if (j2>=nphi0) j2-=nphi0;
tmp0.v(i2,j) = sfct*tmp0(i,j2); tmp.v(i,j2) = arr(i,j2);
tmp.v(i2,j) = sfct*tmp(i,j2);
} }
// FFT to frequency domain on minimal grid for (size_t j=0; j<nphi0; ++j)
r2r_fftpack(ftmp0,ftmp0,{0,1},true,true,T(1./(nphi0*nphi0)),nthreads); tmp.v(ntheta0-1,j) = arr(ntheta0-1,j);
// correct amplitude at Nyquist frequency
for (size_t i=0; i<nphi0; ++i)
{
tmp0.v(i,nphi0-1)*=0.5;
tmp0.v(nphi0-1,i)*=0.5;
}
auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads); auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
for (size_t i=0; i<nphi0; ++i) for (auto &f:fct) f/=nphi0;
for (size_t j=0; j<nphi0; ++j) fmav<T> ftmp(tmp);
tmp0.v(i,j) *= fct[(i+1)/2] * fct[(j+1)/2]; fmav<T> ftmp0(tmp.template subarray<2>({0,0},{nphi0, nphi0}));
auto tmp1=tmp.template subarray<2>({0,0},{nphi, nphi0}); convolve_1d(ftmp0, ftmp, 0, fct, nthreads);
fmav<T> ftmp1(tmp1); fmav<T> ftmp2(tmp.template subarray<2>({0,0},{ntheta, nphi0}));
// zero-padded FFT in theta direction
r2r_fftpack(ftmp1,ftmp1,{0},false,false,T(1),nthreads);
auto tmp2=tmp.template subarray<2>({0,0},{ntheta, nphi});
fmav<T> ftmp2(tmp2);
fmav<T> farr(arr); fmav<T> farr(arr);
// zero-padded FFT in phi direction convolve_1d(ftmp2, farr, 1, fct, nthreads);
r2r_fftpack(ftmp2,farr,{1},false,false,T(1),nthreads);
} }
void decorrect(mav<T,2> &arr, int spin) void decorrect(mav<T,2> &arr, int spin)
{ {
T sfct = (spin&1) ? -1 : 1; T sfct = (spin&1) ? -1 : 1;
mav<T,2> tmp({nphi,nphi}); mav<T,2> tmp({nphi,nphi0});
fmav<T> ftmp(tmp); auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
for (auto &f:fct) f/=nphi0;
for (size_t i=0; i<ntheta; ++i) fmav<T> farr(arr);
for (size_t j=0; j<nphi; ++j) fmav<T> ftmp2(tmp.template subarray<2>({0,0},{ntheta, nphi0}));
tmp.v(i,j) = arr(i,j); convolve_1d(farr, ftmp2, 1, fct, nthreads);
// extend to second half // extend to second half
for (size_t i=1, i2=nphi-1; i+1<ntheta; ++i,--i2) for (size_t i=1, i2=nphi-1; i+1<ntheta; ++i,--i2)
for (size_t j=0,j2=nphi/2; j<nphi; ++j,++j2) for (size_t j=0,j2=nphi0/2; j<nphi0; ++j,++j2)
{ {
if (j2>=nphi) j2-=nphi; if (j2>=nphi0) j2-=nphi0;
tmp.v(i2,j) = sfct*tmp(i,j2); tmp.v(i2,j) = sfct*tmp(i,j2);
} }
r2r_fftpack(ftmp,ftmp,{1},true,true,T(1),nthreads); fmav<T> ftmp(tmp);
auto tmp1=tmp.template subarray<2>({0,0},{nphi, nphi0}); fmav<T> ftmp0(tmp.template subarray<2>({0,0},{nphi0, nphi0}));
fmav<T> ftmp1(tmp1); convolve_1d(ftmp, ftmp0, 0, fct, nthreads);
r2r_fftpack(ftmp1,ftmp1,{0},true,true,T(1),nthreads);
auto fct = kernel.correction_factors(nphi, nphi0/2+1, nthreads);
auto tmp0=tmp.template subarray<2>({0,0},{nphi0, nphi0});
fmav<T> ftmp0(tmp0);
for (size_t i=0; i<nphi0; ++i)
for (size_t j=0; j<nphi0; ++j)
tmp0.v(i,j) *= fct[(i+1)/2] * fct[(j+1)/2];
// FFT to (theta, phi) domain on minimal grid
r2r_fftpack(ftmp0,ftmp0,{0,1},false, false,T(1./(nphi0*nphi0)),nthreads);
for (size_t j=0; j<nphi0; ++j) for (size_t j=0; j<nphi0; ++j)
{ arr.v(0,j) = 0.5*tmp(0,j);
tmp0.v(0,j)*=0.5; for (size_t i=1; i+1<ntheta0; ++i)
tmp0.v(ntheta0-1,j)*=0.5;
}
for (size_t i=0; i<ntheta0; ++i)
for (size_t j=0; j<nphi0; ++j) for (size_t j=0; j<nphi0; ++j)
arr.v(i,j) = tmp0(i,j); arr.v(i,j) = tmp(i,j);
for (size_t j=0; j<nphi0; ++j)
arr.v(ntheta0-1,j) = 0.5*tmp(ntheta0-1,j);
} }
vector<size_t> getIdx(const mav<T,2> &ptg) const vector<size_t> getIdx(const mav<T,2> &ptg) const
......
...@@ -242,16 +242,28 @@ PYBIND11_MODULE(pyinterpol_ng, m) ...@@ -242,16 +242,28 @@ PYBIND11_MODULE(pyinterpol_ng, m)
m.doc() = pyinterpol_ng_DS; m.doc() = pyinterpol_ng_DS;
py::class_<PyInterpolator<fptype>> (m, "PyInterpolator", pyinterpolator_DS) using inter_d = PyInterpolator<double>;
py::class_<inter_d> (m, "PyInterpolator", pyinterpolator_DS)
.def(py::init<const py::array &, const py::array &, bool, int64_t, int64_t, fptype, fptype, int>(), .def(py::init<const py::array &, const py::array &, bool, int64_t, int64_t, fptype, fptype, int>(),
initnormal_DS, "sky"_a, "beam"_a, "separate"_a, "lmax"_a, "kmax"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), initnormal_DS, "sky"_a, "beam"_a, "separate"_a, "lmax"_a, "kmax"_a, "epsilon"_a, "ofactor"_a=fptype(1.5),
"nthreads"_a=0) "nthreads"_a=0)
.def(py::init<int64_t, int64_t, int64_t, fptype, fptype, int>(), initadjoint_DS, .def(py::init<int64_t, int64_t, int64_t, fptype, fptype, int>(), initadjoint_DS,
"lmax"_a, "kmax"_a, "ncomp"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), "nthreads"_a=0) "lmax"_a, "kmax"_a, "ncomp"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), "nthreads"_a=0)
.def ("interpol", &PyInterpolator<fptype>::pyinterpol, interpol_DS, "ptg"_a) .def ("interpol", &inter_d::pyinterpol, interpol_DS, "ptg"_a)
.def ("deinterpol", &PyInterpolator<fptype>::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a) .def ("deinterpol", &inter_d::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a)
.def ("getSlm", &PyInterpolator<fptype>::pygetSlm, getSlm_DS, "beam"_a) .def ("getSlm", &inter_d::pygetSlm, getSlm_DS, "beam"_a)
.def ("support", &PyInterpolator<fptype>::support); .def ("support", &inter_d::support);
// using inter_f = PyInterpolator<float>;
// py::class_<inter_f> (m, "PyInterpolator_f", pyinterpolator_DS)
// .def(py::init<const py::array &, const py::array &, bool, int64_t, int64_t, fptype, fptype, int>(),
// initnormal_DS, "sky"_a, "beam"_a, "separate"_a, "lmax"_a, "kmax"_a, "epsilon"_a, "ofactor"_a=fptype(1.5),
// "nthreads"_a=0)
// .def(py::init<int64_t, int64_t, int64_t, fptype, fptype, int>(), initadjoint_DS,
// "lmax"_a, "kmax"_a, "ncomp"_a, "epsilon"_a, "ofactor"_a=fptype(1.5), "nthreads"_a=0)
// .def ("interpol", &inter_f::pyinterpol, interpol_DS, "ptg"_a)
// .def ("deinterpol", &inter_f::pydeinterpol, deinterpol_DS, "ptg"_a, "data"_a)
// .def ("getSlm", &inter_f::pygetSlm, getSlm_DS, "beam"_a)
// .def ("support", &inter_f::support);
#if 1 #if 1
m.def("rotate_alm", &pyrotate_alm<fptype>, "alm"_a, "lmax"_a, "psi"_a, "theta"_a, m.def("rotate_alm", &pyrotate_alm<fptype>, "alm"_a, "lmax"_a, "psi"_a, "theta"_a,
"phi"_a); "phi"_a);
......
...@@ -604,21 +604,6 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp ...@@ -604,21 +604,6 @@ template<typename T, typename T0> aligned_array<T> alloc_tmp
auto tmpsize = axsize*((othersize>=vlen) ? vlen : 1); auto tmpsize = axsize*((othersize>=vlen) ? vlen : 1);
return aligned_array<T>(tmpsize); return aligned_array<T>(tmpsize);
} }
/// Allocates aligned scratch storage large enough for the biggest of the
/// requested transform axes; the per-axis size is widened by the native
/// SIMD width of T0 whenever enough independent lines exist to batch.
template<typename T, typename T0> aligned_array<T> alloc_tmp
  (const fmav_info &info, const shape_t &axes)
  {
  constexpr auto vlen = native_simd<T0>::size();
  const size_t total = info.size();
  size_t need = 0;
  for (auto ax : axes)
    {
    const auto axlen = info.shape(ax);
    const auto nlines = total/axlen;
    const auto cand = axlen*((nlines>=vlen) ? vlen : 1);
    need = std::max(need, cand);
    }
  return aligned_array<T>(need);
  }
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it, template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst) const fmav<Cmplx<T>> &src, Cmplx<native_simd<T>> *MRUTIL_RESTRICT dst)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment