Commit d8b05551 authored by Martin Reinecke's avatar Martin Reinecke
Browse files

Merge branch 'auto_choice' into 'ducc0'

gridder: add option to choose nu and nv automatically

See merge request !31
parents 7dab1b43 bf17b1e7
Pipeline #80782 passed with stages
in 15 minutes and 21 seconds
This diff is collapsed.
......@@ -203,8 +203,6 @@ py::array py_transpose(const py::array &in, py::array &out)
MR_fail("unsupported datatype");
}
void py_set_thread_pool_size(size_t new_pool_size)
{ set_pool_size(new_pool_size); }
const char *misc_DS = R"""(
Various unsorted utilities
......@@ -227,8 +225,6 @@ void add_misc(py::module &msup)
m.def("ascontiguousarray",&py_ascontiguousarray, "in"_a);
m.def("transpose",&py_transpose, "in"_a, "out"_a);
m.def("set_thread_pool_size",&py_set_thread_pool_size, "new_pool_size"_a);
}
}
......
......@@ -67,14 +67,14 @@ def explicit_gridder(uvw, freq, ms, wgt, nxdirty, nydirty, xpixsize, ypixsize,
@pmp("nxdirty", (30, 128))
@pmp("nydirty", (128, 250))
@pmp("ofactor", (1.2, 1.5, 1.7, 2.0))
@pmp("ofactor", (0, 1.2, 1.5, 1.7, 2.0))
@pmp("nrow", (2, 27))
@pmp("nchan", (1, 5))
@pmp("epsilon", (1e-1, 1e-3, 1e-5))
@pmp("singleprec", (True, False))
@pmp("wstacking", (True, False))
@pmp("use_wgt", (True, False))
@pmp("nthreads", (1, 2))
@pmp("nthreads", (1, 2, 7))
def test_adjointness_ms2dirty(nxdirty, nydirty, ofactor, nrow, nchan, epsilon,
singleprec, wstacking, use_wgt, nthreads):
if singleprec and epsilon < 5e-5:
......@@ -93,6 +93,8 @@ def test_adjointness_ms2dirty(nxdirty, nydirty, ofactor, nrow, nchan, epsilon,
nu += 1
if nv & 1:
nv += 1
if ofactor == 0:
nu = nv = 0
if singleprec:
ms = ms.astype("c8")
dirty = dirty.astype("f4")
......@@ -110,14 +112,14 @@ def test_adjointness_ms2dirty(nxdirty, nydirty, ofactor, nrow, nchan, epsilon,
@pmp('nxdirty', [16, 64])
@pmp('nydirty', [64])
@pmp('ofactor', [1.2, 1.4, 1.7, 2])
@pmp('ofactor', [0, 1.2, 1.4, 1.7, 2])
@pmp("nrow", (1, 2, 27))
@pmp("nchan", (1, 5))
@pmp("epsilon", (1e-2, 1e-3, 1e-4, 1e-7))
@pmp("singleprec", (False,))
@pmp("wstacking", (False, True))
@pmp("use_wgt", (True,))
@pmp("nthreads", (1, 2))
@pmp("nthreads", (1, 2, 7))
@pmp("fov", (1., 20.))
def test_ms2dirty_against_wdft2(nxdirty, nydirty, ofactor, nrow, nchan, epsilon, singleprec, wstacking, use_wgt, fov, nthreads):
if singleprec and epsilon < 5e-5:
......@@ -136,6 +138,8 @@ def test_ms2dirty_against_wdft2(nxdirty, nydirty, ofactor, nrow, nchan, epsilon,
nu += 1
if nv & 1:
nv += 1
if ofactor == 0:
nu = nv = 0
if singleprec:
ms = ms.astype("c8")
if wgt is not None:
......
......@@ -35,7 +35,7 @@ namespace py = pybind11;
auto None = py::none();
template<typename T> py::array ms2dirty2(const py::array &uvw_,
const py::array &freq_, const py::array &ms_, const py::object &wgt_,
const py::array &freq_, const py::array &ms_, const py::object &wgt_, const py::object &mask_,
size_t npix_x, size_t npix_y, double pixsize_x, double pixsize_y, size_t nu,
size_t nv, double epsilon, bool do_wstacking, size_t nthreads,
size_t verbosity)
......@@ -45,11 +45,13 @@ template<typename T> py::array ms2dirty2(const py::array &uvw_,
auto ms = to_mav<complex<T>,2>(ms_, false);
auto wgt = get_optional_const_Pyarr<T>(wgt_, {ms.shape(0),ms.shape(1)});
auto wgt2 = to_mav<T,2>(wgt, false);
auto mask = get_optional_const_Pyarr<uint8_t>(mask_, {uvw.shape(0),freq.shape(0)});
auto mask2 = to_mav<uint8_t,2>(mask, false);
auto dirty = make_Pyarr<T>({npix_x,npix_y});
auto dirty2 = to_mav<T,2>(dirty, true);
{
py::gil_scoped_release release;
ms2dirty(uvw,freq,ms,wgt2,pixsize_x,pixsize_y,nu,nv,epsilon,
ms2dirty(uvw,freq,ms,wgt2,mask2,pixsize_x,pixsize_y,nu,nv,epsilon,
do_wstacking,nthreads,dirty2,verbosity);
}
return move(dirty);
......@@ -58,13 +60,13 @@ py::array Pyms2dirty(const py::array &uvw,
const py::array &freq, const py::array &ms, const py::object &wgt,
size_t npix_x, size_t npix_y, double pixsize_x, double pixsize_y, size_t nu,
size_t nv, double epsilon, bool do_wstacking, size_t nthreads,
size_t verbosity)
size_t verbosity, const py::object &mask)
{
if (isPyarr<complex<float>>(ms))
return ms2dirty2<float>(uvw, freq, ms, wgt, npix_x, npix_y,
return ms2dirty2<float>(uvw, freq, ms, wgt, mask, npix_x, npix_y,
pixsize_x, pixsize_y, nu, nv, epsilon, do_wstacking, nthreads, verbosity);
if (isPyarr<complex<double>>(ms))
return ms2dirty2<double>(uvw, freq, ms, wgt, npix_x, npix_y,
return ms2dirty2<double>(uvw, freq, ms, wgt, mask, npix_x, npix_y,
pixsize_x, pixsize_y, nu, nv, epsilon, do_wstacking, nthreads, verbosity);
MR_fail("type matching failed: 'ms' has neither type 'c8' nor 'c16'");
}
......@@ -93,7 +95,9 @@ nu, nv: int
oversampling values lie between 1.5 and 2.
Increasing the oversampling factor decreases the kernel support width
required for the desired accuracy, so it typically reduces run-time; on the
other hand, this will increase memory consumption.
other hand, this will increase memory consumption.
If at least one of these two values is 0, the library will automatically
pick values that result in a fast computation.
epsilon: float
accuracy at which the computation should be done. Must be larger than 2e-13.
If `ms` has type np.complex64, it must be larger than 1e-5.
......@@ -106,6 +110,8 @@ verbosity: int
0: no output
1: some output
2: detailed output
mask: np.array((nrows, nchan), dtype=np.uint8), optional
If present, only visibilities are processed for which mask!=0
Returns
=======
......@@ -114,7 +120,7 @@ np.array((nxdirty, nydirty), dtype=float of same precision as `ms`)
)""";
template<typename T> py::array dirty2ms2(const py::array &uvw_,
const py::array &freq_, const py::array &dirty_, const py::object &wgt_,
const py::array &freq_, const py::array &dirty_, const py::object &wgt_, const py::object &mask_,
double pixsize_x, double pixsize_y, size_t nu, size_t nv, double epsilon,
bool do_wstacking, size_t nthreads, size_t verbosity)
{
......@@ -123,11 +129,13 @@ template<typename T> py::array dirty2ms2(const py::array &uvw_,
auto dirty = to_mav<T,2>(dirty_, false);
auto wgt = get_optional_const_Pyarr<T>(wgt_, {uvw.shape(0),freq.shape(0)});
auto wgt2 = to_mav<T,2>(wgt, false);
auto mask = get_optional_const_Pyarr<uint8_t>(mask_, {uvw.shape(0),freq.shape(0)});
auto mask2 = to_mav<uint8_t,2>(mask, false);
auto ms = make_Pyarr<complex<T>>({uvw.shape(0),freq.shape(0)});
auto ms2 = to_mav<complex<T>,2>(ms, true);
{
py::gil_scoped_release release;
dirty2ms(uvw,freq,dirty,wgt2,pixsize_x,pixsize_y,nu,nv,epsilon,
dirty2ms(uvw,freq,dirty,wgt2,mask2,pixsize_x,pixsize_y,nu,nv,epsilon,
do_wstacking,nthreads,ms2,verbosity);
}
return move(ms);
......@@ -135,13 +143,13 @@ template<typename T> py::array dirty2ms2(const py::array &uvw_,
py::array Pydirty2ms(const py::array &uvw,
const py::array &freq, const py::array &dirty, const py::object &wgt,
double pixsize_x, double pixsize_y, size_t nu, size_t nv, double epsilon,
bool do_wstacking, size_t nthreads, size_t verbosity)
bool do_wstacking, size_t nthreads, size_t verbosity, const py::object &mask)
{
if (isPyarr<float>(dirty))
return dirty2ms2<float>(uvw, freq, dirty, wgt,
return dirty2ms2<float>(uvw, freq, dirty, wgt, mask,
pixsize_x, pixsize_y, nu, nv, epsilon, do_wstacking, nthreads, verbosity);
if (isPyarr<double>(dirty))
return dirty2ms2<double>(uvw, freq, dirty, wgt,
return dirty2ms2<double>(uvw, freq, dirty, wgt, mask,
pixsize_x, pixsize_y, nu, nv, epsilon, do_wstacking, nthreads, verbosity);
MR_fail("type matching failed: 'dirty' has neither type 'f4' nor 'f8'");
}
......@@ -168,7 +176,9 @@ nu, nv: int
oversampling values lie between 1.5 and 2.
Increasing the oversampling factor decreases the kernel support width
required for the desired accuracy, so it typically reduces run-time; on the
other hand, this will increase memory consumption.
other hand, this will increase memory consumption.
If at least one of these two values is 0, the library will automatically
pick values that result in a fast computation.
epsilon: float
accuracy at which the computation should be done. Must be larger than 2e-13.
If `dirty` has type np.float32, it must be larger than 1e-5.
......@@ -181,6 +191,8 @@ verbosity: int
0: no output
1: some output
2: detailed output
mask: np.array((nrows, nchan), dtype=np.uint8), optional
If present, only visibilities are processed for which mask!=0
Returns
=======
......@@ -195,10 +207,10 @@ void add_wgridder(py::module &msup)
m.def("ms2dirty", &Pyms2dirty, ms2dirty_DS, "uvw"_a, "freq"_a, "ms"_a,
"wgt"_a=None, "npix_x"_a, "npix_y"_a, "pixsize_x"_a, "pixsize_y"_a, "nu"_a, "nv"_a,
"epsilon"_a, "do_wstacking"_a=false, "nthreads"_a=1, "verbosity"_a=0);
"epsilon"_a, "do_wstacking"_a=false, "nthreads"_a=1, "verbosity"_a=0, "mask"_a=None);
m.def("dirty2ms", &Pydirty2ms, dirty2ms_DS, "uvw"_a, "freq"_a, "dirty"_a,
"wgt"_a=None, "pixsize_x"_a, "pixsize_y"_a, "nu"_a, "nv"_a, "epsilon"_a,
"do_wstacking"_a=false, "nthreads"_a=1, "verbosity"_a=0);
"do_wstacking"_a=false, "nthreads"_a=1, "verbosity"_a=0, "mask"_a=None);
}
}
......
......@@ -29,7 +29,6 @@
#include <queue>
#include <atomic>
#include <vector>
#include <memory>
#if __has_include(<pthread.h>)
#include <pthread.h>
#endif
......@@ -41,7 +40,9 @@ namespace detail_threading {
#ifndef DUCC0_NO_THREADING
std::atomic<size_t> default_nthreads_(std::max<size_t>(1, std::thread::hardware_concurrency()));
static const size_t max_threads_ = std::max<size_t>(1, std::thread::hardware_concurrency());
std::atomic<size_t> default_nthreads_(max_threads_);
size_t get_default_nthreads()
{ return default_nthreads_; }
......@@ -49,6 +50,8 @@ size_t get_default_nthreads()
void set_default_nthreads(size_t new_default_nthreads)
{ default_nthreads_ = std::max<size_t>(1, new_default_nthreads); }
size_t max_threads() { return max_threads_; }
class latch
{
std::atomic<size_t> num_left_;
......@@ -152,9 +155,9 @@ class thread_pool
threads_(nthreads)
{ create_threads(); }
~thread_pool() { shutdown(); }
thread_pool(): thread_pool(max_threads_) {}
size_t size() const { return threads_.size(); }
~thread_pool() { shutdown(); }
void submit(std::function<void()> work)
{
......@@ -176,12 +179,9 @@ class thread_pool
}
};
thread_pool &get_pool(size_t nthreads=0)
inline thread_pool &get_pool()
{
static std::unique_ptr<thread_pool> pool;
if ((!pool) && (nthreads==0)) nthreads=default_nthreads_;
if ((!pool) || ((nthreads!=0) && (nthreads!=pool->size()))) // resize
pool = std::make_unique<thread_pool>(nthreads);
static thread_pool pool;
#if __has_include(<pthread.h>)
static std::once_flag f;
call_once(f,
......@@ -194,12 +194,9 @@ thread_pool &get_pool(size_t nthreads=0)
});
#endif
return *pool;
return pool;
}
void set_pool_size(size_t new_pool_size)
{ get_pool(new_pool_size); }
class Distribution
{
private:
......@@ -320,7 +317,7 @@ void Distribution::thread_map(std::function<void(Scheduler &)> f)
return;
}
auto &pool = get_pool();
auto & pool = get_pool();
latch counter(nthreads_);
std::exception_ptr ex;
std::mutex ex_mut;
......
......@@ -47,11 +47,10 @@ class Scheduler
virtual Range getNext() = 0;
};
size_t max_threads();
void set_default_nthreads(size_t new_default_nthreads);
size_t get_default_nthreads();
void set_pool_size(size_t new_pool_size);
void execSingle(size_t nwork,
std::function<void(Scheduler &)> func);
void execStatic(size_t nwork, size_t nthreads, size_t chunksize,
......@@ -64,7 +63,7 @@ void execParallel(size_t nthreads, std::function<void(Scheduler &)> func);
} // end of namespace detail_threading
using detail_threading::set_pool_size;
using detail_threading::max_threads;
using detail_threading::get_default_nthreads;
using detail_threading::set_default_nthreads;
using detail_threading::Scheduler;
......
......@@ -24,16 +24,23 @@
#include <chrono>
#include <string>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <map>
#include "ducc0/infra/error_handling.h"
namespace ducc0 {
namespace detail_timers {
using namespace std;
class SimpleTimer
{
private:
using clock = std::chrono::steady_clock;
using clock = chrono::steady_clock;
clock::time_point starttime;
public:
......@@ -42,27 +49,105 @@ class SimpleTimer
void reset()
{ starttime = clock::now(); }
double operator()() const
{
return std::chrono::duration<double>(clock::now() - starttime).count();
}
{ return chrono::duration<double>(clock::now() - starttime).count(); }
};
class TimerHierarchy
{
private:
using clock = std::chrono::steady_clock;
using clock = chrono::steady_clock;
class tstack_node
{
private:
using maptype = map<string,tstack_node>;
using Tipair = pair<maptype::const_iterator,double>;
public:
tstack_node *parent;
string name;
double accTime;
std::map<std::string,tstack_node> child;
maptype child;
tstack_node(tstack_node *parent_)
: parent(parent_), accTime(0.) {}
private:
double full_acc() const
{
double t_own = accTime;
for (const auto &nd: child)
t_own += nd.second.full_acc();
return t_own;
}
double add_timings(const std::string &prefix,
std::map<std::string, double> &res) const
size_t max_namelen() const
{
auto res=name.length();
for (const auto &ch: child)
res=max(res,ch.second.max_namelen());
return res;
}
static void floatformat(double val, size_t pre, size_t post, ostream &os)
{
size_t fct=1;
for (size_t i=0; i<post; ++i, fct*=10);
os << setw(pre) << int(val) << "." << setw(post) << setfill('0')
<< int((val-int(val))*fct+0.5) << setfill(' ');
}
static void printline(const string &indent, int twidth, int slen,
const string &name, double val, double total,
ostream &os)
{
os << indent << "+- " << name << setw(slen+1-name.length()) << ":";
floatformat(100*val/total, 3, 2, os);
os << "% (";
floatformat(val, twidth-5, 4, os);
os << "s)\n";
}
void report(const string &indent, int twidth, int slen, ostream &os) const
{
double total=full_acc();
vector<Tipair> tmp;
for (auto it=child.cbegin(); it!=child.cend(); ++it)
tmp.push_back(make_pair(it, it->second.full_acc()));
if (tmp.size()>0)
{
sort(tmp.begin(),tmp.end(),
[](const Tipair &a, const Tipair &b){ return a.second>b.second; });
double tsum=0;
os << indent << "|\n";
for (unsigned i=0; i<tmp.size(); ++i)
{
printline(indent, twidth, slen, tmp[i].first->first, tmp[i].second, total, os);
(tmp[i].first->second).report(indent+"| ",twidth,slen,os);
tsum+=tmp[i].second;
}
printline(indent, twidth, slen, "<unaccounted>", total-tsum, total, os);
if (indent!="") os << indent << "\n";
}
}
public:
tstack_node(const string &name_, tstack_node *parent_=nullptr)
: parent(parent_), name(name_), accTime(0.) {}
void report(ostream &os) const
{
auto slen=string("<unaccounted>").size();
slen = max(slen, max_namelen());
double total=full_acc();
os << "\nTotal wall clock time for " << name << ": " << setprecision(4) << total << "s\n";
// printf("\nTotal wall clock time for '%s': %1.4fs\n",name.c_str(),total);
int logtime=max(1,int(log10(total)+1));
report("",logtime+5,slen, os);
}
void addTime(double dt)
{ accTime += dt; }
double add_timings(const string &prefix,
map<string, double> &res) const
{
double t_own = accTime;
for (const auto &nd: child)
......@@ -79,26 +164,25 @@ class TimerHierarchy
void adjust_time()
{
auto tnow = clock::now();
curnode->accTime +=
std::chrono::duration <double>(tnow - last_time).count();
curnode->addTime(chrono::duration<double>(tnow - last_time).count());
last_time = tnow;
}
void push_internal(const std::string &name)
void push_internal(const string &name)
{
auto it=curnode->child.find(name);
if (it==curnode->child.end())
{
MR_assert(name.find(':') == std::string::npos, "reserved character");
it = curnode->child.insert(make_pair(name,tstack_node(curnode))).first;
MR_assert(name.find(':') == string::npos, "reserved character");
it = curnode->child.insert(make_pair(name,tstack_node(name, curnode))).first;
}
curnode=&(it->second);
}
public:
TimerHierarchy()
: last_time(clock::now()), root(nullptr), curnode(&root) {}
void push(const std::string &name)
TimerHierarchy(const string &name="<root>")
: last_time(clock::now()), root(name, nullptr), curnode(&root) {}
void push(const string &name)
{
adjust_time();
push_internal(name);
......@@ -109,20 +193,27 @@ class TimerHierarchy
curnode = curnode->parent;
MR_assert(curnode!=nullptr, "tried to pop from empty timer stack");
}
void poppush(const std::string &name)
void poppush(const string &name)
{
pop();
push_internal(name);
}
std::map<std::string, double> get_timings()
map<string, double> get_timings()
{
adjust_time();
std::map<std::string, double> res;
map<string, double> res;
root.add_timings("root", res);
return res;
}
void report(ostream &os) const
{ ostringstream oss; root.report(oss); os<<oss.str(); }
};
}
using detail_timers::SimpleTimer;
using detail_timers::TimerHierarchy;
}
#endif
......@@ -24,6 +24,8 @@
#include <vector>
#include <memory>
#include <cmath>
#include <type_traits>
#include "ducc0/infra/simd.h"
#include "ducc0/math/gl_integrator.h"
#include "ducc0/math/constants.h"
......@@ -46,8 +48,15 @@ vector<double> getCoeffs(size_t W, size_t D, const function<double(double)> &fun
double l = -1+2.*i/double(W);
double r = -1+2.*(i+1)/double(W);
// function values at Chebyshev nodes
double avg = 0;
for (size_t j=0; j<=D; ++j)
{
y[j] = func(chebroot[j]*(r-l)*0.5 + (r+l)*0.5);
avg += y[j];
}
avg/=(D+1);
for (size_t j=0; j<=D; ++j)
y[j] -= avg;
// Chebyshev coefficients
for (size_t j=0; j<=D; ++j)
{
......@@ -70,6 +79,7 @@ vector<double> getCoeffs(size_t W, size_t D, const function<double(double)> &fun
for (size_t j=0; j<=D; ++j)
for (size_t k=0; k<=D; ++k)
lcf2[k] += C[j*(D+1) + k]*lcf[j];
lcf2[0] += avg;
for (size_t j=0; j<=D; ++j)
coeff[j*W + i] = lcf2[D-j];
}
......@@ -286,7 +296,7 @@ template<typename T> class HornerKernel: public GriddingKernel<T>
virtual T eval_single(T x) const
{ return (this->*evalsinglefunc)(x); }
virtual double corfunc(double x) const {return corr.corfunc(x); }
virtual double corfunc(double x) const { return corr.corfunc(x); }
/* Computes the correction function values at a coordinates
[0, dx, 2*dx, ..., (n-1)*dx] */
......@@ -320,37 +330,84 @@ template<size_t W, typename T> class TemplateKernel
constexpr size_t support() const { return W; }
void eval(T x, native_simd<T> *res) const
[[gnu::always_inline]] void eval2s(T x, T y, T z, native_simd<T> * DUCC0_RESTRICT res) const
{
x = (x+1)*W-1;
for (size_t i=0; i<nvec; ++i)
z += W*T(0.5); // now in [0; W[
auto nth = min(W-1, size_t(max(T(0), z)));
z = (z-nth)*2-1;
if constexpr (nvec==1)
{
auto tval = coeff[i];
auto tvalx = coeff[0];
auto tvaly = coeff[0];
auto tvalz = coeff[0];
for (size_t j=1; j<=D; ++j)
tval = tval*x + coeff[j*nvec+i];
res[i] = tval;
{
tvalx = tvalx*x + coeff[j];
tvaly = tvaly*y + coeff[j];
tvalz = tvalz*z + coeff[j];
}
res[0] = tvalx*T(tvalz[nth]);
res[1] = tvaly;
}
else
{
auto ptrz = scoeff+nth;
auto tvalz = *ptrz;
for (size_t j=1; j<=D; ++j)
tvalz = tvalz*z + ptrz[j*sstride];
for (size_t i=0; i<nvec; ++i)
{
auto tvalx = coeff[i];
auto tvaly = coeff[i];
for (size_t j=1; j<=D; ++j)
{
tvalx = tvalx*x + coeff[j*nvec+i];
tvaly = tvaly*y + coeff[j*nvec+i];
}
res[i] = tvalx*tvalz;
res[i+nvec] = tvaly;
}
}
}
T eval_single(T x) const
[[gnu::always_inline]] void eval2(T x, T y, native_simd<T> * DUCC0_RESTRICT res) const
{
auto nth = min(W-1, size_t(max(T(0), (x+1)*W*T(0.5))));
x = (x+1)*W-2*nth-1;