diff --git a/nifty_gridder.cc b/nifty_gridder.cc index aaa33298f77185e7692ed6dbe734d2e6e5faf695..2f5c456fc2fe9340275c3a98004e03b3088b7b11 100644 --- a/nifty_gridder.cc +++ b/nifty_gridder.cc @@ -1,15 +1,42 @@ +/* + * This file is part of nifty_gridder. + * + * nifty_gridder is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * nifty_gridder is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with nifty_fridder; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + #include #include #include #include +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#define NOINLINE __attribute__ ((noinline)) +#else +#define RESTRICT +#endif + using namespace std; namespace py = pybind11; namespace { -constexpr double pi = 3.141592653589793238462643383279502884197; +// +// Utilities for converting between Cartesian coordinates and Peano index +// static const uint16_t utab[] = { #define Z(a) 0x##a##0, 0x##a##1, 0x##a##4, 0x##a##5 @@ -99,10 +126,13 @@ uint32_t morton2peano2D_32(uint32_t v, int bits) void myassert(bool cond, const char *msg) { if (cond) return; - cerr << msg << endl; - throw 42; + throw runtime_error(msg); } +// +// Utilities for indirect sorting (argsort) +// + template class IdxComp__ { private: @@ -149,6 +179,71 @@ inline double fmodulo (double v1, double v2) // return (v1>=0) ? ((v10.1) ? (1.+x)*(1.-x) : 1.-x*x; } + +void legendre_prep(int n, vector &x, vector &w) + { + constexpr double pi = 3.141592653589793238462643383279502884197; + constexpr double eps = 3e-14; + int m = (n+1)>>1; + x.resize(m); + w.resize(m); + + double t0 = 1 - (1-1./n) / (8.*n*n); + double t1 = 1./(4.*n+2.); + +#pragma omp parallel +{ + int i; +#pragma omp for schedule(dynamic,100) + for (i=1; i<=m; ++i) + { + double x0 = cos(pi * ((i<<2)-1) * t1) * t0; + + int dobreak=0; + int j=0; + double dpdx; + while(1) + { + double P_1 = 1.0; + double P0 = x0; + double dx, x1; + + for (int k=2; k<=n; k++) + { + double P_2 = P_1; + P_1 = P0; +// P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k; + P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2); + } + + dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0); + + /* Newton step */ + x1 = x0 - P0/dpdx; + dx = x0-x1; + x0 = x1; + if (dobreak) break; + + if (abs(dx)<=eps) dobreak=1; + if (++j>=100) throw runtime_error("convergence problem"); + } + + x[m-i] = x0; + w[m-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx); + } +} // end of parallel region + } + +// +// Start of real gridder functionality +// + using a_i_c = py::array_t; using a_d_c = py::array_t; using a_c_c = py::array_t, @@ -183,86 +278,75 @@ a_i_c peanoindex(const a_d_c &uv_, int nu, int nv) return res; } -class Helper +int get_w(double epsilon) { - private: - int nu, nv, nspread, nbuf; - double r2lamb, fac; - vector kernel; + static const vector maxmaperr { 1e8, 0.32, 0.021, 6.2e-4, + 1.08e-5, 1.25e-7, 8.25e-10, 5.70e-12, 1.22e-13, 2.48e-15, 4.82e-17, + 6.74e-19, 5.41e-21, 4.41e-23, 7.88e-25, 3.9e-26 }; + double epssq = epsilon*epsilon; + + for (size_t i=1; imaxmaperr[i]) return i; + throw runtime_error("requested epsilon too small - minimum is 2e-13"); + } + +class Helper + { + protected: + int nu, nv; + public: + int w; + double beta; + protected: + int nsafe, su; public: - vector ku, kv; - int iu, iv, idxu0, idxv0; - complex val; + int sv; - Helper(int nu_, int nv_, int nspread_, double r2lamb_) - : nu(nu_), nv(nv_), nspread(nspread_), nbuf(2*nspread_), r2lamb(r2lamb_), - fac(pi/r2lamb), kernel(nspread+1), ku(nbuf), kv(nbuf) - { - // Precompute gridding kernel - for (size_t i=0; i vis) + vector kernel; + int iu0, iv0; // start index of the current visibility + int bu0, bv0; // start index of the current buffer + + void NOINLINE update(double u_in, double v_in) { auto u = fmodulo(u_in, 1.)*nu; - iu = min(nu-1, int(u)); - auto du = u-iu; - idxu0 = (iu-nspread+1+nu)%nu; + iu0 = int(u-w*0.5 + 1 + nu) - nu; + if (iu0+w>nu+nsafe) iu0 = nu+nsafe-w; auto v = fmodulo(v_in, 1.)*nv; - iv = min(nv-1, int(v)); - auto dv = v-iv; - idxv0 = (iv-nspread+1+nv)%nv; - - val = vis*exp(-fac*(du*du + dv*dv)); - - auto fu0 = exp(2*fac*du); - auto fv0 = exp(2*fac*dv); - auto fu = 1.; - auto fv = 1.; - for (int i=0; inv+nsafe) iv0 = nv+nsafe-w; + double xw=2./w; + for (int i=0; ibu0+su) || (iv0+w>bv0+sv); } -class Buffer - { - protected: - int nu, nv, nspread, su; - public: - int sv; - protected: - int u0, v0; - - bool need_to_move(int iu, int iv) const + void update_position() { - return (abs(iu-u0)>su-nspread) || (abs(iv-v0)>sv-nspread); + bu0=max(-nsafe, min(nu+nsafe-su, iu0+nsafe-su/2)); + bv0=max(-nsafe, min(nv+nsafe-sv, iv0+nsafe-sv/2)); } - void update_position(int iu, int iv) + protected: + Helper(int nu_, int nv_, double epsilon) + : nu(nu_), nv(nv_), + w(get_w(epsilon)), beta(2.3*w), nsafe((w+1)/2), + su(min(max(2*w,80), nu)), sv(min(max(2*w,80), nv)), + kernel(2*w), + bu0(-1000000), bv0(-1000000) { - int safe_u = su-nspread, safe_v = sv-nspread; - u0=max(safe_u, min(nu-1-safe_u, iu)); - v0=max(safe_v, min(nv-1-safe_v, iv)); + if (min(nu,nv)<2*nsafe) throw runtime_error("grid dimensions too small"); } - - public: - Buffer(int nu_, int nv_, int nspread_) - : nu(nu_), nv(nv_), nspread(nspread_), - su(nspread+min(3*nspread, nu)), sv(3*nspread+min(nspread, nv)), - u0(-1000000), v0(-1000000) - {} }; -class WriteBuffer: public Buffer +class WriteHelper: public Helper { private: vector> data; @@ -270,17 +354,17 @@ class WriteBuffer: public Buffer void dump() { - if (u0<0) return; + if (bu0<-nsafe) return; // nothing written into buffer yet #pragma omp critical { - int idxu = (u0-su+1+nu)%nu; - int idxv0 = (v0-sv+1+nv)%nv; - for (int iu=0; iu<2*su; ++iu) + int idxu = (bu0+nu)%nu; + int idxv0 = (bv0+nv)%nv; + for (int iu=0; iu=nv) idxv=0; } if (++idxu>=nu) idxu=0; @@ -290,24 +374,24 @@ class WriteBuffer: public Buffer public: complex *p0; - WriteBuffer(int nu_, int nv_, int nspread_, complex *grid_) - : Buffer(nu_, nv_, nspread_), data(4*su*sv,0.), grid(grid_) {} - ~WriteBuffer() { dump(); } + WriteHelper(int nu_, int nv_, double epsilon, complex *grid_) + : Helper(nu_, nv_, epsilon), data(su*sv, 0.), grid(grid_) {} + ~WriteHelper() { dump(); } - void prep_write(int iu, int iv) - /* iu = [0; nu-1]; iv = [0; nv-1] */ + void prep_write(double u_in, double v_in) { - if (need_to_move(iu, iv)) + update(u_in, v_in); + if (need_to_move()) { dump(); - update_position(iu, iv); + update_position(); fill(data.begin(), data.end(), 0.); } - p0 = data.data() + 2*sv*(iu-u0+su-nspread) + iv-v0+sv-nspread; + p0 = data.data() + sv*(iu0-bu0) + iv0-bv0; } }; -class ReadBuffer: public Buffer +class ReadHelper: public Helper { private: vector> data; @@ -315,14 +399,14 @@ class ReadBuffer: public Buffer void load() { - int idxu = (u0-su+1+nu)%nu; - int idxv0 = (v0-sv+1+nv)%nv; - for (int iu=0; iu<2*su; ++iu) + int idxu = (bu0+nu)%nu; + int idxv0 = (bv0+nv)%nv; + for (int iu=0; iu=nv) idxv=0; } if (++idxu>=nu) idxu=0; @@ -331,23 +415,47 @@ class ReadBuffer: public Buffer public: const complex *p0; - ReadBuffer(int nu_, int nv_, int nspread_, const complex *grid_) - : Buffer(nu_, nv_, nspread_), data(4*su*sv,0.), grid(grid_) {} + ReadHelper(int nu_, int nv_, double epsilon, const complex *grid_) + : Helper(nu_, nv_, epsilon), data(su*sv,0.), grid(grid_), p0(nullptr) {} - void prep_read(int iu, int iv) - /* iu = [0; nu-1]; iv = [0; nv-1] */ + void prep_read(double u_in, double v_in) { - if (need_to_move(iu, iv)) + update(u_in, v_in); + if (need_to_move()) { - update_position(iu, iv); + update_position(); load(); } - p0 = data.data() + 2*sv*(iu-u0+su-nspread) + iv-v0+sv-nspread; + p0 = data.data() + sv*(iu0-bu0) + iv0-bv0; } }; -a_c_c to_grid (const a_d_c &uv_, const a_c_c &vis_, - int nu, int nv, int nspread, double r2lamb) +a_d_c complex2hartley (const a_c_c &grid_) + { + myassert(grid_.ndim()==2, "grid array must be 2D"); + int nu = grid_.shape(0), nv = grid_.shape(1); + auto grid = grid_.data(); + + int odim[] = {nu,nv}; + a_d_c res(odim); + auto grid2 = res.mutable_data(); + for (int u=0; u tmp = hlp.val*hlp.ku[cu]; - for (int cv=0; cv<2*nspread; ++cv) - *ptr++ += tmp*hlp.kv[cv]; - ptr+=delta; + complex tmp = v*ku[cu]; + for (int cv=0; cv(v1+v2, v1-v2); } } return res; } -a_c_c from_grid (const a_d_c &uv_, const a_c_c &grid_, - int nu, int nv, int nspread, double r2lamb) +a_c_c from_grid (const a_d_c &uv_, const a_d_c &grid0_, double epsilon) { myassert(uv_.ndim()==2, "uv array must be 2D"); myassert(uv_.shape(1)==2, "uv.shape[1] must be 2"); - myassert(grid_.ndim()==2, "grid array must be 2D"); + myassert(grid0_.ndim()==2, "grid array must be 2D"); + auto grid_ = hartley2complex(grid0_); int nvis = uv_.shape(0); auto uv = uv_.data(); auto grid = grid_.data(); - myassert(nu==grid_.shape(0), "oops"); - myassert(nv==grid_.shape(1), "oops"); + int nu=grid_.shape(0), + nv=grid_.shape(1); int odim[] = {nvis}; a_c_c res(odim); @@ -431,52 +542,55 @@ a_c_c from_grid (const a_d_c &uv_, const a_c_c &grid_, // Loop over sampling points #pragma omp parallel { - Helper hlp(nu, nv, nspread, r2lamb); - ReadBuffer buf(nu, nv, nspread, grid); - int delta = 2*(buf.sv-nspread); + ReadHelper hlp(nu, nv, epsilon, grid); + double emb = exp(-2*hlp.beta); #pragma omp for schedule(dynamic,10000) for (int ipart=0; ipart r = 0.; - buf.prep_read(hlp.iu, hlp.iv); - auto ptr = buf.p0; - for (int cu=0; cu<2*nspread; ++cu) + auto * RESTRICT ptr = hlp.p0; + int w = hlp.w; + const double * RESTRICT ku = hlp.kernel.data(); + const double * RESTRICT kv = hlp.kernel.data()+hlp.w; + for (int cu=0; cu tmp = 0.; - for (int cv=0; cv<2*nspread; ++cv) - tmp += (*ptr++) * hlp.kv[cv]; - r+=tmp*hlp.ku[cu]; - ptr += delta; + for (int cv=0; cv x, wgt; + legendre_prep(2*p,x,wgt); + auto psi = x; + for (auto &v:psi) + v = exp(beta*(sqrt(1-v*v)-1.)); + int odim[] = {int(nval)}; + a_d_c res(odim); + auto val = res.mutable_data(); + for (size_t k=0; k(v1+v2, v1-v2); - } + double tmp=0; + for (int i=0; i=1.10.4', 'pybind11>=2.2.1'], + setup_requires=['numpy>=1.15.0', 'pybind11>=2.2.4'], ext_modules=get_extension_modules(), - install_requires=['numpy>=1.10.4', 'pybind11>=2.2.1'] + install_requires=['numpy>=1.15.0', 'pybind11>=2.2.4'] )