/*
 *  This file is part of nifty_gridder.
 *
 *  nifty_gridder is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  nifty_gridder is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with nifty_gridder; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/* Copyright (C) 2019 Max-Planck-Society
   Author: Martin Reinecke */

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <iostream>
#include <algorithm>
#include <cstdlib>
#include <cmath>

#include "pocketfft_hdronly.h"

using namespace std;

namespace py = pybind11;

namespace {

auto None = py::none();

//
// basic utilities
//

void myassert(bool cond, const char *msg)
  {
  if (cond) return;
  throw runtime_error(msg);
  }

/*! Returns the remainder of the division \a v1/v2.
    The result is non-negative.
    \a v1 can be positive or negative; \a v2 must be positive. */
template<typename T> inline T fmodulo (T v1, T v2)
  {
  if (v1>=0)
    return (v1<v2) ? v1 : fmod(v1,v2);
  T tmp=fmod(v1,v2)+v2;
  return (tmp==v2) ? T(0) : tmp;
  }

static size_t nthreads = 1;

constexpr auto set_nthreads_DS = R"""(
Specifies the number of threads to be used by the module

Parameters
==========
nthreads: int
    the number of threads to be used. Must be >=1.
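
Example
=======
A minimal usage sketch (assuming the compiled module has been imported
as nifty_gridder):

>>> import nifty_gridder
>>> nifty_gridder.set_nthreads(4)
>>> nifty_gridder.get_nthreads()
4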
)""";
void set_nthreads(size_t nthreads_)
  {
  myassert(nthreads_>=1, "nthreads must be >= 1");
  nthreads = nthreads_;
  }

constexpr auto get_nthreads_DS = R"""(
Returns the number of threads used by the module

Returns
=======
int : the number of threads used by the module
)""";
size_t get_nthreads()
  { return nthreads; }

//
// Utilities for Gauss-Legendre quadrature
//

static inline double one_minus_x2 (double x)
  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }

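/* Computes nodes and weights of n-point Gauss-Legendre quadrature on
   [-1,1] by Newton iteration on the Legendre recurrence; only the
   m=(n+1)/2 non-negative nodes are stored, since the rule is symmetric
   about zero. */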
void legendre_prep(int n, vector<double> &x, vector<double> &w)
  {
  constexpr double pi = 3.141592653589793238462643383279502884197;
  constexpr double eps = 3e-14;
  int m = (n+1)>>1;
  x.resize(m);
  w.resize(m);

  double t0 = 1 - (1-1./n) / (8.*n*n);
  double t1 = 1./(4.*n+2.);

#pragma omp parallel num_threads(nthreads)
{
  int i;
#pragma omp for schedule(dynamic,100)
  for (i=1; i<=m; ++i)
    {
    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;

    int dobreak=0;
    int j=0;
    double dpdx;
    while(1)
      {
      double P_1 = 1.0;
      double P0 = x0;
      double dx, x1;

      for (int k=2; k<=n; k++)
        {
        double P_2 = P_1;
        P_1 = P0;
//        P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
        }

      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);

      /* Newton step */
      x1 = x0 - P0/dpdx;
      dx = x0-x1;
      x0 = x1;
      if (dobreak) break;

      if (abs(dx)<=eps) dobreak=1;
      myassert(++j<100, "convergence problem");
      }
    x[m-i] = x0;
    w[m-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
    }
} // end of parallel region
  }

//
// Start of real gridder functionality
//

template<typename T>
  using pyarr = py::array_t<T>;
// The "_c" suffix here stands for "C memory order, contiguous"
template<typename T>
  using pyarr_c = py::array_t<T, py::array::c_style | py::array::forcecast>;

template<typename T> pyarr_c<T> makeArray(const vector<size_t> &shape)
  { return pyarr_c<T>(shape); }

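// Returns the smallest kernel support w (in pixels) for which the maximum
// map error of the ES kernel stays below the requested epsilon;
// maxmaperr[w] tabulates the squared error bound for support w.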
size_t get_w(double epsilon)
  {
  static const vector<double> maxmaperr { 1e8, 0.32, 0.021, 6.2e-4,
    1.08e-5, 1.25e-7, 8.25e-10, 5.70e-12, 1.22e-13, 2.48e-15, 4.82e-17,
    6.74e-19, 5.41e-21, 4.41e-23, 7.88e-25, 3.9e-26 };

  double epssq = epsilon*epsilon;

  for (size_t i=1; i<maxmaperr.size(); ++i)
    if (epssq>maxmaperr[i]) return i;
  throw runtime_error("requested epsilon too small - minimum is 2e-13");
  }

void checkArray(const py::array &arr, const char *aname,
  const vector<size_t> &shape)
  {
  if (size_t(arr.ndim())!=shape.size())
    {
    cerr << "Array '" << aname << "' has " << arr.ndim() << " dimensions; "
            "expected " << shape.size() << endl;
    throw runtime_error("bad dimensionality");
    }
  for (size_t i=0; i<shape.size(); ++i)
    if ((shape[i]!=0) && (size_t(arr.shape(i))!=shape[i]))
      {
      cerr << "Dimension " << i << " of array '" << aname << "' has size "
           << arr.shape(i) << "; expected " << shape[i] << endl;
      throw runtime_error("bad array size");
      }
  }

template<typename T> pyarr<T> provideArray(const py::object &in,
  const vector<size_t> &shape)
  {
  if (in.is_none())
    {
    auto tmp_ = makeArray<T>(shape);
    size_t sz = size_t(tmp_.size());
    auto tmp = tmp_.mutable_data();
    for (size_t i=0; i<sz; ++i)
      tmp[i] = T(0);
    return tmp_;
    }
  auto tmp_ = in.cast<pyarr<T>>();
  checkArray(tmp_, "temporary", shape);
  return tmp_;
  }

template<typename T> pyarr<T> providePotentialArray(const py::object &in,
  const vector<size_t> &shape)
  {
  if (in.is_none())
    return makeArray<T>(vector<size_t>(shape.size(), 0));
  auto tmp_ = in.cast<pyarr<T>>();
  checkArray(tmp_, "temporary", shape);
  return tmp_;
  }

template<typename T> pyarr_c<T> provideCArray(py::object &in,
  const vector<size_t> &shape)
  {
  if (in.is_none())
    {
    auto tmp_ = makeArray<T>(shape);
    size_t sz = size_t(tmp_.size());
    auto tmp = tmp_.mutable_data();
    for (size_t i=0; i<sz; ++i)
      tmp[i] = T(0);
    return tmp_;
    }
  auto tmp_ = in.cast<pyarr_c<T>>();
  checkArray(tmp_, "temporary", shape);
  return tmp_;
  }

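/* With this code's sign convention, a complex grid F with Hermitian
   symmetry F(-u,-v) = conj(F(u,v)) can be stored as the real Hartley grid
   H(u,v) = Re(F(u,v)) + Im(F(u,v)). The two helpers below convert between
   the representations; mirrored indices are taken modulo (nu, nv). */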
template<typename T> pyarr_c<T> complex2hartley
  (const pyarr_c<complex<T>> &grid_, py::object &grid_in)
  {
  checkArray(grid_, "grid", {0,0});
  size_t nu = size_t(grid_.shape(0)), nv = size_t(grid_.shape(1));
  auto grid = grid_.data();

  auto res = provideCArray<T>(grid_in, {nu, nv});
  auto grid2 = res.mutable_data();
  {
  py::gil_scoped_release release;
#pragma omp parallel for num_threads(nthreads)
  for (size_t u=0; u<nu; ++u)
    {
    size_t xu = (u==0) ? 0 : nu-u;
    for (size_t v=0; v<nv; ++v)
      {
      size_t xv = (v==0) ? 0 : nv-v;
      size_t i1 = u*nv+v;
      size_t i2 = xu*nv+xv;
      grid2[i1] += T(0.5)*(grid[i1].real()+grid[i1].imag()+
                           grid[i2].real()-grid[i2].imag());
      }
    }
  }
  return res;
  }

template<typename T> pyarr_c<complex<T>> hartley2complex
  (const pyarr_c<T> &grid_)
  {
  checkArray(grid_, "grid", {0, 0});
  size_t nu = size_t(grid_.shape(0)), nv = size_t(grid_.shape(1));
  auto grid = grid_.data();

  auto res=makeArray<complex<T>>({nu, nv});
  auto grid2 = res.mutable_data();
  {
  py::gil_scoped_release release;
#pragma omp parallel for num_threads(nthreads)
  for (size_t u=0; u<nu; ++u)
    {
    size_t xu = (u==0) ? 0 : nu-u;
    for (size_t v=0; v<nv; ++v)
      {
      size_t xv = (v==0) ? 0 : nv-v;
      size_t i1 = u*nv+v;
      size_t i2 = xu*nv+xv;
      T v1 = T(0.5)*grid[i1];
      T v2 = T(0.5)*grid[i2];
      grid2[i1] = complex<T>(v1+v2, v1-v2);
      }
    }
  }
  return res;
  }

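/* Full 2D Hartley transform: pocketfft computes the separable Hartley
   transform, and the subsequent fix-up loop combines the four mirrored
   quadrant entries into the true two-dimensional transform. */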
template<typename T> void hartley2_2D(const pyarr_c<T> &in, pyarr_c<T> &out)
  {
  size_t nu=in.shape(0), nv=in.shape(1);
  pocketfft::stride_t s_i{in.strides(0), in.strides(1)},
                      s_o{out.strides(0), out.strides(1)};
  auto d_i = in.data();
  auto ptmp = out.mutable_data();
  {
  py::gil_scoped_release release;
  pocketfft::r2r_separable_hartley({nu, nv}, s_i, s_o, {0,1}, d_i, ptmp, T(1),
    nthreads);
#pragma omp parallel for num_threads(nthreads)
  for(size_t i=1; i<(nu+1)/2; ++i)
    for(size_t j=1; j<(nv+1)/2; ++j)
       {
       T a = ptmp[i*nv+j];
       T b = ptmp[(nu-i)*nv+j];
       T c = ptmp[i*nv+nv-j];
       T d = ptmp[(nu-i)*nv+nv-j];
       ptmp[i*nv+j] = T(0.5)*(a+b+c-d);
       ptmp[(nu-i)*nv+j] = T(0.5)*(a+b+d-c);
       ptmp[i*nv+nv-j] = T(0.5)*(a+c+d-b);
       ptmp[(nu-i)*nv+nv-j] = T(0.5)*(b+c+d-a);
       }
  }
  }

/* Compute correction factors for the ES gridding kernel
   This implementation follows eqs. (3.8) to (3.10) of Barnett et al. 2018 */
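/* The "exponential of semicircle" kernel is
     psi(x) = exp(beta*(sqrt(1-x^2)-1))  for |x| <= 1,  with beta = 2.3*w;
   res[k] is (up to normalisation) the reciprocal of its Fourier transform
   at grid frequency k, evaluated by Gauss-Legendre quadrature. */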
vector<double> correction_factors (size_t n, size_t nval, size_t w)
  {
  constexpr double pi = 3.141592653589793238462643383279502884197;
  auto beta = 2.3*w;
  auto p = int(1.5*w+2);
  double alpha = pi*w/n;
  vector<double> x, wgt;
  legendre_prep(2*p,x,wgt);
  auto psi = x;
  for (auto &v:psi)
    v = exp(beta*(sqrt(1-v*v)-1.));
  vector<double> res(nval);
#pragma omp parallel for schedule(static) num_threads(nthreads)
  for (size_t k=0; k<nval; ++k)
    {
    double tmp=0;
    for (int i=0; i<p; ++i)
      tmp += wgt[i]*psi[i]*cos(alpha*k*x[i]);
    res[k] = 1./(w*tmp);
    }
  return res;
  }

template<typename T> struct UVW
  {
  T u, v, w;
  UVW () {}
  UVW (T u_, T v_, T w_) : u(u_), v(v_), w(w_) {}
  UVW operator* (T fct) const
    { return UVW(u*fct, v*fct, w*fct); }
  };

constexpr auto Baselines_DS = R"""(
Class storing UVW coordinates and channel information.

Parameters
==========
coord: np.array((nrows, 3), dtype=np.float)
    u, v and w coordinates for each row
freq: np.array((nchannels,), dtype=np.float)
    frequency for each individual channel (in Hz)
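
Example
=======
A construction sketch with hypothetical array contents (assuming the
module has been imported as nifty_gridder and numpy as np):

>>> coord = np.zeros((100, 3))        # (u, v, w) per row, in meters
>>> freq = np.linspace(1e9, 2e9, 64)  # channel frequencies in Hz
>>> bl = nifty_gridder.Baselines(coord, freq)
>>> bl.Nrows(), bl.Nchannels()
(100, 64)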
)""";
template<typename T> class Baselines
  {
  private:
    vector<UVW<T>> coord;
    vector<T> f_over_c;
    size_t nrows, nchan;

  public:
    Baselines(const pyarr<T> &coord_, const pyarr<T> &freq_)
      {
      constexpr double speedOfLight = 299792458.;
      checkArray(coord_, "coord", {0, 3});
      checkArray(freq_, "freq", {0});
      nrows = coord_.shape(0);
      nchan = freq_.shape(0);
      myassert(nrows*nchan<(size_t(1)<<32), "too many entries in MS");
      auto freq = freq_.template unchecked<1>();
      auto cood = coord_.template unchecked<2>();
      {
      py::gil_scoped_release release;
      f_over_c.resize(nchan);
      for (size_t i=0; i<nchan; ++i)
        f_over_c[i] = freq(i)/speedOfLight;
      coord.resize(nrows);
      for (size_t i=0; i<coord.size(); ++i)
        coord[i] = UVW<T>(cood(i,0), cood(i,1), cood(i,2));
      }
      }

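    // A visibility is addressed by a single 32-bit index encoding
    // (row, channel) as index = row*nchan + channel; effectiveCoord()
    // decodes this and scales the row's UVW by frequency/c, yielding
    // coordinates in wavelengths.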
    UVW<T> effectiveCoord(uint32_t index) const
      {
      size_t irow = index/nchan;
      size_t ichan = index-nchan*irow;
      return coord[irow]*f_over_c[ichan];
      }
    UVW<T> effectiveCoord(size_t irow, size_t ichan) const
      { return coord[irow]*f_over_c[ichan]; }
    size_t Nrows() const { return nrows; }
    size_t Nchannels() const { return nchan; }

    static constexpr auto ms2vis_DS = R"""(
    Extracts visibility data from a measurement set for the provided indices.

    Parameters
    ==========
    ms: np.array((nrows, nchannels), dtype=np.complex)
        the measurement set's visibility data
    idx: np.array((nvis,), dtype=np.uint32)
        the indices to be extracted

    Returns
    =======
    np.array((nvis,), dtype=np.complex)
        The visibility data for the index array
    )""";

    pyarr_c<T> effectiveuvw(const pyarr_c<uint32_t> &idx_) const
      {
      checkArray(idx_, "idx", {0});
      size_t nvis = size_t(idx_.shape(0));
      auto idx = idx_.template unchecked<1>();
      auto res_=makeArray<T>({nvis, 3});
      auto res = res_.template mutable_unchecked<2>();
      for (size_t i=0; i<nvis; i++)
        {
        auto uvw = effectiveCoord(idx(i));
        res(i,0) = uvw.u;
        res(i,1) = uvw.v;
        res(i,2) = uvw.w;
        }
      return res_;
      }

    template<typename T2> pyarr_c<T2> ms2vis(const pyarr<T2> &ms_,
      const pyarr_c<uint32_t> &idx_) const
      {
      checkArray(idx_, "idx", {0});
      checkArray(ms_, "ms", {nrows, nchan});
      size_t nvis = size_t(idx_.shape(0));
      auto idx = idx_.template unchecked<1>();
      auto ms = ms_.template unchecked<2>();

      auto res=makeArray<T2>({nvis});
      auto vis = res.mutable_data();
      {
      py::gil_scoped_release release;
#pragma omp parallel for num_threads(nthreads)
      for (size_t i=0; i<nvis; ++i)
        {
        auto t = idx(i);
        auto row = t/nchan;
        auto chan = t-row*nchan;
        vis[i] = ms(row, chan);
        }
      }
      return res;
      }

    static constexpr auto vis2ms_DS = R"""(
    Produces a new MS with the provided visibilities added at the given indices.

    Parameters
    ==========
    vis: np.array((nvis,), dtype=np.complex)
        The visibility data for the index array
    idx: np.array((nvis,), dtype=np.uint32)
        the indices to be inserted
    ms_in: np.array((nrows, nchannels), dtype=np.complex), optional
        input measurement set to which the visibilities are added.

    Returns
    =======
    np.array((nrows, nchannels), dtype=np.complex)
        the measurement set's visibility data (0 where not covered by idx)
    )""";
    template<typename T2> pyarr_c<T2> vis2ms(const pyarr<T2> &vis_,
      const pyarr<uint32_t> &idx_, py::object &ms_in) const
      {
      checkArray(vis_, "vis", {0});
      size_t nvis = size_t(vis_.shape(0));
      checkArray(idx_, "idx", {nvis});
      auto idx = idx_.template unchecked<1>();
      auto vis = vis_.template unchecked<1>();

      auto res = provideArray<T2>(ms_in, {nrows, nchan});
      auto ms = res.template mutable_unchecked<2>();
      {
      py::gil_scoped_release release;
#pragma omp parallel for num_threads(nthreads)
      for (size_t i=0; i<nvis; ++i)
        {
        auto t = idx(i);
        auto row = t/nchan;
        auto chan = t-row*nchan;
        ms(row, chan) += vis(i);
        }
      }
      return res;
      }
  };

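// Visibilities are bucketed into square tiles of 2^logsquare x 2^logsquare
// grid cells (cf. getIndices() and the Helper class); working tile by tile
// keeps the gridder's grid accesses cache-friendly.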
constexpr int logsquare=4;

constexpr auto grid2dirty_DS = R"""(
Converts from UV grid to dirty image (FFT, cropping, correction)

Parameters
==========
grid: np.array((nu, nv), dtype=np.float64)
    gridded UV data

Returns
=======
np.array((nxdirty, nydirty), dtype=np.float64)
    the dirty image
)""";

constexpr auto dirty2grid_DS = R"""(
Converts from a dirty image to a UV grid (correction, padding, FFT)

Parameters
==========
dirty: np.array((nxdirty, nydirty), dtype=np.float64)
    the dirty image

Returns
=======
np.array((nu, nv), dtype=np.float64)
    gridded UV data
)""";

constexpr auto apply_taper_DS = R"""(
Applies the taper (or its inverse) to an image

Parameters
==========
img: np.array((nxdirty, nydirty), dtype=np.float64)
    the image
divide: bool
    if True, the routine divides by the taper, otherwise it multiplies by it

Returns
=======
np.array((nxdirty, nydirty), dtype=np.float64)
    the image with the taper applied
)""";

constexpr auto apply_wscreen_DS = R"""(
Applies the w screen to an image

Parameters
==========
dirty: np.array((nxdirty, nydirty), dtype=np.complex128)
    the image
w : float
    the w value to use
adjoint: bool
    if True, apply the complex conjugate of the w screen

Returns
=======
np.array((nxdirty, nydirty), dtype=np.complex128)
    the image with the w screen applied
)""";

constexpr auto GridderConfig_DS = R"""(
Class storing information related to the gridding/degridding process.

Parameters
==========
nxdirty: int
    x resolution of the dirty image; must be even
nydirty: int
    y resolution of the dirty image; must be even
epsilon: float
    required accuracy for the gridding/degridding step
    Must be >= 2e-13.
pixsize_x: float
    Pixel size in x direction (radians)
pixsize_y: float
    Pixel size in y direction (radians)
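
Example
=======
A construction sketch with hypothetical values:

>>> gc = nifty_gridder.GridderConfig(nxdirty=128, nydirty=128,
...   epsilon=1e-7, pixsize_x=1e-5, pixsize_y=1e-5)
>>> gc.Nu(), gc.Nv()  # padded UV grid; here 2*nxdirty, 2*nydirty
(256, 256)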
)""";
template<typename T> class GridderConfig
  {
  private:
    size_t nx_dirty, ny_dirty;
    double eps, psx, psy;
    size_t w, nsafe, nu, nv;
    T beta;
    vector<T> cfu, cfv;

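    // x and y arrive pre-squared (the callers pass fx*fx and fy*fy), so
    // eps = sqrt(x+y) is the angular distance from the phase centre;
    // nm1 = cos(eps)-1 is computed as -sin^2(eps)/(1+cos(eps)) for
    // numerical stability, and the screen is exp(+-2*pi*i*w*nm1)/n.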
    complex<T> wscreen(double x, double y, double w, bool adjoint) const
      {
      constexpr double pi = 3.141592653589793238462643383279502884197;
      double eps = sqrt(x+y);
      double s = sin(eps);
      double nm1 = -s*s/(1.+cos(eps));
      double n = nm1+1., xn = 1./n;
      double phase = 2*pi*w*nm1;
      if (adjoint) phase *= -1;
      return complex<T>(cos(phase)*xn, sin(phase)*xn);
      }

  public:
    GridderConfig(size_t nxdirty, size_t nydirty, double epsilon,
      double pixsize_x, double pixsize_y)
      : nx_dirty(nxdirty), ny_dirty(nydirty), eps(epsilon),
        psx(pixsize_x), psy(pixsize_y),
        w(get_w(epsilon)), nsafe((w+1)/2),
        nu(max(2*nsafe,2*nx_dirty)), nv(max(2*nsafe,2*ny_dirty)),
        beta(2.3*w),
        cfu(nx_dirty), cfv(ny_dirty)
      {
      {
      py::gil_scoped_release release;
      myassert((nx_dirty&1)==0, "nx_dirty must be even");
      myassert((ny_dirty&1)==0, "ny_dirty must be even");
      myassert(epsilon>0, "epsilon must be positive");
      myassert(pixsize_x>0, "pixsize_x must be positive");
      myassert(pixsize_y>0, "pixsize_y must be positive");

      auto tmp = correction_factors(nu, nx_dirty/2+1, w);
      cfu[nx_dirty/2]=tmp[0];
      cfu[0]=tmp[nx_dirty/2];
      for (size_t i=1; i<nx_dirty/2; ++i)
        cfu[nx_dirty/2-i] = cfu[nx_dirty/2+i] = tmp[i];
      tmp = correction_factors(nv, ny_dirty/2+1, w);
      cfv[ny_dirty/2]=tmp[0];
      cfv[0]=tmp[ny_dirty/2];
      for (size_t i=1; i<ny_dirty/2; ++i)
        cfv[ny_dirty/2-i] = cfv[ny_dirty/2+i] = tmp[i];
      }
      }
    size_t Nxdirty() const { return nx_dirty; }
    size_t Nydirty() const { return ny_dirty; }
    double Epsilon() const { return eps; }
    double Pixsize_x() const { return psx; }
    double Pixsize_y() const { return psy; }
    size_t Nu() const { return nu; }
    size_t Nv() const { return nv; }
    size_t W() const { return w; }
    size_t Nsafe() const { return nsafe; }
    T Beta() const { return beta; }

    pyarr_c<T> grid2dirty(const pyarr_c<T> &grid) const
      {
      checkArray(grid, "grid", {nu, nv});
      auto tmp = makeArray<T>({nu, nv});
      auto ptmp = tmp.mutable_data();
      hartley2_2D<T>(grid, tmp);
      auto res = makeArray<T>({nx_dirty, ny_dirty});
      auto pout = res.mutable_data();
      {
      py::gil_scoped_release release;
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          {
          size_t i2 = nu-nx_dirty/2+i;
          if (i2>=nu) i2-=nu;
          size_t j2 = nv-ny_dirty/2+j;
          if (j2>=nv) j2-=nv;
          pout[ny_dirty*i + j] = ptmp[nv*i2+j2]*cfu[i]*cfv[j];
          }
      }
      return res;
      }
    pyarr_c<T> apply_taper(const pyarr_c<T> &img, bool divide) const
      {
      checkArray(img, "img", {nx_dirty, ny_dirty});
      auto pin = img.data();
      auto res = makeArray<T>({nx_dirty, ny_dirty});
      auto pout = res.mutable_data();
      {
      py::gil_scoped_release release;
      if (divide)
        for (size_t i=0; i<nx_dirty; ++i)
          for (size_t j=0; j<ny_dirty; ++j)
            pout[ny_dirty*i + j] = pin[ny_dirty*i + j]/(cfu[i]*cfv[j]);
      else
        for (size_t i=0; i<nx_dirty; ++i)
          for (size_t j=0; j<ny_dirty; ++j)
            pout[ny_dirty*i + j] = pin[ny_dirty*i + j]*cfu[i]*cfv[j];
      }
      return res;
      }
    pyarr_c<complex<T>> grid2dirty_c(const pyarr_c<complex<T>> &grid) const
      {
      checkArray(grid, "grid", {nu, nv});
      auto tmp = makeArray<complex<T>>({nu, nv});
      auto ptmp = tmp.mutable_data();
      pocketfft::c2c({nu,nv},{grid.strides(0),grid.strides(1)},
        {tmp.strides(0), tmp.strides(1)}, {0,1}, pocketfft::BACKWARD,
        grid.data(), tmp.mutable_data(), T(1), nthreads);
      auto res = makeArray<complex<T>>({nx_dirty, ny_dirty});
      auto pout = res.mutable_data();
      {
      py::gil_scoped_release release;
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          {
          size_t i2 = nu-nx_dirty/2+i;
          if (i2>=nu) i2-=nu;
          size_t j2 = nv-ny_dirty/2+j;
          if (j2>=nv) j2-=nv;
          pout[ny_dirty*i + j] = ptmp[nv*i2+j2]*cfu[i]*cfv[j];
          }
      }
      return res;
      }

    pyarr_c<T> dirty2grid(const pyarr_c<T> &dirty) const
      {
      checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
      auto pdirty = dirty.data();
      auto tmp = makeArray<T>({nu, nv});
      auto ptmp = tmp.mutable_data();
      {
      py::gil_scoped_release release;
      for (size_t i=0; i<nu*nv; ++i)
        ptmp[i] = 0.;
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          {
          size_t i2 = nu-nx_dirty/2+i;
          if (i2>=nu) i2-=nu;
          size_t j2 = nv-ny_dirty/2+j;
          if (j2>=nv) j2-=nv;
          ptmp[nv*i2+j2] = pdirty[ny_dirty*i + j]*cfu[i]*cfv[j];
          }
      }
      hartley2_2D<T>(tmp, tmp);
      return tmp;
      }
    pyarr_c<complex<T>> dirty2grid_c(const pyarr_c<complex<T>> &dirty) const
      {
      checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
      auto pdirty = dirty.data();
      auto tmp = makeArray<complex<T>>({nu, nv});
      auto ptmp = tmp.mutable_data();
      pocketfft::stride_t strides{tmp.strides(0),tmp.strides(1)};
      {
      py::gil_scoped_release release;
      for (size_t i=0; i<nu*nv; ++i)
        ptmp[i] = 0.;
      for (size_t i=0; i<nx_dirty; ++i)
        for (size_t j=0; j<ny_dirty; ++j)
          {
          size_t i2 = nu-nx_dirty/2+i;
          if (i2>=nu) i2-=nu;
          size_t j2 = nv-ny_dirty/2+j;
          if (j2>=nv) j2-=nv;
          ptmp[nv*i2+j2] = pdirty[ny_dirty*i + j]*cfu[i]*cfv[j];
          }
      pocketfft::c2c({nu,nv}, strides, strides, {0,1}, pocketfft::FORWARD,
        ptmp, ptmp, T(1), nthreads);
      }
      return tmp;
      }
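    // Maps a coordinate given in wavelengths to grid space: u becomes the
    // (periodically wrapped) fractional pixel position in [0,nu), and iu0
    // the index of the first of the w grid cells touched by the kernel,
    // clamped so that all accesses stay within the nsafe-padded range.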
    inline void getpix(T u_in, T v_in, T &u, T &v, int &iu0, int &iv0) const
      {
      u=fmodulo(u_in*psx, T(1))*nu;
      iu0 = int(u-w*0.5 + 1 + nu) - nu;
      if (iu0+w>nu+nsafe) iu0 = nu+nsafe-w;
      v=fmodulo(v_in*psy, T(1))*nv;
      iv0 = int(v-w*0.5 + 1 + nv) - nv;
      if (iv0+w>nv+nsafe) iv0 = nv+nsafe-w;
      }
    pyarr_c<complex<T>> apply_wscreen(const pyarr_c<complex<T>> &dirty_, double w, bool adjoint) const
      {
      checkArray(dirty_, "dirty", {nx_dirty, ny_dirty});
      auto dirty = dirty_.data();
      auto res_ = makeArray<complex<T>>({nx_dirty, ny_dirty});
      auto res = res_.mutable_data();
      double x0 = -0.5*nx_dirty*psx,
             y0 = -0.5*ny_dirty*psy;
      {
      py::gil_scoped_release release;
#pragma omp parallel num_threads(nthreads)
{
#pragma omp for schedule(static)
      for (size_t i=0; i<=nx_dirty/2; ++i)
        {
        double fx = x0+i*psx;
        fx *= fx;
        for (size_t j=0; j<=ny_dirty/2; ++j)
          {
          double fy = y0+j*psy;
          auto ws = wscreen(fx, fy*fy, w, adjoint);
          res[ny_dirty*i+j] = dirty[ny_dirty*i+j]*ws; // lower left
          size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
          if ((i>0)&&(i<i2))
            {
            res[ny_dirty*i2+j] = dirty[ny_dirty*i2+j]*ws; // lower right
            if ((j>0)&&(j<j2))
              res[ny_dirty*i2+j2] = dirty[ny_dirty*i2+j2]*ws; // upper right
            }
          if ((j>0)&&(j<j2))
            res[ny_dirty*i+j2] = dirty[ny_dirty*i+j2]*ws; // upper left
          }
        }
}
      }
    return res_;
    }
  };

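/* Scatter/gather helper for the gridding loops. Instead of accessing the
   full (nu, nv) grid for every visibility, each thread works on a small
   private buffer covering one 2^logsquare-cell tile plus an nsafe-wide
   margin on each side; dump() merges the write buffer into the shared grid
   inside a critical section, load() refills the read buffer, and prep()
   repositions the buffers for a new (u,v) when necessary and evaluates the
   separable kernel weights. */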
template<typename T, typename T2=complex<T>> class Helper
  {
  private:
    const GridderConfig<T> &gconf;
    int nu, nv, nsafe, w;
    T beta;
    const T2 *grid_r;
    T2 *grid_w;
    int su, sv;
    int iu0, iv0; // start index of the current visibility
    int bu0, bv0; // start index of the current buffer

    vector<T2> rbuf, wbuf;

    void dump() const
      {
      if (bu0<-nsafe) return; // nothing written into buffer yet

#pragma omp critical (gridder_writing_to_grid)
{
      int idxu = (bu0+nu)%nu;
      int idxv0 = (bv0+nv)%nv;
      for (int iu=0; iu<su; ++iu)
        {
        int idxv = idxv0;
        for (int iv=0; iv<sv; ++iv)
          {
          grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv];
          if (++idxv>=nv) idxv=0;
          }
        if (++idxu>=nu) idxu=0;
        }
}
      }

    void load()
      {
      int idxu = (bu0+nu)%nu;
      int idxv0 = (bv0+nv)%nv;
      for (int iu=0; iu<su; ++iu)
        {
        int idxv = idxv0;
        for (int iv=0; iv<sv; ++iv)
          {
          rbuf[iu*sv + iv] = grid_r[idxu*nv + idxv];
          if (++idxv>=nv) idxv=0;
          }
        if (++idxu>=nu) idxu=0;
        }
      }

  public:
    const T2 *p0r;
    T2 *p0w;
    vector<T> kernel;

    Helper(const GridderConfig<T> &gconf_, const T2 *grid_r_, T2 *grid_w_)
      : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
        w(gconf.W()), beta(gconf.Beta()), grid_r(grid_r_), grid_w(grid_w_),
        su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
        bu0(-1000000), bv0(-1000000),
        rbuf(su*sv*(grid_r!=nullptr),T(0)),
        wbuf(su*sv*(grid_w!=nullptr),T(0)),
        kernel(2*w)
      {}
    ~Helper() { if (grid_w) dump(); }

    int lineJump() const { return sv; }

    void prep(T u_in, T v_in)
      {
      T u, v;
      gconf.getpix(u_in, v_in, u, v, iu0, iv0);
      T xw=T(2)/w;
      auto x0 = xw*(iu0-u);
      auto y0 = xw*(iv0-v);
      for (int i=0; i<w; ++i)
        {
        auto x = x0+i*xw;
        kernel[i  ] = beta*sqrt(T(1)-x*x);
        auto y = y0+i*xw;
        kernel[i+w] = beta*sqrt(T(1)-y*y);
        }
      for (auto &k : kernel)
        k = exp(k);

      if ((iu0<bu0) || (iv0<bv0) || (iu0+w>bu0+su) || (iv0+w>bv0+sv))
        {
        if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); }
        bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe;
        bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe;
        if (grid_r) load();
        }
      p0r = grid_r ? rbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
      p0w = grid_w ? wbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
      }
  };

constexpr auto vis2grid_c_DS = R"""(
Grids visibilities onto a UV grid

Parameters
==========
baselines: Baselines
    the Baselines object
gconf: GridderConfig
    the GridderConfig object to be used
idx: np.array((nvis,), dtype=np.uint32)
    the indices for the entries to be gridded
vis: np.array((nvis,), dtype=np.complex)
    The visibility data for the index array
grid_in: np.array((nu,nv), dtype=np.complex128), optional
    If present, the result is added to this array.
wgt: np.array((nvis,), dtype=np.float64), optional
    If present, visibilities are multiplied by the corresponding entries.

Returns
=======
np.array((nu,nv), dtype=np.complex128):
    the gridded visibilities
)""";
template<typename T> pyarr_c<complex<T>> vis2grid_c(
  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
  const pyarr<uint32_t> &idx_, const pyarr<complex<T>> &vis_,
  py::object &grid_in, const py::object &wgt_)
  {
  checkArray(vis_, "vis", {0});
  size_t nvis = size_t(vis_.shape(0));
  checkArray(idx_, "idx", {nvis});
  auto vis=vis_.template unchecked<1>();
  auto idx = idx_.template unchecked<1>();
  pyarr<T> wgt2 = providePotentialArray<T>(wgt_, {nvis});
  bool have_wgt = wgt2.size()>0;
  auto wgt = wgt2.template unchecked<1>();

  size_t nu=gconf.Nu(), nv=gconf.Nv();
  auto res = provideCArray<complex<T>>(grid_in, {nu, nv});
  auto grid = res.mutable_data();
  {
  py::gil_scoped_release release;
  T beta = gconf.Beta();
  size_t w = gconf.W();

#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, nullptr, grid);
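  // kernel[] holds exp(beta*sqrt(1-x^2)); one factor of emb = exp(-2*beta)
  // per visibility turns the separable product ku[cu]*kv[cv] into the true
  // ES kernel exp(beta*(sqrt(1-x^2)-1)) in both u and v.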
  T emb = exp(-2*beta);
  int jump = hlp.lineJump();
  const T * ku = hlp.kernel.data();
  const T * kv = hlp.kernel.data()+w;

  // Loop over sampling points
#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    UVW<T> coord = baselines.effectiveCoord(idx(ipart));
    hlp.prep(coord.u, coord.v);
    auto * ptr = hlp.p0w;
    auto v(vis(ipart)*emb);
    if (have_wgt)
      v*=wgt(ipart);
    for (size_t cu=0; cu<w; ++cu)
      {
      complex<T> tmp(v*ku[cu]);
      for (size_t cv=0; cv<w; ++cv)
        ptr[cv] += tmp*kv[cv];
      ptr+=jump;
      }
    }
} // end of parallel region
  }
  return res;
  }

constexpr auto vis2grid_DS = R"""(
Grids visibilities onto a UV grid

Parameters
==========
baselines: Baselines
    the Baselines object
gconf: GridderConfig
    the GridderConfig object to be used
idx: np.array((nvis,), dtype=np.uint32)
    the indices for the entries to be gridded
vis: np.array((nvis,), dtype=np.complex)
    The visibility data for the index array
grid_in: np.array((nu,nv), dtype=np.float64), optional
    If present, the result is added to this array.
wgt: np.array((nvis,), dtype=np.float64), optional
    If present, visibilities are multiplied by the corresponding entries.

Returns
=======
np.array((nu,nv), dtype=np.float64):
    the gridded visibilities (made real by making use of Hermitian symmetry)
)""";
template<typename T> pyarr_c<T> vis2grid(const Baselines<T> &baselines,
  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
  const pyarr<complex<T>> &vis_, py::object &grid_in, const py::object &wgt_)
  { return complex2hartley(vis2grid_c(baselines, gconf, idx_, vis_, None, wgt_), grid_in); }

constexpr auto ms2grid_c_DS = R"""(
Grids measurement set data onto a UV grid

Parameters
==========
baselines: Baselines
    the Baselines object
gconf: GridderConfig
    the GridderConfig object to be used
idx: np.array((nvis,), dtype=np.uint32)
    the indices for the entries to be gridded
ms: np.array((nrows, nchannels), dtype=np.complex128)
    the measurement set.
grid_in: np.array((nu,nv), dtype=np.complex128), optional
    If present, the result is added to this array.
wgt: np.array((nrows, nchannels), dtype=np.float64), optional
    If present, visibilities are multiplied by the corresponding entries.

Returns
=======
np.array((nu,nv), dtype=np.complex128):
    the gridded visibilities
)""";
template<typename T> pyarr_c<complex<T>> ms2grid_c(
  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
  const pyarr<uint32_t> &idx_, const pyarr<complex<T>> &ms_,
  py::object &grid_in, const py::object &wgt_)
  {
  auto nrows = baselines.Nrows();
  auto nchan = baselines.Nchannels();
  checkArray(ms_, "ms", {nrows, nchan});
  checkArray(idx_, "idx", {0});
  size_t nvis = size_t(idx_.shape(0));
  auto ms = ms_.template unchecked<2>();
  auto idx = idx_.template unchecked<1>();
  auto wgt2 = providePotentialArray<T>(wgt_, {nrows, nchan});
  bool have_wgt = wgt2.size()>0;
  auto wgt = wgt2.template unchecked<2>();

  size_t nu=gconf.Nu(), nv=gconf.Nv();
  auto res = provideCArray<complex<T>>(grid_in, {nu, nv});
  auto grid = res.mutable_data();
  {
  py::gil_scoped_release release;
  T beta = gconf.Beta();
  size_t w = gconf.W();

#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, nullptr, grid);
  T emb = exp(-2*beta);
  int jump = hlp.lineJump();
  const T * ku = hlp.kernel.data();
  const T * kv = hlp.kernel.data()+w;

  // Loop over sampling points
#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    auto tidx = idx(ipart);
    auto row = tidx/nchan;
    auto chan = tidx-row*nchan;
    UVW<T> coord = baselines.effectiveCoord(tidx);
    hlp.prep(coord.u, coord.v);
    auto * ptr = hlp.p0w;
    auto v(ms(row,chan)*emb);
    if (have_wgt)
      v*=wgt(row, chan);
    for (size_t cu=0; cu<w; ++cu)
      {
      complex<T> tmp(v*ku[cu]);
      for (size_t cv=0; cv<w; ++cv)
        ptr[cv] += tmp*kv[cv];
      ptr+=jump;
      }
    }
} // end of parallel region
  }
  return res;
  }

template<typename T> pyarr_c<T> ms2grid(const Baselines<T> &baselines,
  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
  const pyarr<complex<T>> &ms_, py::object &grid_in, const py::object &wgt_)
  { return complex2hartley(ms2grid_c(baselines, gconf, idx_, ms_, None, wgt_), grid_in); }

template<typename T> pyarr_c<complex<T>> grid2vis_c(
  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
  const pyarr<uint32_t> &idx_, const pyarr_c<complex<T>> &grid_,
  const py::object &wgt_)
  {
  size_t nu=gconf.Nu(), nv=gconf.Nv();
  checkArray(idx_, "idx", {0});
  auto grid = grid_.data();
  checkArray(grid_, "grid", {nu, nv});
  size_t nvis = size_t(idx_.shape(0));
  auto idx = idx_.template unchecked<1>();
  auto wgt2 = providePotentialArray<T>(wgt_, {nvis});
  bool have_wgt = wgt2.size()>0;
  auto wgt = wgt2.template unchecked<1>();

  auto res = makeArray<complex<T>>({nvis});
  auto vis = res.mutable_data();
  {
  py::gil_scoped_release release;
  T beta = gconf.Beta();
  size_t w = gconf.W();

  // Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, grid, nullptr);
  T emb = exp(-2*beta);
  int jump = hlp.lineJump();
  const T * ku = hlp.kernel.data();
  const T * kv = hlp.kernel.data()+w;

#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    UVW<T> coord = baselines.effectiveCoord(idx(ipart));
    hlp.prep(coord.u, coord.v);
    complex<T> r = 0;
    const auto * ptr = hlp.p0r;
    for (size_t cu=0; cu<w; ++cu)
      {
      complex<T> tmp(0);
      for (size_t cv=0; cv<w; ++cv)
        tmp += ptr[cv] * kv[cv];
      r += tmp*ku[cu];
      ptr += jump;
      }
    if (have_wgt) r*=wgt(ipart);
    vis[ipart] = r*emb;
    }
}
  }
  return res;
  }

constexpr auto grid2vis_DS = R"""(
Degrids visibilities from a UV grid

Parameters
==========
baselines: Baselines
    the Baselines object
gconf: GridderConfig
    the GridderConfig object to be used
idx: np.array((nvis,), dtype=np.uint32)
    the indices for the entries to be degridded
grid: np.array((nu,nv), dtype=np.float64)
    the gridded visibilities (made real by making use of Hermitian symmetry)
wgt: np.array((nvis,), dtype=np.float64), optional
    If present, visibilities are multiplied by the corresponding entries.

Returns
=======
np.array((nvis,), dtype=np.complex)
    The degridded visibility data
)""";
template<typename T> pyarr_c<complex<T>> grid2vis(const Baselines<T> &baselines,
  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
  const pyarr_c<T> &grid_, const py::object &wgt_)
  { return grid2vis_c(baselines, gconf, idx_, hartley2complex(grid_), wgt_); }

template<typename T> pyarr_c<complex<T>> grid2ms_c(const Baselines<T> &baselines,
  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
  const pyarr_c<complex<T>> &grid_, py::object &ms_in, const py::object &wgt_)
  {
  auto nrows = baselines.Nrows();
  auto nchan = baselines.Nchannels();
  size_t nu=gconf.Nu(), nv=gconf.Nv();
  checkArray(idx_, "idx", {0});
  auto grid = grid_.data();
  checkArray(grid_, "grid", {nu, nv});
  size_t nvis = size_t(idx_.shape(0));
  auto idx = idx_.template unchecked<1>();
  auto wgt2 = providePotentialArray<T>(wgt_, {nrows, nchan});
  bool have_wgt = wgt2.size()>0;
  auto wgt = wgt2.template unchecked<2>();

  auto res = provideArray<complex<T>>(ms_in, {nrows, nchan});
  auto ms = res.template mutable_unchecked<2>();
  {
  py::gil_scoped_release release;
  T beta = gconf.Beta();
  size_t w = gconf.W();

  // Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, grid, nullptr);
  T emb = exp(-2*beta);
  int jump = hlp.lineJump();
  const T * ku = hlp.kernel.data();
  const T * kv = hlp.kernel.data()+w;

#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    auto tidx = idx(ipart);
    auto row = tidx/nchan;
    auto chan = tidx-row*nchan;
    UVW<T> coord = baselines.effectiveCoord(tidx);
    hlp.prep(coord.u, coord.v);
    complex<T> r = 0;
    const auto * ptr = hlp.p0r;
    for (size_t cu=0; cu<w; ++cu)
      {
      complex<T> tmp(0);
      for (size_t cv=0; cv<w; ++cv)
        tmp += ptr[cv] * kv[cv];
      r += tmp*ku[cu];
      ptr += jump;
      }
    r*=emb;
    if (have_wgt)
      r*=wgt(row, chan);
    ms(row,chan) += r;
    }
}
  }
  return res;
  }

template<typename T> pyarr_c<complex<T>> grid2ms(const Baselines<T> &baselines,
  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
  const pyarr_c<T> &grid_, py::object &ms_in, const py::object &wgt_)
  { return grid2ms_c(baselines, gconf, idx_, hartley2complex(grid_), ms_in, wgt_); }

template<typename T> pyarr_c<complex<T>> apply_holo(
  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
  const pyarr<uint32_t> &idx_, const pyarr_c<complex<T>> &grid_,
  const py::object &wgt_)
  {
  size_t nu=gconf.Nu(), nv=gconf.Nv();
  checkArray(idx_, "idx", {0});
  auto grid = grid_.data();
  checkArray(grid_, "grid", {nu, nv});
  size_t nvis = size_t(idx_.shape(0));
  auto idx = idx_.template unchecked<1>();
  pyarr<T> wgt2 = providePotentialArray<T>(wgt_, {nvis});
  bool have_wgt = wgt2.size()>0;
  auto wgt = wgt2.template unchecked<1>();
  auto res = makeArray<complex<T>>({nu, nv});
  auto ogrid = res.mutable_data();
  {
  py::gil_scoped_release release;

  for (size_t i=0; i<nu*nv; ++i) ogrid[i] = T(0);
  T beta = gconf.Beta();
  size_t w = gconf.W();

  // Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T> hlp(gconf, grid, ogrid);
  T emb = exp(-2*beta);
  int jump = hlp.lineJump();
  const T * ku = hlp.kernel.data();
  const T * kv = hlp.kernel.data()+w;

#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    auto tidx = idx(ipart);
    UVW<T> coord = baselines.effectiveCoord(tidx);
    hlp.prep(coord.u, coord.v);
    complex<T> r = 0;
    const auto * ptr = hlp.p0r;
    for (size_t cu=0; cu<w; ++cu)
      {
      complex<T> tmp(0);
      for (size_t cv=0; cv<w; ++cv)
        tmp += ptr[cv] * kv[cv];
      r += tmp*ku[cu];
      ptr += jump;
      }
    r*=emb*emb;
    if (have_wgt)
      {
      auto twgt = wgt(ipart);
      r*=twgt*twgt;
      }
    auto * wptr = hlp.p0w;
    for (size_t cu=0; cu<w; ++cu)
      {
      complex<T> tmp(r*ku[cu]);
      for (size_t cv=0; cv<w; ++cv)
        wptr[cv] += tmp*kv[cv];
      wptr += jump;
      }
    }
}
  }
  return res;
  }

template<typename T> pyarr_c<T> get_correlations(
  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
  const pyarr<uint32_t> &idx_, int du, int dv, const py::object &wgt_)
  {
  size_t nu=gconf.Nu(), nv=gconf.Nv();
  size_t w = gconf.W();
  myassert(size_t(abs(du))<w, "|du| must be smaller than W");
  myassert(size_t(abs(dv))<w, "|dv| must be smaller than W");
  checkArray(idx_, "idx", {0});
  size_t nvis = size_t(idx_.shape(0));
  auto idx = idx_.template unchecked<1>();
  pyarr<T> wgt2 = providePotentialArray<T>(wgt_, {nvis});
  bool have_wgt = wgt2.size()>0;
  auto wgt = wgt2.template unchecked<1>();

  auto res = makeArray<T>({nu, nv});
  auto ogrid = res.mutable_data();
  {
  py::gil_scoped_release release;
  T beta = gconf.Beta();
  for (size_t i=0; i<nu*nv; ++i) ogrid[i] = 0.;

  size_t u0, u1, v0, v1;
  if (du>=0)
    { u0=0; u1=w-du; }
  else
    { u0=-du; u1=w; }
  if (dv>=0)
    { v0=0; v1=w-dv; }
  else
    { v0=-dv; v1=w; }

  // Loop over sampling points
#pragma omp parallel num_threads(nthreads)
{
  Helper<T,T> hlp(gconf, nullptr, ogrid);
  T emb = exp(-2*beta);
  int jump = hlp.lineJump();
  const T * ku = hlp.kernel.data();
  const T * kv = hlp.kernel.data()+w;

#pragma omp for schedule(guided,100)
  for (size_t ipart=0; ipart<nvis; ++ipart)
    {
    UVW<T> coord = baselines.effectiveCoord(idx(ipart));
    hlp.prep(coord.u, coord.v);
    auto * wptr = hlp.p0w + u0*jump;
    auto f0 = emb*emb;
    if (have_wgt)
      {
      auto twgt = wgt(ipart);
      f0*=twgt*twgt;
      }
    for (size_t cu=u0; cu<u1; ++cu)
      {
      auto f1=ku[cu]*ku[cu+du]*f0;
      for (size_t cv=v0; cv<v1; ++cv)
        wptr[cv] += f1*kv[cv]*kv[cv+dv];
      wptr += jump;
      }
    }
}
  }
  return res;
  }

constexpr auto getIndices_DS = R"""(
Selects a subset of entries from a `Baselines` object.

Parameters
==========
baselines: Baselines
    the Baselines object
gconf: GridderConfig
    the GridderConfig object to be used with the returned indices
    (used to optimize the ordering of the indices)
flags: np.array((nrows, nchannels), dtype=np.bool)
    "True" indicates that the value should not be used
chbegin: int
    first channel to use (-1: start with the first available channel)
chend: int
    one past the last channel to use (-1: one past the last available channel)
wmin: float
    only select entries with w>=wmin
wmax: float
    only select entries with w<wmax

Returns
=======
np.array((nvis,), dtype=np.uint32)
    the compressed indices for all entries which match the selected criteria
    and are not flagged.
)""";
template<typename T> pyarr_c<uint32_t> getIndices(const Baselines<T> &baselines,
  const GridderConfig<T> &gconf, const pyarr_c<bool> &flags_, int chbegin,
  int chend, T wmin, T wmax)
  {
  size_t nrow=baselines.Nrows(),
         nchan=baselines.Nchannels(),
         nsafe=gconf.Nsafe();
  if (chbegin<0) chbegin=0;
  if (chend<0) chend=nchan;
  myassert(chend>chbegin, "empty channel range selected");
  myassert(chend<=int(nchan), "chend too large");
  myassert(wmax>wmin, "empty w range selected");
  checkArray(flags_, "flags", {nrow, nchan});
  auto flags = flags_.data();
  constexpr int side=1<<logsquare;
  size_t nbu = (gconf.Nu()+1+side-1) >> logsquare,
         nbv = (gconf.Nv()+1+side-1) >> logsquare;
  vector<uint32_t> acc(nbu*nbv+1, 0);
  vector<uint32_t> tmp(nrow*(chend-chbegin));
  {
  py::gil_scoped_release release;
  for (size_t irow=0, idx=0; irow<nrow; ++irow)
    for (int ichan=chbegin; ichan<chend; ++ichan)
      if (!flags[irow*nchan+ichan])
        {
        auto uvw = baselines.effectiveCoord(irow, ichan);
        if ((uvw.w>=wmin) && (uvw.w<wmax))
          {
          T u, v;
          int iu0, iv0;
          gconf.getpix(uvw.u, uvw.v, u, v, iu0, iv0);
          iu0 = (iu0+nsafe)>>logsquare;
          iv0 = (iv0+nsafe)>>logsquare;
          ++acc[nbv*iu0 + iv0 + 1];
          tmp[idx++] = nbv*iu0 + iv0;
          }
        }

  for (size_t i=1; i<acc.size(); ++i)
    acc[i] += acc[i-1];
  }
  auto res = makeArray<uint32_t>({acc.back()});
  auto iout = res.mutable_data();
  {
  py::gil_scoped_release release;
  for (size_t irow=0, idx=0; irow<nrow; ++irow)
    for (int ichan=chbegin; ichan<chend; ++ichan)
      if (!flags[irow*nchan+ichan])
        {
        auto uvw = baselines.effectiveCoord(irow, ichan);
        if ((uvw.w>=wmin) && (uvw.w<wmax))
          iout[acc[tmp[idx++]]++] = irow*nchan+ichan;
        }
  }
  return res;
  }

} // unnamed namespace

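/* End-to-end usage sketch from Python (illustrative only; the arrays
   coord, freq, flags and ms are hypothetical inputs of the shapes given
   in the docstrings above):

     import nifty_gridder as ng

     bl = ng.Baselines(coord, freq)
     gc = ng.GridderConfig(nxdirty=128, nydirty=128, epsilon=1e-7,
                           pixsize_x=1e-5, pixsize_y=1e-5)
     idx = ng.getIndices(bl, gc, flags)
     vis = bl.ms2vis(ms, idx)
     grid = ng.vis2grid(bl, gc, idx, vis)
     dirty = gc.grid2dirty(grid)
*/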
PYBIND11_MODULE(nifty_gridder, m)
  {
  using namespace pybind11::literals;
  py::class_<Baselines<double>> (m, "Baselines", Baselines_DS)
    .def(py::init<const pyarr_c<double> &, const pyarr_c<double> &>(),
      "coord"_a, "freq"_a)
    .def ("Nrows",&Baselines<double>::Nrows)
    .def ("Nchannels",&Baselines<double>::Nchannels)
    .def ("ms2vis",&Baselines<double>::ms2vis<complex<double>>,
      Baselines<double>::ms2vis_DS, "ms"_a, "idx"_a)
    .def ("effectiveuvw",&Baselines<double>::effectiveuvw, "idx"_a)
    .def ("vis2ms",&Baselines<double>::vis2ms<complex<double>>,
      Baselines<double>::vis2ms_DS, "vis"_a, "idx"_a, "ms_in"_a=None);
  py::class_<GridderConfig<double>> (m, "GridderConfig", GridderConfig_DS)
    .def(py::init<size_t, size_t, double, double, double>(),"nxdirty"_a,
      "nydirty"_a, "epsilon"_a, "pixsize_x"_a, "pixsize_y"_a)
    .def("Nxdirty", &GridderConfig<double>::Nxdirty)
    .def("Nydirty", &GridderConfig<double>::Nydirty)
    .def("Epsilon", &GridderConfig<double>::Epsilon)
    .def("Pixsize_x", &GridderConfig<double>::Pixsize_x)
    .def("Pixsize_y", &GridderConfig<double>::Pixsize_y)
    .def("Nu", &GridderConfig<double>::Nu)
    .def("Nv", &GridderConfig<double>::Nv)
    .def("W", &GridderConfig<double>::W)
    .def("apply_taper", &GridderConfig<double>::apply_taper, apply_taper_DS,
      "img"_a, "divide"_a=false)
    .def("grid2dirty", &GridderConfig<double>::grid2dirty,
      grid2dirty_DS, "grid"_a)
    .def("grid2dirty_c", &GridderConfig<double>::grid2dirty_c, "grid"_a)
    .def("dirty2grid", &GridderConfig<double>::dirty2grid,
      dirty2grid_DS, "dirty"_a)
    .def("dirty2grid_c", &GridderConfig<double>::dirty2grid_c, "dirty"_a)
    .def("apply_wscreen", &GridderConfig<double>::apply_wscreen,
      apply_wscreen_DS, "dirty"_a, "w"_a, "adjoint"_a)

    // pickle support
    .def(py::pickle(
        // __getstate__
        [](const GridderConfig<double> & gc) {
          // Encode object state in tuple
          return py::make_tuple(gc.Nxdirty(), gc.Nydirty(), gc.Epsilon(),
                                gc.Pixsize_x(), gc.Pixsize_y());
        },
        // __setstate__
        [](py::tuple t) {
          myassert(t.size()==5,"Invalid state");

          // Reconstruct from tuple
          return GridderConfig<double>(t[0].cast<size_t>(), t[1].cast<size_t>(),
                                       t[2].cast<double>(), t[3].cast<double>(),
                                       t[4].cast<double>());

        }));
  m.def("getIndices", getIndices<double>, getIndices_DS, "baselines"_a,
    "gconf"_a, "flags"_a, "chbegin"_a=-1, "chend"_a=-1,
    "wmin"_a=-1e30, "wmax"_a=1e30);
  m.def("vis2grid",&vis2grid<double>, vis2grid_DS, "baselines"_a, "gconf"_a,
    "idx"_a, "vis"_a, "grid_in"_a=None, "wgt"_a=None);
  m.def("ms2grid",&ms2grid<double>, "baselines"_a, "gconf"_a, "idx"_a, "ms"_a,
    "grid_in"_a=None, "wgt"_a=None);
  m.def("grid2vis",&grid2vis<double>, grid2vis_DS, "baselines"_a, "gconf"_a,
    "idx"_a, "grid"_a, "wgt"_a=None);
  m.def("grid2ms",&grid2ms<double>, "baselines"_a, "gconf"_a, "idx"_a,
    "grid"_a, "ms_in"_a=None, "wgt"_a=None);
  m.def("vis2grid_c",&vis2grid_c<double>, vis2grid_c_DS, "baselines"_a,
    "gconf"_a, "idx"_a, "vis"_a, "grid_in"_a=None, "wgt"_a=None);
  m.def("ms2grid_c",&ms2grid_c<double>, ms2grid_c_DS, "baselines"_a, "gconf"_a,
    "idx"_a, "ms"_a, "grid_in"_a=None, "wgt"_a=None);
  m.def("grid2vis_c",&grid2vis_c<double>, "baselines"_a, "gconf"_a, "idx"_a,
    "grid"_a, "wgt"_a=None);
  m.def("grid2ms_c",&grid2ms_c<double>, "baselines"_a, "gconf"_a, "idx"_a,
    "grid"_a, "ms_in"_a=None, "wgt"_a=None);
  m.def("apply_holo",&apply_holo<double>, "baselines"_a, "gconf"_a, "idx"_a,
    "grid"_a, "wgt"_a=None);
  m.def("get_correlations", &get_correlations<double>, "baselines"_a, "gconf"_a,
    "idx"_a, "du"_a, "dv"_a, "wgt"_a=None);
  m.def("set_nthreads",&set_nthreads, set_nthreads_DS, "nthreads"_a);
  m.def("get_nthreads",&get_nthreads, get_nthreads_DS);
  }