diff --git a/gridder_cxx.h b/gridder_cxx.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ddec362398c2660436af5782efdf328da2a50b6
--- /dev/null
+++ b/gridder_cxx.h
@@ -0,0 +1,778 @@
+#ifndef GRIDDER_CXX_H
+#define GRIDDER_CXX_H
+
+/*
+ *  This file is part of nifty_gridder.
+ *
+ *  nifty_gridder is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  nifty_gridder is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with nifty_gridder; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Copyright (C) 2019 Max-Planck-Society
+   Author: Martin Reinecke */
+
+#include <iostream>
+#include <algorithm>
+#include <cstdlib>
+#include <cmath>
+#include <vector>
+
+#include "pocketfft_hdronly.h"
+#include <array>
+
+template<typename T, size_t ndim, bool c_contiguous> class mav
+  {
+  static_assert((ndim>0) && (ndim<3), "only supports 1D and 2D arrays");
+
+  private:
+    T *d;
+    std::array<size_t, ndim> shp;
+    std::array<ptrdiff_t, ndim> str;
+
+  public:
+    mav(T *d_, const std::array<size_t,ndim> &shp_,
+        const std::array<ptrdiff_t,ndim> &str_)
+      : d(d_), shp(shp_), str(str_)
+      { static_assert(!c_contiguous, "this type does not accept strides"); }
+    mav(T *d_, const std::array<size_t,ndim> &shp_)
+      : d(d_), shp(shp_)
+      {
+      static_assert(c_contiguous, "this type requires strides");
+      str[ndim-1]=1;
+      for (size_t d=2; d<=ndim; ++d)
+        str[ndim-d] = str[ndim-d+1]*shp[ndim-d+1];
+      }
+    operator mav<const T, ndim, true>() const
+      {
+      static_assert(c_contiguous);
+      return mav<const T, ndim, true>(d,shp);
+      }
+    operator mav<const T, ndim, false>() const
+      { return mav<const T, ndim, false>(d,shp, str); }
+    T &operator[](size_t i)
+      { return operator()(i); }
+    const T &operator[](size_t i) const
+      { return operator()(i); }
+    T &operator()(size_t i)
+      {
+      static_assert(ndim==1, "ndim must be 1");
+      return c_contiguous ? d[i] : d[str[0]*i];
+      }
+    const T &operator()(size_t i) const
+      {
+      static_assert(ndim==1, "ndim must be 1");
+      return c_contiguous ? d[i] : d[str[0]*i];
+      }
+    T &operator()(size_t i, size_t j)
+      {
+      static_assert(ndim==2, "ndim must be 2");
+      return c_contiguous ? d[str[0]*i + j] : d[str[0]*i + str[1]*j];
+      }
+    const T &operator()(size_t i, size_t j) const
+      {
+      static_assert(ndim==2, "ndim must be 2");
+      return c_contiguous ? d[str[0]*i + j] : d[str[0]*i + str[1]*j];
+      }
+    size_t shape(size_t i) const { return shp[i]; }
+    const std::array<size_t,ndim> &shape() const { return shp; }
+    size_t size() const
+      {
+      size_t res=1;
+      for (auto v: shp) res*=v;
+      return res;
+      }
+    size_t stride(size_t i) const { return str[i]; }
+    T *data()
+      {
+//      static_assert(c_contiguous, "type is not C contiguous");
+      return d;
+      }
+    const T *data() const
+      {
+//      static_assert(c_contiguous, "type is not C contiguous");
+      return d;
+      }
+  };
+
+template<typename T, size_t ndim> using cmav = mav<T,ndim,true>;
+template<typename T, size_t ndim> using smav = mav<T,ndim,false>;
+
+//
+// basic utilities
+//
+
+void myassert(bool cond, const char *msg)
+  {
+  if (cond) return;
+  throw std::runtime_error(msg);
+  }
+
+template<size_t ndim> void checkShape
+  (const std::array<size_t, ndim> &shp1, const std::array<size_t, ndim> &shp2)
+  {
+  for (size_t i=0; i<ndim; ++i)
+    myassert(shp1[i]==shp2[i], "shape mismatch");
+  }
+
+/*! Returns the remainder of the division \a v1/v2.
+    The result is non-negative.
+    \a v1 can be positive or negative; \a v2 must be positive. */
+template<typename T> inline T fmodulo (T v1, T v2)
+  {
+  if (v1>=0)
+    return (v1<v2) ? v1 : fmod(v1,v2);
+  T tmp=fmod(v1,v2)+v2;
+  return (tmp==v2) ? T(0) : tmp;
+  }
+
+static size_t nthreads = 1;
+
+constexpr auto set_nthreads_DS = R"""(
+Specifies the number of threads to be used by the module
+
+Parameters
+==========
+nthreads: int
+    the number of threads to be used. Must be >=1.
+)""";
+void set_nthreads(size_t nthreads_)
+  {
+  myassert(nthreads_>=1, "nthreads must be >= 1");
+  nthreads = nthreads_;
+  }
+
+constexpr auto get_nthreads_DS = R"""(
+Returns the number of threads used by the module
+
+Returns
+=======
+int : the number of threads used by the module
+)""";
+size_t get_nthreads()
+  { return nthreads; }
+
+//
+// Utilities for Gauss-Legendre quadrature
+//
+
+static inline double one_minus_x2 (double x)
+  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
+
+void legendre_prep(int n, std::vector<double> &x, std::vector<double> &w)
+  {
+  constexpr double pi = 3.141592653589793238462643383279502884197;
+  constexpr double eps = 3e-14;
+  int m = (n+1)>>1;
+  x.resize(m);
+  w.resize(m);
+
+  double t0 = 1 - (1-1./n) / (8.*n*n);
+  double t1 = 1./(4.*n+2.);
+
+#pragma omp parallel num_threads(nthreads)
+{
+  int i;
+#pragma omp for schedule(dynamic,100)
+  for (i=1; i<=m; ++i)
+    {
+    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
+
+    int dobreak=0;
+    int j=0;
+    double dpdx;
+    while(1)
+      {
+      double P_1 = 1.0;
+      double P0 = x0;
+      double dx, x1;
+
+      for (int k=2; k<=n; k++)
+        {
+        double P_2 = P_1;
+        P_1 = P0;
+//        P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
+        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
+        }
+
+      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
+
+      /* Newton step */
+      x1 = x0 - P0/dpdx;
+      dx = x0-x1;
+      x0 = x1;
+      if (dobreak) break;
+
+      if (abs(dx)<=eps) dobreak=1;
+      myassert(++j<100, "convergence problem");
+      }
+
+    x[m-i] = x0;
+    w[m-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
+    }
+} // end of parallel region
+  }
+
+//
+// Start of real gridder functionality
+//
+
+size_t get_supp(double epsilon)
+  {
+  static const std::vector<double> maxmaperr { 1e8, 0.32, 0.021, 6.2e-4,
+    1.08e-5, 1.25e-7, 8.25e-10, 5.70e-12, 1.22e-13, 2.48e-15, 4.82e-17,
+    6.74e-19, 5.41e-21, 4.41e-23, 7.88e-25, 3.9e-26 };
+
+  double epssq = epsilon*epsilon;
+
+  for (size_t i=1; i<maxmaperr.size(); ++i)
+    if (epssq>maxmaperr[i]) return i;
+  throw std::runtime_error("requested epsilon too small - minimum is 2e-13");
+  }
+
+template<typename T> void complex2hartley
+  (const smav<const std::complex<T>, 2> &grid, smav<T,2> &grid2)
+  {
+  myassert(grid.shape()==grid2.shape(), "shape mismatch");
+  size_t nu=grid.shape(0), nv=grid.shape(1);
+
+#pragma omp parallel for num_threads(nthreads)
+  for (size_t u=0; u<nu; ++u)
+    {
+    size_t xu = (u==0) ? 0 : nu-u;
+    for (size_t v=0; v<nv; ++v)
+      {
+      size_t xv = (v==0) ? 0 : nv-v;
+      grid2(u,v) += T(0.5)*(grid( u, v).real()+grid( u, v).imag()+
+                            grid(xu,xv).real()-grid(xu,xv).imag());
+      }
+    }
+  }
+
+template<typename T> void hartley2complex
+  (const smav<const T,2> &grid, smav<std::complex<T>,2> &grid2)
+  {
+  myassert(grid.shape()==grid2.shape(), "shape mismatch");
+  size_t nu=grid.shape(0), nv=grid.shape(1);
+
+#pragma omp parallel for num_threads(nthreads)
+  for (size_t u=0; u<nu; ++u)
+    {
+    size_t xu = (u==0) ? 0 : nu-u;
+    for (size_t v=0; v<nv; ++v)
+      {
+      size_t xv = (v==0) ? 0 : nv-v;
+      T v1 = T(0.5)*grid( u, v);
+      T v2 = T(0.5)*grid(xu,xv);
+      grid2(u,v) = std::complex<T>(v1+v2, v1-v2);
+      }
+    }
+  }
+
+template<typename T> void hartley2_2D(const smav<const T,2> &in, smav<T,2> &out)
+  {
+  myassert(in.shape()==out.shape(), "shape mismatch");
+  size_t nu=in.shape(0), nv=in.shape(1), sz=sizeof(T);
+  pocketfft::stride_t str{ptrdiff_t(sz*nv), ptrdiff_t(sz)};
+  auto d_i = in.data();
+  auto ptmp = out.data();
+  pocketfft::r2r_separable_hartley({nu, nv}, str, str, {0,1}, d_i, ptmp, T(1),
+    nthreads);
+#pragma omp parallel for num_threads(nthreads)
+  for(size_t i=1; i<(nu+1)/2; ++i)
+    for(size_t j=1; j<(nv+1)/2; ++j)
+       {
+       T a = ptmp[i*nv+j];
+       T b = ptmp[(nu-i)*nv+j];
+       T c = ptmp[i*nv+nv-j];
+       T d = ptmp[(nu-i)*nv+nv-j];
+       ptmp[i*nv+j] = T(0.5)*(a+b+c-d);
+       ptmp[(nu-i)*nv+j] = T(0.5)*(a+b+d-c);
+       ptmp[i*nv+nv-j] = T(0.5)*(a+c+d-b);
+       ptmp[(nu-i)*nv+nv-j] = T(0.5)*(b+c+d-a);
+       }
+  }
+
+template<typename T> class EC_Kernel
+  {
+  protected:
+    size_t supp;
+    T beta, emb_;
+
+  public:
+    EC_Kernel(size_t supp_)
+      : supp(supp_), beta(2.3*supp_), emb_(exp(-beta)) {}
+    T operator()(T v) const { return exp(beta*(sqrt(T(1)-v*v)-T(1))); }
+    T val_no_emb(T v) const { return exp(beta*sqrt(T(1)-v*v)); }
+    T emb() const { return emb_; }
+  };
+
+template<typename T> class EC_Kernel_with_correction: public EC_Kernel<T>
+  {
+  protected:
+    static constexpr T pi = T(3.141592653589793238462643383279502884197L);
+    int p;
+    std::vector<T> x, wgt, psi;
+    using EC_Kernel<T>::supp;
+
+  public:
+    using EC_Kernel<T>::operator();
+    EC_Kernel_with_correction(size_t supp_)
+      : EC_Kernel<T>(supp_), p(int(1.5*supp_+2))
+      {
+      legendre_prep(2*p,x,wgt);
+      psi=x;
+      for (auto &v:psi)
+        v=operator()(v);
+      }
+    /* Compute correction factors for the ES gridding kernel
+       This implementation follows eqs. (3.8) to (3.10) of Barnett et al. 2018 */
+    T corfac(T v) const
+      {
+      T tmp=0;
+      for (int i=0; i<p; ++i)
+        tmp += wgt[i]*psi[i]*cos(pi*supp*v*x[i]);
+      return T(1)/(supp*tmp);
+      }
+  };
+/* Compute correction factors for the ES gridding kernel
+   This implementation follows eqs. (3.8) to (3.10) of Barnett et al. 2018 */
+std::vector<double> correction_factors (size_t n, size_t nval, size_t supp)
+  {
+  EC_Kernel_with_correction<double> kernel(supp);
+  std::vector<double> res(nval);
+  double xn = 1./n;
+#pragma omp parallel for schedule(static) num_threads(nthreads)
+  for (size_t k=0; k<nval; ++k)
+    res[k] = kernel.corfac(k*xn);
+  return res;
+  }
+
+template<typename T> struct UVW
+  {
+  T u, v, w;
+  UVW () {}
+  UVW (T u_, T v_, T w_) : u(u_), v(v_), w(w_) {}
+  UVW operator* (T fct) const
+    { return UVW(u*fct, v*fct, w*fct); }
+  };
+
+template<typename T> class Baselines
+  {
+  protected:
+    std::vector<UVW<T>> coord;
+    std::vector<T> f_over_c;
+    size_t nrows, nchan;
+
+  public:
+    Baselines(const smav<const T,2> &coord_, const smav<const T,1> &freq)
+      {
+      constexpr double speedOfLight = 299792458.;
+      myassert(coord_.shape(1)==3, "dimension mismatch");
+      nrows = coord_.shape(0);
+      nchan = freq.shape(0);
+      myassert(nrows*nchan<(size_t(1)<<32), "too many entries in MS");
+      f_over_c.resize(nchan);
+      for (size_t i=0; i<nchan; ++i)
+        f_over_c[i] = freq(i)/speedOfLight;
+      coord.resize(nrows);
+      for (size_t i=0; i<coord.size(); ++i)
+        coord[i] = UVW<T>(coord_(i,0), coord_(i,1), coord_(i,2));
+      }
+
+    UVW<T> effectiveCoord(uint32_t index) const
+      {
+      size_t irow = index/nchan;
+      size_t ichan = index-nchan*irow;
+      return coord[irow]*f_over_c[ichan];
+      }
+    UVW<T> effectiveCoord(size_t irow, size_t ichan) const
+      { return coord[irow]*f_over_c[ichan]; }
+    size_t Nrows() const { return nrows; }
+    size_t Nchannels() const { return nchan; }
+
+    void effectiveUVW(const smav<const uint32_t,1> &idx, smav<T,2> &res) const
+      {
+      size_t nvis = idx.shape(0);
+      myassert(res.shape(0)==nvis, "shape mismatch");
+      myassert(res.shape(1)==3, "shape mismatch");
+      for (size_t i=0; i<nvis; i++)
+        {
+        auto uvw = effectiveCoord(idx(i));
+        res(i,0) = uvw.u;
+        res(i,1) = uvw.v;
+        res(i,2) = uvw.w;
+        }
+      }
+
+    template<typename T2> void ms2vis(const smav<const T2,2> &ms,
+      const smav<const uint32_t,1> &idx, smav<T2,1> &vis) const
+      {
+      myassert(ms.shape(0)==nrows, "shape mismatch");
+      myassert(ms.shape(1)==nchan, "shape mismatch");
+      size_t nvis = idx.shape(0);
+      myassert(vis.shape(0)==nvis, "shape mismatch");
+#pragma omp parallel for num_threads(nthreads)
+      for (size_t i=0; i<nvis; ++i)
+        {
+        auto t = idx(i);
+        auto row = t/nchan;
+        auto chan = t-row*nchan;
+        vis[i] = ms(row, chan);
+        }
+      }
+
+    template<typename T2> void vis2ms(const smav<const T2,1> &vis,
+      const smav<const uint32_t,1> &idx, smav<T2,2> &ms) const
+      {
+      size_t nvis = vis.shape(0);
+      myassert(idx.shape(0)==nvis, "shape mismatch");
+      myassert(ms.shape(0)==nrows, "shape mismatch");
+      myassert(ms.shape(1)==nchan, "shape mismatch");
+#pragma omp parallel for num_threads(nthreads)
+      for (size_t i=0; i<nvis; ++i)
+        {
+        auto t = idx(i);
+        auto row = t/nchan;
+        auto chan = t-row*nchan;
+        ms(row, chan) += vis(i);
+        }
+      }
+  };
+
+template<typename T> class GridderConfig
+  {
+  protected:
+    size_t nx_dirty, ny_dirty;
+    double eps, psx, psy;
+    size_t supp, nsafe, nu, nv;
+    T beta;
+    std::vector<T> cfu, cfv;
+
+    std::complex<T> wscreen(double x, double y, double w, bool adjoint) const
+      {
+      using namespace std;
+      constexpr double pi = 3.141592653589793238462643383279502884197;
+#if 1
+      double eps = sqrt(x+y);
+      double s = sin(eps);
+      double nm1 = -s*s/(1.+cos(eps));
+#else
+      double nm1 = (-x-y)/(sqrt(1.-x-y)+1);
+#endif
+      double n = nm1+1., xn = 1./n;
+      double phase = 2*pi*w*nm1;
+      if (adjoint) phase *= -1;
+      return complex<T>(cos(phase)*xn, sin(phase)*xn);
+      }
+
+  public:
+    GridderConfig(size_t nxdirty, size_t nydirty, double epsilon,
+      double pixsize_x, double pixsize_y)
+      : nx_dirty(nxdirty), ny_dirty(nydirty), eps(epsilon),
+        psx(pixsize_x), psy(pixsize_y),
+        supp(get_supp(epsilon)), nsafe((supp+1)/2),
+        nu(std::max(2*nsafe,2*nx_dirty)), nv(std::max(2*nsafe,2*ny_dirty)),
+        beta(2.3*supp),
+        cfu(nx_dirty), cfv(ny_dirty)
+      {
+      myassert((nx_dirty&1)==0, "nx_dirty must be even");
+      myassert((ny_dirty&1)==0, "ny_dirty must be even");
+      myassert(epsilon>0, "epsilon must be positive");
+      myassert(pixsize_x>0, "pixsize_x must be positive");
+      myassert(pixsize_y>0, "pixsize_y must be positive");
+
+      auto tmp = correction_factors(nu, nx_dirty/2+1, supp);
+      cfu[nx_dirty/2]=tmp[0];
+      cfu[0]=tmp[nx_dirty/2];
+      for (size_t i=1; i<nx_dirty/2; ++i)
+        cfu[nx_dirty/2-i] = cfu[nx_dirty/2+i] = tmp[i];
+      tmp = correction_factors(nv, ny_dirty/2+1, supp);
+      cfv[ny_dirty/2]=tmp[0];
+      cfv[0]=tmp[ny_dirty/2];
+      for (size_t i=1; i<ny_dirty/2; ++i)
+        cfv[ny_dirty/2-i] = cfv[ny_dirty/2+i] = tmp[i];
+      }
+    size_t Nxdirty() const { return nx_dirty; }
+    size_t Nydirty() const { return ny_dirty; }
+    double Epsilon() const { return eps; }
+    double Pixsize_x() const { return psx; }
+    double Pixsize_y() const { return psy; }
+    size_t Nu() const { return nu; }
+    size_t Nv() const { return nv; }
+    size_t Supp() const { return supp; }
+    size_t Nsafe() const { return nsafe; }
+    T Beta() const { return beta; }
+
+    void grid2dirty(const smav<const T,2> &grid, smav<T,2> &grid2) const
+      {
+      checkShape(grid.shape(), {nu,nv});
+      std::vector<T> tmpdat(nu*nv);
+      auto tmp=smav<T,2>(tmpdat.data(),{nu,nv},{nv,1});
+      hartley2_2D<T>(grid, tmp);
+      checkShape(grid2.shape(), {nx_dirty, ny_dirty});
+      for (size_t i=0; i<nx_dirty; ++i)
+        for (size_t j=0; j<ny_dirty; ++j)
+          {
+          size_t i2 = nu-nx_dirty/2+i;
+          if (i2>=nu) i2-=nu;
+          size_t j2 = nv-ny_dirty/2+j;
+          if (j2>=nv) j2-=nv;
+          grid2(i,j) = tmp(i2,j2)*cfu[i]*cfv[j];
+          }
+      }
+#if 0
+    pyarr_c<T> apply_taper(const pyarr_c<T> &img, bool divide) const
+      {
+      checkArray(img, "img", {nx_dirty, ny_dirty});
+      auto pin = img.data();
+      auto res = makeArray<T>({nx_dirty, ny_dirty});
+      auto pout = res.mutable_data();
+      {
+      py::gil_scoped_release release;
+      if (divide)
+        for (size_t i=0; i<nx_dirty; ++i)
+          for (size_t j=0; j<ny_dirty; ++j)
+            pout[ny_dirty*i + j] = pin[ny_dirty*i + j]/(cfu[i]*cfv[j]);
+      else
+        for (size_t i=0; i<nx_dirty; ++i)
+          for (size_t j=0; j<ny_dirty; ++j)
+            pout[ny_dirty*i + j] = pin[ny_dirty*i + j]*cfu[i]*cfv[j];
+      }
+      return res;
+      }
+    pyarr_c<complex<T>> grid2dirty_c(const pyarr_c<complex<T>> &grid) const
+      {
+      checkArray(grid, "grid", {nu, nv});
+      auto tmp = makeArray<complex<T>>({nu, nv});
+      auto ptmp = tmp.mutable_data();
+      pocketfft::c2c({nu,nv},{grid.strides(0),grid.strides(1)},
+        {tmp.strides(0), tmp.strides(1)}, {0,1}, pocketfft::BACKWARD,
+        grid.data(), tmp.mutable_data(), T(1), nthreads);
+      auto res = makeArray<complex<T>>({nx_dirty, ny_dirty});
+      auto pout = res.mutable_data();
+      {
+      py::gil_scoped_release release;
+      for (size_t i=0; i<nx_dirty; ++i)
+        for (size_t j=0; j<ny_dirty; ++j)
+          {
+          size_t i2 = nu-nx_dirty/2+i;
+          if (i2>=nu) i2-=nu;
+          size_t j2 = nv-ny_dirty/2+j;
+          if (j2>=nv) j2-=nv;
+          pout[ny_dirty*i + j] = ptmp[nv*i2+j2]*cfu[i]*cfv[j];
+          }
+      }
+      return res;
+      }
+
+    pyarr_c<T> dirty2grid(const pyarr_c<T> &dirty) const
+      {
+      checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
+      auto pdirty = dirty.data();
+      auto tmp = makeArray<T>({nu, nv});
+      auto ptmp = tmp.mutable_data();
+      {
+      py::gil_scoped_release release;
+      for (size_t i=0; i<nu*nv; ++i)
+        ptmp[i] = 0.;
+      for (size_t i=0; i<nx_dirty; ++i)
+        for (size_t j=0; j<ny_dirty; ++j)
+          {
+          size_t i2 = nu-nx_dirty/2+i;
+          if (i2>=nu) i2-=nu;
+          size_t j2 = nv-ny_dirty/2+j;
+          if (j2>=nv) j2-=nv;
+          ptmp[nv*i2+j2] = pdirty[ny_dirty*i + j]*cfu[i]*cfv[j];
+          }
+      }
+      hartley2_2D<T>(tmp, tmp);
+      return tmp;
+      }
+    pyarr_c<complex<T>> dirty2grid_c(const pyarr_c<complex<T>> &dirty) const
+      {
+      checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
+      auto pdirty = dirty.data();
+      auto tmp = makeArray<complex<T>>({nu, nv});
+      auto ptmp = tmp.mutable_data();
+      pocketfft::stride_t strides{tmp.strides(0),tmp.strides(1)};
+      {
+      py::gil_scoped_release release;
+      for (size_t i=0; i<nu*nv; ++i)
+        ptmp[i] = 0.;
+      for (size_t i=0; i<nx_dirty; ++i)
+        for (size_t j=0; j<ny_dirty; ++j)
+          {
+          size_t i2 = nu-nx_dirty/2+i;
+          if (i2>=nu) i2-=nu;
+          size_t j2 = nv-ny_dirty/2+j;
+          if (j2>=nv) j2-=nv;
+          ptmp[nv*i2+j2] = pdirty[ny_dirty*i + j]*cfu[i]*cfv[j];
+          }
+      pocketfft::c2c({nu,nv}, strides, strides, {0,1}, pocketfft::FORWARD,
+        ptmp, ptmp, T(1), nthreads);
+      }
+      return tmp;
+      }
+#endif
+    inline void getpix(T u_in, T v_in, T &u, T &v, int &iu0, int &iv0) const
+      {
+      u=fmodulo(u_in*psx, T(1))*nu,
+      iu0 = int(u-supp*0.5 + 1 + nu) - nu;
+      if (iu0+supp>nu+nsafe) iu0 = nu+nsafe-supp;
+      v=fmodulo(v_in*psy, T(1))*nv;
+      iv0 = int(v-supp*0.5 + 1 + nv) - nv;
+      if (iv0+supp>nv+nsafe) iv0 = nv+nsafe-supp;
+      }
+#if 0
+    pyarr_c<complex<T>> apply_wscreen(const pyarr_c<complex<T>> &dirty_, double w, bool adjoint) const
+      {
+      checkArray(dirty_, "dirty", {nx_dirty, ny_dirty});
+      auto dirty = dirty_.data();
+      auto res_ = makeArray<complex<T>>({nx_dirty, ny_dirty});
+      auto res = res_.mutable_data();
+      double x0 = -0.5*nx_dirty*psx,
+             y0 = -0.5*ny_dirty*psy;
+      {
+      py::gil_scoped_release release;
+#pragma omp parallel num_threads(nthreads)
+{
+#pragma omp for schedule(static)
+      for (size_t i=0; i<=nx_dirty/2; ++i)
+        {
+        double fx = x0+i*psx;
+        fx *= fx;
+        for (size_t j=0; j<=ny_dirty/2; ++j)
+          {
+          double fy = y0+j*psy;
+          auto ws = wscreen(fx, fy*fy, w, adjoint);
+          res[ny_dirty*i+j] = dirty[ny_dirty*i+j]*ws; // lower left
+          size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
+          if ((i>0)&&(i<i2))
+            {
+            res[ny_dirty*i2+j] = dirty[ny_dirty*i2+j]*ws; // lower right
+            if ((j>0)&&(j<j2))
+              res[ny_dirty*i2+j2] = dirty[ny_dirty*i2+j2]*ws; // upper right
+            }
+          if ((j>0)&&(j<j2))
+            res[ny_dirty*i+j2] = dirty[ny_dirty*i+j2]*ws; // upper left
+          }
+        }
+}
+      }
+      return res_;
+      }
+#endif
+  };
+
+constexpr int logsquare=4;
+
+template<typename T, typename T2=std::complex<T>> class Helper
+  {
+  private:
+    const GridderConfig<T> &gconf;
+    int nu, nv, nsafe, supp;
+    T beta;
+    const T2 *grid_r;
+    T2 *grid_w;
+    int su, sv;
+    int iu0, iv0; // start index of the current visibility
+    int bu0, bv0; // start index of the current buffer
+
+    std::vector<T2> rbuf, wbuf;
+
+    void dump() const
+      {
+      if (bu0<-nsafe) return; // nothing written into buffer yet
+
+#pragma omp critical (gridder_writing_to_grid)
+{
+      int idxu = (bu0+nu)%nu;
+      int idxv0 = (bv0+nv)%nv;
+      for (int iu=0; iu<su; ++iu)
+        {
+        int idxv = idxv0;
+        for (int iv=0; iv<sv; ++iv)
+          {
+          grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv];
+          if (++idxv>=nv) idxv=0;
+          }
+        if (++idxu>=nu) idxu=0;
+        }
+}
+      }
+
+    void load()
+      {
+      int idxu = (bu0+nu)%nu;
+      int idxv0 = (bv0+nv)%nv;
+      for (int iu=0; iu<su; ++iu)
+        {
+        int idxv = idxv0;
+        for (int iv=0; iv<sv; ++iv)
+          {
+          rbuf[iu*sv + iv] = grid_r[idxu*nv + idxv];
+          if (++idxv>=nv) idxv=0;
+          }
+        if (++idxu>=nu) idxu=0;
+        }
+      }
+
+  public:
+    const T2 *p0r;
+    T2 *p0w;
+    std::vector<T> kernel;
+
+    Helper(const GridderConfig<T> &gconf_, const T2 *grid_r_, T2 *grid_w_)
+      : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
+        supp(gconf.Supp()), beta(gconf.Beta()), grid_r(grid_r_),
+        grid_w(grid_w_), su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
+        bu0(-1000000), bv0(-1000000),
+        rbuf(su*sv*(grid_r!=nullptr),T(0)),
+        wbuf(su*sv*(grid_w!=nullptr),T(0)),
+        kernel(2*supp)
+      {}
+    ~Helper() { if (grid_w) dump(); }
+
+    int lineJump() const { return sv; }
+
+    void prep(T u_in, T v_in)
+      {
+      T u, v;
+      gconf.getpix(u_in, v_in, u, v, iu0, iv0);
+      T xsupp=T(2)/supp;
+      auto x0 = xsupp*(iu0-u);
+      auto y0 = xsupp*(iv0-v);
+      for (int i=0; i<supp; ++i)
+        {
+        auto x = x0+i*xsupp;
+        kernel[i  ] = beta*sqrt(T(1)-x*x);
+        auto y = y0+i*xsupp;
+        kernel[i+supp] = beta*sqrt(T(1)-y*y);
+        }
+      for (auto &k : kernel)
+        k = exp(k);
+
+      if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv))
+        {
+        if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); }
+        bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe;
+        bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe;
+        if (grid_r) load();
+        }
+      p0r = grid_r ? rbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
+      p0w = grid_w ? wbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
+      }
+  };
+
+#endif
diff --git a/nifty_gridder.cc b/nifty_gridder.cc
index f23b5a31741d97023ffcb46d6ef0958488a8ae09..fccb64f2b3cf19c5aa7082e3072b3e3f7c6af274 100644
--- a/nifty_gridder.cc
+++ b/nifty_gridder.cc
@@ -21,12 +21,8 @@
 
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
-#include <iostream>
-#include <algorithm>
-#include <cstdlib>
-#include <cmath>
 
-#include "pocketfft_hdronly.h"
+#include "gridder_cxx.h"
 
 using namespace std;
 
@@ -40,110 +36,6 @@ auto None = py::none();
 // basic utilities
 //
 
-void myassert(bool cond, const char *msg)
-  {
-  if (cond) return;
-  throw runtime_error(msg);
-  }
-
-/*! Returns the remainder of the division \a v1/v2.
-    The result is non-negative.
-    \a v1 can be positive or negative; \a v2 must be positive. */
-template<typename T> inline T fmodulo (T v1, T v2)
-  {
-  if (v1>=0)
-    return (v1<v2) ? v1 : fmod(v1,v2);
-  T tmp=fmod(v1,v2)+v2;
-  return (tmp==v2) ? T(0) : tmp;
-  }
-
-static size_t nthreads = 1;
-
-constexpr auto set_nthreads_DS = R"""(
-Specifies the number of threads to be used by the module
-
-Parameters
-==========
-nthreads: int
-    the number of threads to be used. Must be >=1.
-)""";
-void set_nthreads(size_t nthreads_)
-  {
-  myassert(nthreads_>=1, "nthreads must be >= 1");
-  nthreads = nthreads_;
-  }
-
-constexpr auto get_nthreads_DS = R"""(
-Returns the number of threads used by the module
-
-Returns
-=======
-int : the number of threads used by the module
-)""";
-size_t get_nthreads()
-  { return nthreads; }
-
-//
-// Utilities for Gauss-Legendre quadrature
-//
-
-static inline double one_minus_x2 (double x)
-  { return (fabs(x)>0.1) ? (1.+x)*(1.-x) : 1.-x*x; }
-
-void legendre_prep(int n, vector<double> &x, vector<double> &w)
-  {
-  constexpr double pi = 3.141592653589793238462643383279502884197;
-  constexpr double eps = 3e-14;
-  int m = (n+1)>>1;
-  x.resize(m);
-  w.resize(m);
-
-  double t0 = 1 - (1-1./n) / (8.*n*n);
-  double t1 = 1./(4.*n+2.);
-
-#pragma omp parallel num_threads(nthreads)
-{
-  int i;
-#pragma omp for schedule(dynamic,100)
-  for (i=1; i<=m; ++i)
-    {
-    double x0 = cos(pi * ((i<<2)-1) * t1) * t0;
-
-    int dobreak=0;
-    int j=0;
-    double dpdx;
-    while(1)
-      {
-      double P_1 = 1.0;
-      double P0 = x0;
-      double dx, x1;
-
-      for (int k=2; k<=n; k++)
-        {
-        double P_2 = P_1;
-        P_1 = P0;
-//        P0 = ((2*k-1)*x0*P_1-(k-1)*P_2)/k;
-        P0 = x0*P_1 + (k-1.)/k * (x0*P_1-P_2);
-        }
-
-      dpdx = (P_1 - x0*P0) * n / one_minus_x2(x0);
-
-      /* Newton step */
-      x1 = x0 - P0/dpdx;
-      dx = x0-x1;
-      x0 = x1;
-      if (dobreak) break;
-
-      if (abs(dx)<=eps) dobreak=1;
-      myassert(++j<100, "convergence problem");
-      }
-
-    x[m-i] = x0;
-    w[m-i] = 2. / (one_minus_x2(x0) * dpdx * dpdx);
-    }
-} // end of parallel region
-  }
-
 //
 // Start of real gridder functionality
 //
@@ -157,19 +49,6 @@ template<typename T>
 template<typename T> pyarr_c<T> makeArray(const vector<size_t> &shape)
   { return pyarr_c<T>(shape); }
 
-size_t get_supp(double epsilon)
-  {
-  static const vector<double> maxmaperr { 1e8, 0.32, 0.021, 6.2e-4,
-    1.08e-5, 1.25e-7, 8.25e-10, 5.70e-12, 1.22e-13, 2.48e-15, 4.82e-17,
-    6.74e-19, 5.41e-21, 4.41e-23, 7.88e-25, 3.9e-26 };
-
-  double epssq = epsilon*epsilon;
-
-  for (size_t i=1; i<maxmaperr.size(); ++i)
-    if (epssq>maxmaperr[i]) return i;
-  throw runtime_error("requested epsilon too small - minimum is 2e-13");
-  }
-
 void checkArray(const py::array &arr, const char *aname,
   const vector<size_t> &shape)
   {
@@ -232,153 +111,111 @@ template<typename T> pyarr_c<T> provideCArray(py::object &in,
   return tmp_;
   }
 
-template<typename T> pyarr_c<T> complex2hartley
-  (const pyarr_c<complex<T>> &grid_, py::object &grid_in)
+template<size_t ndim, typename T> cmav<T,ndim> make_cmav(pyarr_c<T> &in)
   {
-  checkArray(grid_, "grid", {0,0});
-  size_t nu = size_t(grid_.shape(0)), nv = size_t(grid_.shape(1));
-  auto grid = grid_.data();
-
-  auto res = provideCArray<T>(grid_in, {nu, nv});
-  auto grid2 = res.mutable_data();
+  myassert(ndim==in.ndim(), "dimension mismatch");
+  array<size_t,ndim> dims;
+  for (size_t i=0; i<ndim; ++i) dims[i]=in.shape(i);
+  return cmav<T, ndim>(in.mutable_data(),dims);
+  }
+template<size_t ndim, typename T> smav<T,ndim> make_smav(pyarr<T> &in)
   {
-  py::gil_scoped_release release;
-#pragma omp parallel for num_threads(nthreads)
-  for (size_t u=0; u<nu; ++u)
+  myassert(ndim==in.ndim(), "dimension mismatch");
+  array<size_t,ndim> dims;
+  array<ptrdiff_t,ndim> str;
+  for (size_t i=0; i<ndim; ++i)
     {
-    size_t xu = (u==0) ? 0 : nu-u;
-    for (size_t v=0; v<nv; ++v)
-      {
-      size_t xv = (v==0) ? 0 : nv-v;
-      size_t i1 = u*nv+v;
-      size_t i2 = xu*nv+xv;
-      grid2[i1] += T(0.5)*(grid[i1].real()+grid[i1].imag()+
-                           grid[i2].real()-grid[i2].imag());
-      }
+    dims[i]=in.shape(i);
+    str[i]=in.strides(i)/sizeof(T);
+    myassert(str[i]*ptrdiff_t(sizeof(T))==in.strides(i), "weird strides");
     }
+  return smav<T, ndim>(in.mutable_data(),dims,str);
   }
-  return res;
+template<size_t ndim, typename T> smav<T,ndim> make_smav(pyarr_c<T> &in)
+  {
+  myassert(ndim==in.ndim(), "dimension mismatch");
+  array<size_t,ndim> dims;
+  array<ptrdiff_t,ndim> str;
+  for (size_t i=0; i<ndim; ++i)
+    {
+    dims[i]=in.shape(i);
+    str[i]=in.strides(i)/sizeof(T);
+    myassert(str[i]*ptrdiff_t(sizeof(T))==in.strides(i), "weird strides");
+    }
+  return smav<T, ndim>(in.mutable_data(),dims,str);
   }
-
-template<typename T> pyarr_c<complex<T>> hartley2complex
-  (const pyarr_c<T> &grid_)
+template<size_t ndim, typename T> cmav<const T,ndim> make_const_cmav(const pyarr_c<T> &in)
   {
-  checkArray(grid_, "grid", {0, 0});
-  size_t nu = size_t(grid_.shape(0)), nv = size_t(grid_.shape(1));
-  auto grid = grid_.data();
-
-  auto res=makeArray<complex<T>>({nu, nv});
-  auto grid2 = res.mutable_data();
+  myassert(ndim==in.ndim(), "dimension mismatch");
+  array<size_t,ndim> dims;
+  for (size_t i=0; i<ndim; ++i) dims[i]=in.shape(i);
+  return cmav<const T, ndim>(in.data(),dims);
+  }
+template<size_t ndim, typename T> smav<const T,ndim> make_const_smav(const pyarr<T> &in)
   {
-  py::gil_scoped_release release;
-#pragma omp parallel for num_threads(nthreads)
-  for (size_t u=0; u<nu; ++u)
+  myassert(ndim==in.ndim(), "dimension mismatch");
+  array<size_t,ndim> dims;
+  array<ptrdiff_t,ndim> str;
+  for (size_t i=0; i<ndim; ++i)
     {
-    size_t xu = (u==0) ? 0 : nu-u;
-    for (size_t v=0; v<nv; ++v)
-      {
-      size_t xv = (v==0) ? 0 : nv-v;
-      size_t i1 = u*nv+v;
-      size_t i2 = xu*nv+xv;
-      T v1 = T(0.5)*grid[i1];
-      T v2 = T(0.5)*grid[i2];
-      grid2[i1] = complex<T>(v1+v2, v1-v2);
-      }
+    dims[i]=in.shape(i);
+    str[i]=in.strides(i)/sizeof(T);
+    myassert(str[i]*ptrdiff_t(sizeof(T))==in.strides(i), "weird strides");
     }
+  return smav<const T, ndim>(in.data(),dims,str);
   }
-  return res;
+template<size_t ndim, typename T> smav<const T,ndim> make_const_smav(const pyarr_c<T> &in)
+  {
+  myassert(ndim==in.ndim(), "dimension mismatch");
+  array<size_t,ndim> dims;
+  array<ptrdiff_t,ndim> str;
+  for (size_t i=0; i<ndim; ++i)
+    {
+    dims[i]=in.shape(i);
+    str[i]=in.strides(i)/sizeof(T);
+    myassert(str[i]*ptrdiff_t(sizeof(T))==in.strides(i), "weird strides");
+    }
+  return smav<const T, ndim>(in.data(),dims,str);
   }
-
-template<typename T> void hartley2_2D(const pyarr_c<T> &in, pyarr_c<T> &out)
+template<typename T> pyarr_c<T> complex2hartley
+  (const pyarr_c<complex<T>> &grid_, py::object &grid_in)
   {
-  size_t nu=in.shape(0), nv=in.shape(1);
-  pocketfft::stride_t s_i{in.strides(0), in.strides(1)},
-                      s_o{out.strides(0), out.strides(1)};
-  auto d_i = in.data();
-  auto ptmp = out.mutable_data();
+  auto grid = make_const_smav<2>(grid_);
+  size_t nu=grid.shape(0), nv=grid.shape(1);
+
+  auto res = provideCArray<T>(grid_in, {nu, nv});
+  auto grid2 = make_smav<2>(res);
   {
   py::gil_scoped_release release;
-  pocketfft::r2r_separable_hartley({nu, nv}, s_i, s_o, {0,1}, d_i, ptmp, T(1),
-    nthreads);
-#pragma omp parallel for num_threads(nthreads)
-  for(size_t i=1; i<(nu+1)/2; ++i)
-    for(size_t j=1; j<(nv+1)/2; ++j)
-       {
-       T a = ptmp[i*nv+j];
-       T b = ptmp[(nu-i)*nv+j];
-       T c = ptmp[i*nv+nv-j];
-       T d = ptmp[(nu-i)*nv+nv-j];
-       ptmp[i*nv+j] = T(0.5)*(a+b+c-d);
-       ptmp[(nu-i)*nv+j] = T(0.5)*(a+b+d-c);
-       ptmp[i*nv+nv-j] = T(0.5)*(a+c+d-b);
-       ptmp[(nu-i)*nv+nv-j] = T(0.5)*(b+c+d-a);
-       }
+  complex2hartley(grid, grid2);
   }
+  return res;
   }
 
-template<typename T> class EC_Kernel
-  {
-  protected:
-    size_t supp;
-    T beta, emb_;
-
-  public:
-    EC_Kernel(size_t supp_)
-      : supp(supp_), beta(2.3*supp_), emb_(exp(-beta)) {}
-    T operator()(T v) const { return exp(beta*(sqrt(T(1)-v*v)-T(1))); }
-    T val_no_emb(T v) const { return exp(beta*sqrt(T(1)-v*v)); }
-    T emb() const { return emb_; }
-  };
-
-template<typename T> class EC_Kernel_with_correction: public EC_Kernel<T>
+template<typename T> pyarr_c<complex<T>> hartley2complex
+  (const pyarr_c<T> &grid_)
   {
-  protected:
-    static constexpr T pi = T(3.141592653589793238462643383279502884197L);
-    int p;
-    vector<T> x, wgt, psi;
-    using EC_Kernel<T>::supp;
+  auto grid = make_const_smav<2>(grid_);
+  size_t nu=grid.shape(0), nv=grid.shape(1);
 
-  public:
-    using EC_Kernel<T>::operator();
-    EC_Kernel_with_correction(size_t supp_)
-      : EC_Kernel<T>(supp_), p(int(1.5*supp_+2))
-      {
-      legendre_prep(2*p,x,wgt);
-      psi=x;
-      for (auto &v:psi)
-        v=operator()(v);
-      }
-    T corfac(T v) const
-      {
-      T tmp=0;
-      for (int i=0; i<p; ++i)
-        tmp += wgt[i]*psi[i]*cos(pi*supp*v*x[i]);
-      return T(1)/(supp*tmp);
-      }
-  };
-/* Compute correction factors for the ES gridding kernel
-   This implementation follows eqs. (3.8) to (3.10) of Barnett et al. 2018 */
-vector<double> correction_factors (size_t n, size_t nval, size_t supp)
+  auto res = makeArray<complex<T>>({nu, nv});
+  auto grid2 = make_smav<2>(res);
   {
-  EC_Kernel_with_correction<double> kernel(supp);
-  vector<double> res(nval);
-  double xn = 1./n;
-#pragma omp parallel for schedule(static) num_threads(nthreads)
-  for (size_t k=0; k<nval; ++k)
-    res[k] = kernel.corfac(k*xn);
+  py::gil_scoped_release release;
+  hartley2complex(grid, grid2);
+  }
   return res;
   }
 
-template<typename T> struct UVW
+template<typename T> void hartley2_2D(const pyarr_c<T> &in, pyarr_c<T> &out)
   {
-  T u, v, w;
-  UVW () {}
-  UVW (T u_, T v_, T w_) : u(u_), v(v_), w(w_) {}
-  UVW operator* (T fct) const
-    { return UVW(u*fct, v*fct, w*fct); }
-  };
+  auto grid = make_const_smav<2>(in);
+  auto grid2 = make_smav<2>(out);
+  py::gil_scoped_release release;
+  hartley2_2D(grid, grid2);
+  }
 
-constexpr auto Baselines_DS = R"""(
+constexpr auto PyBaselines_DS = R"""(
 Class storing UVW coordinates and channel information.
 
 Parameters
@@ -388,45 +225,23 @@ coord: np.array((nrows, 3), dtype=np.float)
 freq: np.array((nchannels,), dtype=np.float)
     frequency for each individual channel (in Hz)
 )""";
-template<typename T> class Baselines
+template<typename T> class PyBaselines: public Baselines<T>
   {
-  private:
-    vector<UVW<T>> coord;
-    vector<T> f_over_c;
-    size_t nrows, nchan;
+  protected:
+    using Baselines<T>::coord;
+    using Baselines<T>::f_over_c;
+    using Baselines<T>::nrows;
+    using Baselines<T>::nchan;
 
   public:
-    Baselines(const pyarr<T> &coord_, const pyarr<T> &freq_)
-      {
-      constexpr double speedOfLight = 299792458.;
-      checkArray(coord_, "coord", {0, 3});
-      checkArray(freq_, "freq", {0});
-      nrows = coord_.shape(0);
-      nchan = freq_.shape(0);
-      myassert(nrows*nchan<(size_t(1)<<32), "too many entries in MS");
-      auto freq = freq_.template unchecked<1>();
-      auto cood = coord_.template unchecked<2>();
-      {
-      py::gil_scoped_release release;
-      f_over_c.resize(nchan);
-      for (size_t i=0; i<nchan; ++i)
-        f_over_c[i] = freq(i)/speedOfLight;
-      coord.resize(nrows);
-      for (size_t i=0; i<coord.size(); ++i)
-        coord[i] = UVW<T>(cood(i,0), cood(i,1), cood(i,2));
-      }
-      }
+    using Baselines<T>::Baselines;
+    PyBaselines(const pyarr<T> &coord, const pyarr<T> &freq)
+      : Baselines<T>(make_const_smav<2>(coord), make_const_smav<1>(freq))
+      {}
 
-    UVW<T> effectiveCoord(uint32_t index) const
-      {
-      size_t irow = index/nchan;
-      size_t ichan = index-nchan*irow;
-      return coord[irow]*f_over_c[ichan];
-      }
-    UVW<T> effectiveCoord(size_t irow, size_t ichan) const
-      { return coord[irow]*f_over_c[ichan]; }
-    size_t Nrows() const { return nrows; }
-    size_t Nchannels() const { return nchan; }
+    using Baselines<T>::effectiveCoord;
+    using Baselines<T>::Nrows;
+    using Baselines<T>::Nchannels;
 
     static constexpr auto ms2vis_DS = R"""(
     Extracts visibility data from a measurement for the provided indices.
@@ -444,44 +259,31 @@ template<typename T> class Baselines
         The visibility data for the index array
     )""";
 
-    pyarr_c<T> effectiveuvw(const pyarr_c<uint32_t> &idx_) const
-    {
-      checkArray(idx_, "idx", {0});
+   // using Baselines<T>::effectiveUVW;
+    pyarr_c<T> effectiveuvw(const pyarr<uint32_t> &idx_) const
+      {
       size_t nvis = size_t(idx_.shape(0));
-      auto idx = idx_.template unchecked<1>();
+      auto idx=make_const_smav<1>(idx_);
       auto res_=makeArray<T>({nvis, 3});
-      auto res = res_.template mutable_unchecked<2>();
-      for (size_t i=0; i<nvis; i++)
-        {
-          auto uvw = effectiveCoord(idx(i));
-          res(i,0) = uvw.u;
-          res(i,1) = uvw.v;
-          res(i,2) = uvw.w;
-        }
+      auto res=make_smav<2>(res_);
+      {
+      py::gil_scoped_release release;
+      Baselines<T>::effectiveUVW(idx,res);
+      }
       return res_;
-    }
+      }
 
     template<typename T2> pyarr_c<T2> ms2vis(const pyarr<T2> &ms_,
       const pyarr_c<uint32_t> &idx_) const
       {
-      checkArray(idx_, "idx", {0});
-      checkArray(ms_, "ms", {nrows, nchan});
-      size_t nvis = size_t(idx_.shape(0));
-      auto idx = idx_.template unchecked<1>();
-      auto ms = ms_.template unchecked<2>();
-
+      auto idx=make_const_smav<1>(idx_);
+      size_t nvis = size_t(idx.shape(0));
+      auto ms = make_const_smav<2>(ms_);
       auto res=makeArray<T2>({nvis});
-      auto vis = res.mutable_data();
+      auto vis = make_smav<1>(res);
       {
       py::gil_scoped_release release;
-#pragma omp parallel for num_threads(nthreads)
-      for (size_t i=0; i<nvis; ++i)
-        {
-        auto t = idx(i);
-        auto row = t/nchan;
-        auto chan = t-row*nchan;
-        vis[i] = ms(row, chan);
-        }
+      Baselines<T>::ms2vis(ms, idx, vis);
       }
       return res;
       }
@@ -506,31 +308,18 @@ template<typename T> class Baselines
     template<typename T2> pyarr_c<T2> vis2ms(const pyarr<T2> &vis_,
       const pyarr<uint32_t> &idx_, py::object &ms_in) const
       {
-      checkArray(vis_, "vis", {0});
-      size_t nvis = size_t(vis_.shape(0));
-      checkArray(idx_, "idx", {nvis});
-      auto idx = idx_.template unchecked<1>();
-      auto vis = vis_.template unchecked<1>();
-
+      auto vis=make_const_smav<1>(vis_);
+      auto idx=make_const_smav<1>(idx_);
       auto res = provideArray<T2>(ms_in, {nrows, nchan});
-      auto ms = res.template mutable_unchecked<2>();
+      auto ms = make_smav<2>(res);
       {
       py::gil_scoped_release release;
-#pragma omp parallel for num_threads(nthreads)
-      for (size_t i=0; i<nvis; ++i)
-        {
-        auto t = idx(i);
-        auto row = t/nchan;
-        auto chan = t-row*nchan;
-        ms(row, chan) += vis(i);
-        }
+      Baselines<T>::vis2ms(vis, idx, ms);
       }
       return res;
       }
   };
 
-constexpr int logsquare=4;
-
 constexpr auto grid2dirty_DS = R"""(
 Converts from UV grid to dirty image (FFT, cropping, correction)
 
@@ -610,91 +399,48 @@ pixsize_x: float
 pixsize_y: float
     Pixel size in y direction (radians)
 )""";
-template<typename T> class GridderConfig
+template<typename T> class PyGridderConfig: public GridderConfig<T>
   {
-  private:
-    size_t nx_dirty, ny_dirty;
-    double eps, psx, psy;
-    size_t supp, nsafe, nu, nv;
-    T beta;
-    vector<T> cfu, cfv;
-
-    complex<T> wscreen(double x, double y, double w, bool adjoint) const
-      {
-      constexpr double pi = 3.141592653589793238462643383279502884197;
-#if 1
-      double eps = sqrt(x+y);
-      double s = sin(eps);
-      double nm1 = -s*s/(1.+cos(eps));
-#else
-      double nm1 = (-x-y)/(sqrt(1.-x-y)+1);
-#endif
-      double n = nm1+1., xn = 1./n;
-      double phase = 2*pi*w*nm1;
-      if (adjoint) phase *= -1;
-      return complex<T>(cos(phase)*xn, sin(phase)*xn);
-      }
+  protected:
+    using GridderConfig<T>::nx_dirty;
+    using GridderConfig<T>::ny_dirty;
+    using GridderConfig<T>::eps;
+    using GridderConfig<T>::psx;
+    using GridderConfig<T>::psy;
+    using GridderConfig<T>::supp;
+    using GridderConfig<T>::nsafe;
+    using GridderConfig<T>::nu;
+    using GridderConfig<T>::nv;
+    using GridderConfig<T>::beta;
+    using GridderConfig<T>::cfu;
+    using GridderConfig<T>::cfv;
+    using GridderConfig<T>::wscreen;
 
   public:
-    GridderConfig(size_t nxdirty, size_t nydirty, double epsilon,
+    using GridderConfig<T>::GridderConfig;
+    PyGridderConfig(size_t nxdirty, size_t nydirty, double epsilon,
       double pixsize_x, double pixsize_y)
-      : nx_dirty(nxdirty), ny_dirty(nydirty), eps(epsilon),
-        psx(pixsize_x), psy(pixsize_y),
-        supp(get_supp(epsilon)), nsafe((supp+1)/2),
-        nu(max(2*nsafe,2*nx_dirty)), nv(max(2*nsafe,2*ny_dirty)),
-        beta(2.3*supp),
-        cfu(nx_dirty), cfv(ny_dirty)
-      {
-      {
-      py::gil_scoped_release release;
-      myassert((nx_dirty&1)==0, "nx_dirty must be even");
-      myassert((ny_dirty&1)==0, "ny_dirty must be even");
-      myassert(epsilon>0, "epsilon must be positive");
-      myassert(pixsize_x>0, "pixsize_x must be positive");
-      myassert(pixsize_y>0, "pixsize_y must be positive");
-
-      auto tmp = correction_factors(nu, nx_dirty/2+1, supp);
-      cfu[nx_dirty/2]=tmp[0];
-      cfu[0]=tmp[nx_dirty/2];
-      for (size_t i=1; i<nx_dirty/2; ++i)
-        cfu[nx_dirty/2-i] = cfu[nx_dirty/2+i] = tmp[i];
-      tmp = correction_factors(nv, ny_dirty/2+1, supp);
-      cfv[ny_dirty/2]=tmp[0];
-      cfv[0]=tmp[ny_dirty/2];
-      for (size_t i=1; i<ny_dirty/2; ++i)
-        cfv[ny_dirty/2-i] = cfv[ny_dirty/2+i] = tmp[i];
-      }
-      }
-    size_t Nxdirty() const { return nx_dirty; }
-    size_t Nydirty() const { return ny_dirty; }
-    double Epsilon() const { return eps; }
-    double Pixsize_x() const { return psx; }
-    double Pixsize_y() const { return psy; }
-    size_t Nu() const { return nu; }
-    size_t Nv() const { return nv; }
-    size_t Supp() const { return supp; }
-    size_t Nsafe() const { return nsafe; }
-    T Beta() const { return beta; }
+      : GridderConfig<T>(nxdirty, nydirty, epsilon, pixsize_x, pixsize_y) {}
+    using GridderConfig<T>::Nxdirty;
+    using GridderConfig<T>::Nydirty;
+    using GridderConfig<T>::Epsilon;
+    using GridderConfig<T>::Pixsize_x;
+    using GridderConfig<T>::Pixsize_y;
+    using GridderConfig<T>::Nu;
+    using GridderConfig<T>::Nv;
+    using GridderConfig<T>::Supp;
+    using GridderConfig<T>::Nsafe;
+    using GridderConfig<T>::Beta;
 
     pyarr_c<T> grid2dirty(const pyarr_c<T> &grid) const
       {
       checkArray(grid, "grid", {nu, nv});
-      auto tmp = makeArray<T>({nu, nv});
-      auto ptmp = tmp.mutable_data();
-      hartley2_2D<T>(grid, tmp);
       auto res = makeArray<T>({nx_dirty, ny_dirty});
-      auto pout = res.mutable_data();
+      auto grid2=make_const_smav<2>(grid);
+      auto res2=make_smav<2>(res);
       {
       py::gil_scoped_release release;
-      for (size_t i=0; i<nx_dirty; ++i)
-        for (size_t j=0; j<ny_dirty; ++j)
-          {
-          size_t i2 = nu-nx_dirty/2+i;
-          if (i2>=nu) i2-=nu;
-          size_t j2 = nv-ny_dirty/2+j;
-          if (j2>=nv) j2-=nv;
-          pout[ny_dirty*i + j] = ptmp[nv*i2+j2]*cfu[i]*cfv[j];
-          }
+      GridderConfig<T>::grid2dirty(grid2,res2);
       }
       return res;
       }
@@ -790,15 +536,6 @@ template<typename T> class GridderConfig
       }
       return tmp;
       }
-    inline void getpix(T u_in, T v_in, T &u, T &v, int &iu0, int &iv0) const
-      {
-      u=fmodulo(u_in*psx, T(1))*nu,
-      iu0 = int(u-supp*0.5 + 1 + nu) - nu;
-      if (iu0+supp>nu+nsafe) iu0 = nu+nsafe-supp;
-      v=fmodulo(v_in*psy, T(1))*nv;
-      iv0 = int(v-supp*0.5 + 1 + nv) - nv;
-      if (iv0+supp>nv+nsafe) iv0 = nv+nsafe-supp;
-      }
     pyarr_c<complex<T>> apply_wscreen(const pyarr_c<complex<T>> &dirty_, double w, bool adjoint) const
       {
       checkArray(dirty_, "dirty", {nx_dirty, ny_dirty});
@@ -838,103 +575,6 @@ template<typename T> class GridderConfig
       }
   };
 
-template<typename T, typename T2=complex<T>> class Helper
-  {
-  private:
-    const GridderConfig<T> &gconf;
-    int nu, nv, nsafe, supp;
-    T beta;
-    const T2 *grid_r;
-    T2 *grid_w;
-    int su, sv;
-    int iu0, iv0; // start index of the current visibility
-    int bu0, bv0; // start index of the current buffer
-
-    vector<T2> rbuf, wbuf;
-
-    void dump() const
-      {
-      if (bu0<-nsafe) return; // nothing written into buffer yet
-
-#pragma omp critical (gridder_writing_to_grid)
-{
-      int idxu = (bu0+nu)%nu;
-      int idxv0 = (bv0+nv)%nv;
-      for (int iu=0; iu<su; ++iu)
-        {
-        int idxv = idxv0;
-        for (int iv=0; iv<sv; ++iv)
-          {
-          grid_w[idxu*nv + idxv] += wbuf[iu*sv + iv];
-          if (++idxv>=nv) idxv=0;
-          }
-        if (++idxu>=nu) idxu=0;
-        }
-}
-      }
-
-    void load()
-      {
-      int idxu = (bu0+nu)%nu;
-      int idxv0 = (bv0+nv)%nv;
-      for (int iu=0; iu<su; ++iu)
-        {
-        int idxv = idxv0;
-        for (int iv=0; iv<sv; ++iv)
-          {
-          rbuf[iu*sv + iv] = grid_r[idxu*nv + idxv];
-          if (++idxv>=nv) idxv=0;
-          }
-        if (++idxu>=nu) idxu=0;
-        }
-      }
-
-  public:
-    const T2 *p0r;
-    T2 *p0w;
-    vector<T> kernel;
-
-    Helper(const GridderConfig<T> &gconf_, const T2 *grid_r_, T2 *grid_w_)
-      : gconf(gconf_), nu(gconf.Nu()), nv(gconf.Nv()), nsafe(gconf.Nsafe()),
-        supp(gconf.Supp()), beta(gconf.Beta()), grid_r(grid_r_),
-        grid_w(grid_w_), su(2*nsafe+(1<<logsquare)), sv(2*nsafe+(1<<logsquare)),
-        bu0(-1000000), bv0(-1000000),
-        rbuf(su*sv*(grid_r!=nullptr),T(0)),
-        wbuf(su*sv*(grid_w!=nullptr),T(0)),
-        kernel(2*supp)
-      {}
-    ~Helper() { if (grid_w) dump(); }
-
-    int lineJump() const { return sv; }
-
-    void prep(T u_in, T v_in)
-      {
-      T u, v;
-      gconf.getpix(u_in, v_in, u, v, iu0, iv0);
-      T xsupp=T(2)/supp;
-      auto x0 = xsupp*(iu0-u);
-      auto y0 = xsupp*(iv0-v);
-      for (int i=0; i<supp; ++i)
-        {
-        auto x = x0+i*xsupp;
-        kernel[i  ] = beta*sqrt(T(1)-x*x);
-        auto y = y0+i*xsupp;
-        kernel[i+supp] = beta*sqrt(T(1)-y*y);
-        }
-      for (auto &k : kernel)
-        k = exp(k);
-
-      if ((iu0<bu0) || (iv0<bv0) || (iu0+supp>bu0+su) || (iv0+supp>bv0+sv))
-        {
-        if (grid_w) { dump(); fill(wbuf.begin(), wbuf.end(), T(0)); }
-        bu0=((((iu0+nsafe)>>logsquare)<<logsquare))-nsafe;
-        bv0=((((iv0+nsafe)>>logsquare)<<logsquare))-nsafe;
-        if (grid_r) load();
-        }
-      p0r = grid_r ? rbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
-      p0w = grid_w ? wbuf.data() + sv*(iu0-bu0) + iv0-bv0 : nullptr;
-      }
-  };
 
 constexpr auto vis2grid_c_DS = R"""(
 Grids visibilities onto a UV grid
@@ -961,7 +601,7 @@ np.array((nu,nv), dtype=np.complex128):
     the gridded visibilities
 )""";
 template<typename T> pyarr_c<complex<T>> vis2grid_c(
-  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
+  const PyBaselines<T> &baselines, const PyGridderConfig<T> &gconf,
   const pyarr<uint32_t> &idx_, const pyarr<complex<T>> &vis_,
   py::object &grid_in, const py::object &wgt_)
   {
@@ -1037,8 +677,8 @@ Returns
 np.array((nu,nv), dtype=np.float64):
     the gridded visibilities (made real by making use of Hermitian symmetry)
 )""";
-template<typename T> pyarr_c<T> vis2grid(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
+template<typename T> pyarr_c<T> vis2grid(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
   const pyarr<complex<T>> &vis_, py::object &grid_in, const py::object &wgt_)
   { return complex2hartley(vis2grid_c(baselines, gconf, idx_, vis_, None, wgt_), grid_in); }
 
@@ -1067,7 +707,7 @@ np.array((nu,nv), dtype=np.complex128):
     the gridded visibilities
 )""";
 template<typename T> pyarr_c<complex<T>> ms2grid_c(
-  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
+  const PyBaselines<T> &baselines, const PyGridderConfig<T> &gconf,
   const pyarr<uint32_t> &idx_, const pyarr<complex<T>> &ms_,
   py::object &grid_in, const py::object &wgt_)
   {
@@ -1124,13 +764,13 @@ template<typename T> pyarr_c<complex<T>> ms2grid_c(
   return res;
   }
 
-template<typename T> pyarr_c<T> ms2grid(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
+template<typename T> pyarr_c<T> ms2grid(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
   const pyarr<complex<T>> &ms_, py::object &grid_in, const py::object &wgt_)
   { return complex2hartley(ms2grid_c(baselines, gconf, idx_, ms_, None, wgt_), grid_in); }
 
 template<typename T> pyarr_c<complex<T>> grid2vis_c(
-  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
+  const PyBaselines<T> &baselines, const PyGridderConfig<T> &gconf,
   const pyarr<uint32_t> &idx_, const pyarr_c<complex<T>> &grid_,
   const py::object &wgt_)
   {
@@ -1205,13 +845,13 @@ Returns
 np.array((nvis,), dtype=np.complex)
     The degridded visibility data
 )""";
-template<typename T> pyarr_c<complex<T>> grid2vis(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
+template<typename T> pyarr_c<complex<T>> grid2vis(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
   const pyarr_c<T> &grid_, const py::object &wgt_)
   { return grid2vis_c(baselines, gconf, idx_, hartley2complex(grid_), wgt_); }
 
-template<typename T> pyarr_c<complex<T>> grid2ms_c(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
+template<typename T> pyarr_c<complex<T>> grid2ms_c(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
   const pyarr_c<complex<T>> &grid_, py::object &ms_in, const py::object &wgt_)
   {
   auto nrows = baselines.Nrows();
@@ -1270,13 +910,13 @@ template<typename T> pyarr_c<complex<T>> grid2ms_c(const Baselines<T> &baselines
   return res;
   }
 
-template<typename T> pyarr_c<complex<T>> grid2ms(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
+template<typename T> pyarr_c<complex<T>> grid2ms(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
   const pyarr_c<T> &grid_, py::object &ms_in, const py::object &wgt_)
   { return grid2ms_c(baselines, gconf, idx_, hartley2complex(grid_), ms_in, wgt_); }
 
 template<typename T> pyarr_c<complex<T>> apply_holo(
-  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
+  const PyBaselines<T> &baselines, const PyGridderConfig<T> &gconf,
   const pyarr<uint32_t> &idx_, const pyarr_c<complex<T>> &grid_,
   const py::object &wgt_)
   {
@@ -1345,7 +985,7 @@ template<typename T> pyarr_c<complex<T>> apply_holo(
   }
 
 template<typename T> pyarr_c<T> get_correlations(
-  const Baselines<T> &baselines, const GridderConfig<T> &gconf,
+  const PyBaselines<T> &baselines, const PyGridderConfig<T> &gconf,
   const pyarr<uint32_t> &idx_, int du, int dv, const py::object &wgt_)
   {
   size_t nu=gconf.Nu(), nv=gconf.Nv();
@@ -1437,8 +1077,8 @@ np.array((nvis,), dtype=np.uint32)
     the compressed indices for all entries which match the selected criteria
     and are not flagged.
 )""";
-template<typename T> pyarr_c<uint32_t> getIndices(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr_c<bool> &flags_, int chbegin,
+template<typename T> pyarr_c<uint32_t> getIndices(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr_c<bool> &flags_, int chbegin,
   int chend, T wmin, T wmax)
   {
   size_t nrow=baselines.Nrows(),
@@ -1494,8 +1134,8 @@ template<typename T> pyarr_c<uint32_t> getIndices(const Baselines<T> &baselines,
   return res;
   }
 
-template<typename T> pyarr_c<complex<T>> vis2dirty_wstack(const Baselines<T> &baselines,
-  const GridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
+template<typename T> pyarr_c<complex<T>> vis2dirty_wstack(const PyBaselines<T> &baselines,
+  const PyGridderConfig<T> &gconf, const pyarr<uint32_t> &idx_,
   const pyarr<complex<T>> &vis_)
   {
   auto nx_dirty=gconf.Nxdirty();
@@ -1617,42 +1257,42 @@ PYBIND11_MODULE(nifty_gridder, m)
   {
   using namespace pybind11::literals;
 
-  py::class_<Baselines<double>> (m, "Baselines", Baselines_DS)
+  py::class_<PyBaselines<double>> (m, "Baselines", PyBaselines_DS)
     .def(py::init<const pyarr_c<double> &, const pyarr_c<double> &>(),
       "coord"_a, "freq"_a)
-    .def ("Nrows",&Baselines<double>::Nrows)
-    .def ("Nchannels",&Baselines<double>::Nchannels)
-    .def ("ms2vis",&Baselines<double>::ms2vis<complex<double>>,
-      Baselines<double>::ms2vis_DS, "ms"_a, "idx"_a)
-    .def ("effectiveuvw",&Baselines<double>::effectiveuvw, "idx"_a)
-    .def ("vis2ms",&Baselines<double>::vis2ms<complex<double>>,
-      Baselines<double>::vis2ms_DS, "vis"_a, "idx"_a, "ms_in"_a=None);
-  py::class_<GridderConfig<double>> (m, "GridderConfig", GridderConfig_DS)
+    .def ("Nrows",&PyBaselines<double>::Nrows)
+    .def ("Nchannels",&PyBaselines<double>::Nchannels)
+    .def ("ms2vis",&PyBaselines<double>::ms2vis<complex<double>>,
+      PyBaselines<double>::ms2vis_DS, "ms"_a, "idx"_a)
+    .def ("effectiveuvw",&PyBaselines<double>::effectiveuvw, "idx"_a)
+    .def ("vis2ms",&PyBaselines<double>::vis2ms<complex<double>>,
+      PyBaselines<double>::vis2ms_DS, "vis"_a, "idx"_a, "ms_in"_a=None);
+  py::class_<PyGridderConfig<double>> (m, "GridderConfig", GridderConfig_DS)
     .def(py::init<size_t, size_t, double, double, double>(),"nxdirty"_a,
       "nydirty"_a, "epsilon"_a, "pixsize_x"_a, "pixsize_y"_a)
-    .def("Nxdirty", &GridderConfig<double>::Nxdirty)
-    .def("Nydirty", &GridderConfig<double>::Nydirty)
-    .def("Epsilon", &GridderConfig<double>::Epsilon)
-    .def("Pixsize_x", &GridderConfig<double>::Pixsize_x)
-    .def("Pixsize_y", &GridderConfig<double>::Pixsize_y)
-    .def("Nu", &GridderConfig<double>::Nu)
-    .def("Nv", &GridderConfig<double>::Nv)
-    .def("Supp", &GridderConfig<double>::Supp)
-    .def("apply_taper", &GridderConfig<double>::apply_taper, apply_taper_DS,
+    .def("Nxdirty", &PyGridderConfig<double>::Nxdirty)
+    .def("Nydirty", &PyGridderConfig<double>::Nydirty)
+    .def("Epsilon", &PyGridderConfig<double>::Epsilon)
+    .def("Pixsize_x", &PyGridderConfig<double>::Pixsize_x)
+    .def("Pixsize_y", &PyGridderConfig<double>::Pixsize_y)
+    .def("Nu", &PyGridderConfig<double>::Nu)
+    .def("Nv", &PyGridderConfig<double>::Nv)
+    .def("Supp", &PyGridderConfig<double>::Supp)
+    .def("apply_taper", &PyGridderConfig<double>::apply_taper, apply_taper_DS,
       "img"_a, "divide"_a=false)
-    .def("grid2dirty", &GridderConfig<double>::grid2dirty,
+    .def("grid2dirty", &PyGridderConfig<double>::grid2dirty,
       grid2dirty_DS, "grid"_a)
-    .def("grid2dirty_c", &GridderConfig<double>::grid2dirty_c, "grid"_a)
-    .def("dirty2grid", &GridderConfig<double>::dirty2grid,
+    .def("grid2dirty_c", &PyGridderConfig<double>::grid2dirty_c, "grid"_a)
+    .def("dirty2grid", &PyGridderConfig<double>::dirty2grid,
       dirty2grid_DS, "dirty"_a)
-    .def("dirty2grid_c", &GridderConfig<double>::dirty2grid_c, "dirty"_a)
-    .def("apply_wscreen", &GridderConfig<double>::apply_wscreen,
+    .def("dirty2grid_c", &PyGridderConfig<double>::dirty2grid_c, "dirty"_a)
+    .def("apply_wscreen", &PyGridderConfig<double>::apply_wscreen,
       apply_wscreen_DS, "dirty"_a, "w"_a, "adjoint"_a)
 
     // pickle support
     .def(py::pickle(
         // __getstate__
-        [](const GridderConfig<double> & gc) {
+        [](const PyGridderConfig<double> & gc) {
           // Encode object state in tuple
           return py::make_tuple(gc.Nxdirty(), gc.Nydirty(), gc.Epsilon(),
                                 gc.Pixsize_x(), gc.Pixsize_y());
@@ -1662,7 +1302,7 @@ PYBIND11_MODULE(nifty_gridder, m)
           myassert(t.size()==5,"Invalid state");
 
           // Reconstruct from tuple
-          return GridderConfig<double>(t[0].cast<size_t>(), t[1].cast<size_t>(),
+          return PyGridderConfig<double>(t[0].cast<size_t>(), t[1].cast<size_t>(),
                                        t[2].cast<double>(), t[3].cast<double>(),
                                        t[4].cast<double>());