diff --git a/nifty_gridder.cc b/nifty_gridder.cc
index fd76da8109ffdf093d55e5d236a7decbfc195c07..2c741f56fe2dbb31a5105de1dfa414f4caacfdee 100644
--- a/nifty_gridder.cc
+++ b/nifty_gridder.cc
@@ -659,6 +659,41 @@ template<typename T> class GridderConfig
       return res;
       }
 
+    pyarr_c<T> grid2dirty_wstacking(const pyarr_c<complex<T>> &grid, double w) const
+      {
+      checkArray(grid, "grid", {nu, nv});
+      auto tmp = makeArray<complex<T>>({nu, nv});
+      auto ptmp = tmp.mutable_data();
+      pocketfft::c2c({nu,nv},{grid.strides(0),grid.strides(1)},
+        {tmp.strides(0), tmp.strides(1)}, {0,1}, pocketfft::BACKWARD,
+        grid.data(), tmp.mutable_data(), T(1), nthreads);
+      auto res = makeArray<T>({nx_dirty, ny_dirty});
+      auto pout = res.mutable_data();
+      {
+        py::gil_scoped_release release;
+#pragma omp parallel num_threads(nthreads)
+{
+#pragma omp for schedule(static)
+        for (size_t i=0; i<nx_dirty; ++i)
+          {
+            double fx = (i-nx_dirty/2.)*Pixsize_x();
+            fx *= fx;
+            for (size_t j=0; j<ny_dirty; ++j)
+              {
+                size_t i2 = nu-nx_dirty/2+i;
+                if (i2>=nu) i2-=nu;
+                size_t j2 = nv-ny_dirty/2+j;
+                if (j2>=nv) j2-=nv;
+                double fy = (j-ny_dirty/2.)*Pixsize_y();
+                auto ws = wscreen(fx, fy*fy, w, true);
+                pout[ny_dirty*i + j] = std::real(ptmp[nv*i2+j2]*cfu[i]*cfv[j]*ws);
+              }
+          }
+ }
+      }
+      return res;
+      }
+
     pyarr_c<T> dirty2grid(const pyarr_c<T> &dirty) const
       {
       checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
@@ -707,6 +742,50 @@ template<typename T> class GridderConfig
       }
       return tmp;
       }
+    inline complex<T> wscreen(double x, double y, double w, bool adjoint) const
+    {
+      constexpr double pi = 3.141592653589793238462643383279502884197;
+      constexpr complex<double> imagunit {0,1};
+      double n = cos(sqrt(x+y));
+      complex<T> phase = 2*pi*w*imagunit*(n-1);
+      if (adjoint) phase *= -1;
+      return exp(phase)/n;
+    }
+    pyarr_c<complex<T>> dirty2grid_wstacking(const pyarr_c<T> &dirty, double w) const
+    {
+      checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
+      auto pdirty = dirty.data();
+      auto tmp = makeArray<complex<T>>({nu, nv});
+      auto ptmp = tmp.mutable_data();
+      pocketfft::stride_t strides{tmp.strides(0),tmp.strides(1)};
+      {
+        py::gil_scoped_release release;
+        for (size_t i=0; i<nu*nv; ++i)
+          ptmp[i] = 0.;
+#pragma omp parallel num_threads(nthreads)
+        {
+#pragma omp for schedule(static)
+        for (size_t i=0; i<nx_dirty; ++i)
+          {
+            double fx = (i-nx_dirty/2.)*Pixsize_x();
+            fx *= fx;
+            for (size_t j=0; j<ny_dirty; ++j)
+              {
+                size_t i2 = nu-nx_dirty/2+i;
+                if (i2>=nu) i2-=nu;
+                size_t j2 = nv-ny_dirty/2+j;
+                if (j2>=nv) j2-=nv;
+                double fy = (j-ny_dirty/2.)*Pixsize_y();
+                auto ws = wscreen(fx, fy*fy, w, false);
+                ptmp[nv*i2+j2] = std::real(pdirty[ny_dirty*i + j])*ws*cfu[i]*cfv[j];
+              }
+          }
+ }
+        pocketfft::c2c({nu,nv}, strides, strides, {0,1}, pocketfft::FORWARD,
+                       ptmp, ptmp, T(1), nthreads);
+      }
+      return tmp;
+    }
     inline void getpix(T u_in, T v_in, T &u, T &v, int &iu0, int &iv0) const
       {
       u=fmodulo(u_in*psx, T(1))*nu,
@@ -1481,6 +1560,8 @@ PYBIND11_MODULE(nifty_gridder, m)
     .def("grid2dirty", &GridderConfig<double>::grid2dirty,
       grid2dirty_DS, "grid"_a)
     .def("grid2dirty_c", &GridderConfig<double>::grid2dirty_c, "grid"_a)
+    .def("grid2dirty_wstacking", &GridderConfig<double>::grid2dirty_wstacking, "grid"_a, "w"_a)
+    .def("dirty2grid_wstacking", &GridderConfig<double>::dirty2grid_wstacking, "grid"_a, "w"_a)
     .def("dirty2grid", &GridderConfig<double>::dirty2grid,
       dirty2grid_DS, "dirty"_a)
     .def("dirty2grid_c", &GridderConfig<double>::dirty2grid_c, "dirty"_a)
diff --git a/profile.py b/profile.py
new file mode 100644
index 0000000000000000000000000000000000000000..26a94e725ccbfa4989793506129f4e2e4ced21c1
--- /dev/null
+++ b/profile.py
@@ -0,0 +1,42 @@
+import nifty_gridder as ng
+from time import time
+import numpy as np
+
+
+def _wscreen(npix, dst, w):
+    dc = (np.linspace(start=-npix/2, stop=npix/2 - 1, num=npix)*dst)**2
+    ls = np.broadcast_to(dc, (dc.shape[0],)*2)
+    theta = np.sqrt(ls + ls.T)
+    n = np.cos(theta)
+    wscreen = np.exp(2*np.pi*1j*w*(n - 1))/n
+    return wscreen
+
+
+def time_op(func, x, ntries=5):
+    t0 = time()
+    for ii in range(ntries):
+        print(ii)
+        func(x)
+    return (time() - t0)/ntries
+
+
+if __name__ == '__main__':
+    ng.set_nthreads(4)
+    ntries = 20
+    nx = 2048
+    dx = 12
+    w = 1000.2
+    ny, dy = nx, dx
+    conf = ng.GridderConfig(nx, ny, 1e-7, dx, dy)
+    x, y = conf.Nu(), conf.Nv()
+
+    fld = np.random.randn(x, y) + 1j*np.random.randn(x, y)
+    func0 = lambda x: (_wscreen(nx, dy, w).conjugate()*conf.grid2dirty_c(x)
+                       ).real
+    func1 = lambda x: conf.grid2dirty_wstacking(x, w)
+    print(time_op(func0, fld), time_op(func1, fld))
+
+    fld = np.random.randn(nx, ny)
+    func0 = lambda x: conf.dirty2grid_c(_wscreen(nx, dx, w)*x.real)
+    func1 = lambda x: conf.dirty2grid_wstacking(x, w)
+    print(time_op(func0, fld), time_op(func1, fld))
diff --git a/test.py b/test.py
index 62b5da5a6c568b065d5254dd2a7c29bf241ad561..cbb5334b35ae5e0e7af61e09686db2357a59a6cd 100644
--- a/test.py
+++ b/test.py
@@ -155,3 +155,28 @@ def test_pickling():
     assert_(gc.Pixsize_y() == gc2.Pixsize_y())
 
 
+def _wscreen(npix, dst, w):
+    dc = (np.linspace(start=-npix/2, stop=npix/2 - 1, num=npix)*dst)**2
+    ls = np.broadcast_to(dc, (dc.shape[0],)*2)
+    theta = np.sqrt(ls + ls.T)
+    n = np.cos(theta)
+    wscreen = np.exp(2*np.pi*1j*w*(n - 1))/n
+    return wscreen
+
+
+@pmp('nx', [4, 18, 54])
+@pmp('dx', [1., 0.13, 132])
+@pmp('w', [0, 10, 8489])
+def test_wstacking(nx, dx, w):
+    ny, dy = nx, dx
+    conf = ng.GridderConfig(nx, ny, 1e-7, dx, dy)
+    x, y = conf.Nu(), conf.Nv()
+    fld = np.random.randn(x, y) + 1j*np.random.randn(x, y)
+    res0 = (_wscreen(nx, dy, w).conjugate()*conf.grid2dirty_c(fld)).real
+    res1 = conf.grid2dirty_wstacking(fld, w)
+    np.testing.assert_allclose(res0, res1)
+
+    fld = np.random.randn(nx, ny)
+    res0 = conf.dirty2grid_c(_wscreen(nx, dx, w)*fld.real)
+    res1 = conf.dirty2grid_wstacking(fld, w)
+    np.testing.assert_allclose(res0, res1)