diff --git a/nifty_gridder.cc b/nifty_gridder.cc
index 2c741f56fe2dbb31a5105de1dfa414f4caacfdee..d7a1c71baf9c2545fc0d3bc69583b15b5a81f674 100644
--- a/nifty_gridder.cc
+++ b/nifty_gridder.cc
@@ -659,41 +659,6 @@ template<typename T> class GridderConfig
       return res;
       }
 
-    pyarr_c<T> grid2dirty_wstacking(const pyarr_c<complex<T>> &grid, double w) const
-      {
-      checkArray(grid, "grid", {nu, nv});
-      auto tmp = makeArray<complex<T>>({nu, nv});
-      auto ptmp = tmp.mutable_data();
-      pocketfft::c2c({nu,nv},{grid.strides(0),grid.strides(1)},
-        {tmp.strides(0), tmp.strides(1)}, {0,1}, pocketfft::BACKWARD,
-        grid.data(), tmp.mutable_data(), T(1), nthreads);
-      auto res = makeArray<T>({nx_dirty, ny_dirty});
-      auto pout = res.mutable_data();
-      {
-        py::gil_scoped_release release;
-#pragma omp parallel num_threads(nthreads)
-{
-#pragma omp for schedule(static)
-        for (size_t i=0; i<nx_dirty; ++i)
-          {
-            double fx = (i-nx_dirty/2.)*Pixsize_x();
-            fx *= fx;
-            for (size_t j=0; j<ny_dirty; ++j)
-              {
-                size_t i2 = nu-nx_dirty/2+i;
-                if (i2>=nu) i2-=nu;
-                size_t j2 = nv-ny_dirty/2+j;
-                if (j2>=nv) j2-=nv;
-                double fy = (j-ny_dirty/2.)*Pixsize_y();
-                auto ws = wscreen(fx, fy*fy, w, true);
-                pout[ny_dirty*i + j] = std::real(ptmp[nv*i2+j2]*cfu[i]*cfv[j]*ws);
-              }
-          }
- }
-      }
-      return res;
-      }
-
     pyarr_c<T> dirty2grid(const pyarr_c<T> &dirty) const
       {
       checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
@@ -742,50 +707,14 @@ template<typename T> class GridderConfig
       }
       return tmp;
       }
-    inline complex<T> wscreen(double x, double y, double w, bool adjoint) const
-    {
+    complex<T> wscreen(double x, double y, double w, bool adjoint) const
+      {
       constexpr double pi = 3.141592653589793238462643383279502884197;
-      constexpr complex<double> imagunit {0,1};
-      double n = cos(sqrt(x+y));
-      complex<T> phase = 2*pi*w*imagunit*(n-1);
+      double n = cos(sqrt(x+y)), xn = 1./n;
+      double phase = 2*pi*w*(n-1);
       if (adjoint) phase *= -1;
-      return exp(phase)/n;
-    }
-    pyarr_c<complex<T>> dirty2grid_wstacking(const pyarr_c<T> &dirty, double w) const
-    {
-      checkArray(dirty, "dirty", {nx_dirty, ny_dirty});
-      auto pdirty = dirty.data();
-      auto tmp = makeArray<complex<T>>({nu, nv});
-      auto ptmp = tmp.mutable_data();
-      pocketfft::stride_t strides{tmp.strides(0),tmp.strides(1)};
-      {
-        py::gil_scoped_release release;
-        for (size_t i=0; i<nu*nv; ++i)
-          ptmp[i] = 0.;
-#pragma omp parallel num_threads(nthreads)
-        {
-#pragma omp for schedule(static)
-        for (size_t i=0; i<nx_dirty; ++i)
-          {
-            double fx = (i-nx_dirty/2.)*Pixsize_x();
-            fx *= fx;
-            for (size_t j=0; j<ny_dirty; ++j)
-              {
-                size_t i2 = nu-nx_dirty/2+i;
-                if (i2>=nu) i2-=nu;
-                size_t j2 = nv-ny_dirty/2+j;
-                if (j2>=nv) j2-=nv;
-                double fy = (j-ny_dirty/2.)*Pixsize_y();
-                auto ws = wscreen(fx, fy*fy, w, false);
-                ptmp[nv*i2+j2] = std::real(pdirty[ny_dirty*i + j])*ws*cfu[i]*cfv[j];
-              }
-          }
- }
-        pocketfft::c2c({nu,nv}, strides, strides, {0,1}, pocketfft::FORWARD,
-                       ptmp, ptmp, T(1), nthreads);
+      return complex<T>(cos(phase)*xn, sin(phase)*xn);
       }
-      return tmp;
-    }
     inline void getpix(T u_in, T v_in, T &u, T &v, int &iu0, int &iv0) const
       {
       u=fmodulo(u_in*psx, T(1))*nu,
@@ -795,6 +724,40 @@ template<typename T> class GridderConfig
       iv0 = int(v-w*0.5 + 1 + nv) - nv;
       if (iv0+w>nv+nsafe) iv0 = nv+nsafe-w;
       }
+    pyarr_c<complex<T>> apply_wscreen(const pyarr_c<complex<T>> &dirty_, double w, bool adjoint) const
+      {
+      checkArray(dirty_, "dirty", {nx_dirty, ny_dirty});
+      auto dirty = dirty_.data();
+      auto res_ = makeArray<complex<T>>({nx_dirty, ny_dirty});
+      auto res = res_.mutable_data();
+      double x0 = -0.5*nx_dirty*psx,
+             y0 = -0.5*ny_dirty*psy;
+      {
+      py::gil_scoped_release release;
+#pragma omp parallel num_threads(nthreads)
+{
+#pragma omp for schedule(static)
+      for (size_t i=0; i<=nx_dirty/2; ++i)
+        {
+        double fx = x0+i*psx;
+        fx *= fx;
+        for (size_t j=0; j<=ny_dirty/2; ++j)
+          {
+          double fy = y0+j*psy;
+          auto ws = wscreen(fx, fy*fy, w, adjoint);
+          res[ny_dirty*i+j] = dirty[ny_dirty*i+j]*ws;
+          if (i>0)
+            res[ny_dirty*(nx_dirty-i)+j] = dirty[ny_dirty*(nx_dirty-i)+j]*ws;
+          if (j>0)
+            res[ny_dirty*i+(ny_dirty-j)] = dirty[ny_dirty*i+(ny_dirty-j)]*ws;
+          if ((i>0)&&(j>0))
+            res[ny_dirty*(nx_dirty-i)+(ny_dirty-j)] = dirty[ny_dirty*(nx_dirty-i)+(ny_dirty-j)]*ws;
+          }
+        }
+}
+      }
+    return res_;
+    }
   };
 
 template<typename T, typename T2=complex<T>> class Helper
@@ -1560,11 +1523,10 @@ PYBIND11_MODULE(nifty_gridder, m)
     .def("grid2dirty", &GridderConfig<double>::grid2dirty,
       grid2dirty_DS, "grid"_a)
     .def("grid2dirty_c", &GridderConfig<double>::grid2dirty_c, "grid"_a)
-    .def("grid2dirty_wstacking", &GridderConfig<double>::grid2dirty_wstacking, "grid"_a, "w"_a)
-    .def("dirty2grid_wstacking", &GridderConfig<double>::dirty2grid_wstacking, "grid"_a, "w"_a)
     .def("dirty2grid", &GridderConfig<double>::dirty2grid,
       dirty2grid_DS, "dirty"_a)
     .def("dirty2grid_c", &GridderConfig<double>::dirty2grid_c, "dirty"_a)
+    .def("apply_wscreen", &GridderConfig<double>::apply_wscreen, "dirty"_a, "w"_a, "adjoint"_a)
 
     // pickle support
     .def(py::pickle(
diff --git a/profile.py b/profile.py
index 26a94e725ccbfa4989793506129f4e2e4ced21c1..dcb1d40dfaccdc732ef74798ca25df9e96a9dceb 100644
--- a/profile.py
+++ b/profile.py
@@ -28,15 +28,14 @@ if __name__ == '__main__':
     w = 1000.2
     ny, dy = nx, dx
     conf = ng.GridderConfig(nx, ny, 1e-7, dx, dy)
-    x, y = conf.Nu(), conf.Nv()
+    x, y = conf.Nxdirty(), conf.Nydirty()
 
     fld = np.random.randn(x, y) + 1j*np.random.randn(x, y)
-    func0 = lambda x: (_wscreen(nx, dy, w).conjugate()*conf.grid2dirty_c(x)
-                       ).real
-    func1 = lambda x: conf.grid2dirty_wstacking(x, w)
+    func0 = lambda x: _wscreen(nx, dy, w).conjugate()*x
+    func1 = lambda x: conf.apply_wscreen(x, w, True)
     print(time_op(func0, fld), time_op(func1, fld))
 
     fld = np.random.randn(nx, ny)
-    func0 = lambda x: conf.dirty2grid_c(_wscreen(nx, dx, w)*x.real)
-    func1 = lambda x: conf.dirty2grid_wstacking(x, w)
+    func0 = lambda x: _wscreen(nx, dx, w)*x.real
+    func1 = lambda x: conf.apply_wscreen(x, w, False)
     print(time_op(func0, fld), time_op(func1, fld))
diff --git a/test.py b/test.py
index cbb5334b35ae5e0e7af61e09686db2357a59a6cd..356ff0e4713eb785aa98aae48e377db2a96bab17 100644
--- a/test.py
+++ b/test.py
@@ -173,10 +173,10 @@ def test_wstacking(nx, dx, w):
     x, y = conf.Nu(), conf.Nv()
     fld = np.random.randn(x, y) + 1j*np.random.randn(x, y)
     res0 = (_wscreen(nx, dy, w).conjugate()*conf.grid2dirty_c(fld)).real
-    res1 = conf.grid2dirty_wstacking(fld, w)
+    res1 = conf.apply_wscreen(conf.grid2dirty_c(fld), w, True).real
     np.testing.assert_allclose(res0, res1)
 
     fld = np.random.randn(nx, ny)
     res0 = conf.dirty2grid_c(_wscreen(nx, dx, w)*fld.real)
-    res1 = conf.dirty2grid_wstacking(fld, w)
+    res1 = conf.dirty2grid_c(conf.apply_wscreen(fld, w, False))
     np.testing.assert_allclose(res0, res1)