diff --git a/gridder_cxx.h b/gridder_cxx.h
index fc312ce1b915ac9d317410f77cd7b564d65f5e56..af8e9047d980eea0dbe5549052a5638c3a5e8f92 100644
--- a/gridder_cxx.h
+++ b/gridder_cxx.h
@@ -1198,10 +1198,16 @@ template<typename T> void apply_wcorr(const GridderConfig &gconf,
     }
   }
 
+struct idxhelper
+  {
+  uint32_t idx;
+  int minplane;
+  };
+
 template<typename Serv> void wstack_common(
   const GridderConfig &gconf, const Serv &srv, double &wmin,
   double &dw, size_t &nplanes, vector<size_t> &nvis_plane,
-  vector<int> &minplane, size_t verbosity)
+  vector<vector<idxhelper>> &minplane, size_t verbosity)
   {
   size_t nvis = srv.Nvis();
   size_t nthreads = gconf.Nthreads();
@@ -1232,22 +1238,24 @@ template<typename Serv> void wstack_common(
   if (verbosity>0) cout << "Kernel support: " << supp << endl;
   if (verbosity>0) cout << "nplanes: " << nplanes << endl;
   nvis_plane.resize(nplanes,0);
-  minplane.resize(nvis);
-#pragma omp parallel num_threads(nthreads)
-{
-  vector<size_t> nvploc(nplanes,0);
-#pragma omp for
+
+  // FIXME: the bunch size (dbunch = supp) is a heuristic; tuning it may improve memory/speed trade-off
+  int dbunch=supp;
+  int nbunch=(nplanes+dbunch-1)/dbunch;
+  minplane.resize(nbunch);
+  for (auto &pl: minplane)
+    pl.resize(0);
   for (size_t ipart=0; ipart<nvis; ++ipart)
     {
     int plane0 = int((abs(srv.getCoord(ipart).w)-wmin)/dw+0.5*supp)-int(supp-1);
-    minplane[ipart]=plane0;
+    for (int ibunch=0; ibunch<nbunch; ++ibunch)
+      if ((plane0+dbunch>ibunch*dbunch) && (plane0<(ibunch+1)*dbunch))
+        minplane[ibunch].push_back({ipart,plane0});
     for (int i=max<int>(0,plane0); i<min<int>(nplanes,plane0+supp); ++i)
-      ++nvploc[i];
+      ++nvis_plane[i];
     }
-#pragma omp critical
-  for (size_t i=0; i<nplanes; ++i)
-    nvis_plane[i]+=nvploc[i];
-}
+  for(auto &pl:minplane)
+    pl.shrink_to_fit();
   }
 
 template<typename T, typename Serv> void x2dirty(
@@ -1258,21 +1266,20 @@ template<typename T, typename Serv> void x2dirty(
     {
     auto nx_dirty=gconf.Nxdirty();
     auto ny_dirty=gconf.Nydirty();
-    size_t nvis = srv.Nvis();
     auto supp = gconf.Supp();
     size_t nthreads = gconf.Nthreads();
     double wmin;
     double dw;
     size_t nplanes;
     vector<size_t> nvis_plane;
-    vector<int> minplane;
+    vector<vector<idxhelper>> minplane;
     if (verbosity>0) cout << "Gridding using improved w-stacking" << endl;
     wstack_common(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane, verbosity);
 
     dirty.fill(0);
     vector<complex<T>> vis_loc_;
     vector<uint32_t> idx_loc_;
-    vector<uint32_t> newidx;
+    vector<uint32_t> mapper;
     tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()});
     auto grid=grid_.getMav();
     tmpStorage<complex<T>,2> tdirty_(dirty.shape());
@@ -1286,35 +1293,36 @@ template<typename T, typename Serv> void x2dirty(
       double wcur = wmin+iw*dw;
 
       size_t cnt=0;
-      newidx.resize(nvis_plane[iw]);
+      mapper.resize(nvis_plane[iw]);
+      const auto &bla(minplane[iw/supp]);
+      for (size_t i=0; i<bla.size(); ++i)
+        if ((int(iw)>=bla[i].minplane) && (int(iw)<bla[i].minplane+int(supp)))
+          mapper[cnt++] = bla[i].idx;
+      idx_loc_.resize(nvis_plane[iw]);
       if (srv.isMsServ())
         {
-        for (size_t ipart=0; ipart<nvis; ++ipart)
-          if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp))
-            newidx[cnt++] = srv.getIdx(ipart);
+#pragma omp parallel for num_threads(nthreads)
+        for (size_t i=0; i<nvis_plane[iw]; ++i)
+          idx_loc_[i] = srv.getIdx(mapper[i]);
         grid.fill(0);
-        auto newidx2 = const_mav<uint32_t, 1>(newidx.data(), {newidx.size()});
-        Serv newsrv(srv, newidx2);
+        auto idx_loc = const_mav<uint32_t, 1>(idx_loc_.data(), {idx_loc_.size()});
+        Serv newsrv(srv, idx_loc);
         x2grid_c(gconf, newsrv, grid, wcur, dw);
         }
       else
         {
-        for (size_t ipart=0; ipart<nvis; ++ipart)
-          if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp))
-            newidx[cnt++] = ipart;
         vis_loc_.resize(nvis_plane[iw]);
-        auto vis_loc = mav<complex<T>,1>(vis_loc_.data(), {nvis_plane[iw]});
-        idx_loc_.resize(nvis_plane[iw]);
-        auto idx_loc = mav<uint32_t,1>(idx_loc_.data(), {nvis_plane[iw]});
 #pragma omp parallel for num_threads(nthreads)
         for (size_t i=0; i<nvis_plane[iw]; ++i)
           {
-          auto ipart = newidx[i];
-          vis_loc[i] = srv.getVis(ipart);
-          idx_loc[i] = srv.getIdx(ipart);
+          auto ipart = mapper[i];
+          vis_loc_[i] = srv.getVis(ipart);
+          idx_loc_[i] = srv.getIdx(ipart);
           }
         grid.fill(0);
-        vis2grid_c(srv.getBaselines(), gconf, cmav(idx_loc), cmav(vis_loc), grid,
+        auto vis_loc = const_mav<complex<T>,1>(vis_loc_.data(), {nvis_plane[iw]});
+        auto idx_loc = const_mav<uint32_t,1>(idx_loc_.data(), {nvis_plane[iw]});
+        vis2grid_c(srv.getBaselines(), gconf, idx_loc, vis_loc, grid,
           nullmav<T,1>(), wcur, dw);
         }
       gconf.grid2dirty_c(cmav(grid), tdirty);
@@ -1359,14 +1367,13 @@ template<typename T, typename Serv> void dirty2x(
     {
     auto nx_dirty=gconf.Nxdirty();
     auto ny_dirty=gconf.Nydirty();
-    size_t nvis = srv.Nvis();
     auto supp = gconf.Supp();
     size_t nthreads = gconf.Nthreads();
     double wmin;
     double dw;
     size_t nplanes;
     vector<size_t> nvis_plane;
-    vector<int> minplane;
+    vector<vector<idxhelper>> minplane;
     if (verbosity>0) cout << "Degridding using improved w-stacking" << endl;
     wstack_common(gconf, srv, wmin, dw, nplanes, nvis_plane, minplane, verbosity);
 
@@ -1380,7 +1387,7 @@ template<typename T, typename Serv> void dirty2x(
 
     vector<complex<T>> vis_loc_;
     vector<uint32_t> idx_loc_;
-    vector<uint32_t> newidx;
+    vector<uint32_t> mapper;
     tmpStorage<complex<T>,2> grid_({gconf.Nu(),gconf.Nv()});
     auto grid=grid_.getMav();
     tmpStorage<complex<T>,2> tdirty2_({nx_dirty,ny_dirty});
@@ -1400,35 +1407,36 @@ template<typename T, typename Serv> void dirty2x(
       gconf.apply_wscreen(cmav(tdirty2), tdirty2, wcur, false);
       gconf.dirty2grid_c(cmav(tdirty2), grid);
 
-      newidx.resize(nvis_plane[iw]);
       size_t cnt=0;
+      mapper.resize(nvis_plane[iw]);
+      const auto &bla(minplane[iw/supp]);
+      for (size_t i=0; i<bla.size(); ++i)
+        if ((int(iw)>=bla[i].minplane) && (int(iw)<bla[i].minplane+int(supp)))
+          mapper[cnt++] = bla[i].idx;
+      idx_loc_.resize(nvis_plane[iw]);
       if (srv.isMsServ())
         {
-        for (size_t ipart=0; ipart<nvis; ++ipart)
-          if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp))
-            newidx[cnt++] = srv.getIdx(ipart);
-        auto newidx2 = const_mav<uint32_t, 1>(newidx.data(), {newidx.size()});
-        Serv newsrv(srv, newidx2);
+#pragma omp parallel for num_threads(nthreads)
+        for (size_t i=0; i<nvis_plane[iw]; ++i)
+          idx_loc_[i] = srv.getIdx(mapper[i]);
+        auto idx_loc = const_mav<uint32_t, 1>(idx_loc_.data(), {idx_loc_.size()});
+        Serv newsrv(srv, idx_loc);
         grid2x_c(gconf, cmav(grid), newsrv, wcur, dw);
         }
       else
         {
-        for (size_t ipart=0; ipart<nvis; ++ipart)
-          if ((int(iw)>=minplane[ipart]) && (iw<minplane[ipart]+supp))
-            newidx[cnt++] = ipart;
         vis_loc_.resize(nvis_plane[iw]);
         auto vis_loc = mav<complex<T>,1>(vis_loc_.data(), {nvis_plane[iw]});
         vis_loc.fill(0);
-        idx_loc_.resize(nvis_plane[iw]);
         auto idx_loc = mav<uint32_t,1>(idx_loc_.data(), {nvis_plane[iw]});
 #pragma omp parallel for num_threads(nthreads)
         for (size_t i=0; i<nvis_plane[iw]; ++i)
-          idx_loc[i] = srv.getIdx(newidx[i]);
+          idx_loc[i] = srv.getIdx(mapper[i]);
         grid2vis_c(srv.getBaselines(), gconf, cmav(idx_loc), cmav(grid), vis_loc,
           nullmav<T,1>(), wcur, dw);
 #pragma omp parallel for num_threads(nthreads)
         for (size_t i=0; i<nvis_plane[iw]; ++i)
-          srv.addVis(newidx[i], vis_loc[i]);
+          srv.addVis(mapper[i], vis_loc[i]);
         }
       }
     }