diff --git a/nifty_gridder.cc b/nifty_gridder.cc
index 0ffb5ddde70141d75faeba51f18badab895efe08..7b21869ccbed0ea534d2d179415e5ecbbe65e12f 100644
--- a/nifty_gridder.cc
+++ b/nifty_gridder.cc
@@ -605,6 +605,19 @@ template<typename T> class GridderConfig
       if (adjoint) phase *= -1;
       return complex<T>(cos(phase)*xn, sin(phase)*xn);
       }
+    complex<T> wscreen_nonorm(double x, double y, double w, bool adjoint) const
+      {
+      constexpr double pi = 3.141592653589793238462643383279502884197;
+// PhD : sqrt (1-x*x-y*y)) - 1
+//here 1 - sin(sqrt(x*x+y*y))**2/(1.+cos(sqrt(x*x+y*y)))
+// ==   (cos(sqrt(x*x+y*y))**2 +cos(sqrt(x*x+y*y)))  / (1.+cos(sqrt(x*x+y*y)))
+      double eps = sqrt(x+y);
+      double s = sin(eps);
+      double nm1 = -s*s/(1.+cos(eps));
+      double phase = 2*pi*w*nm1;
+      if (adjoint) phase *= -1;
+      return complex<T>(cos(phase), sin(phase));
+      }
 
   public:
     GridderConfig(size_t nxdirty, size_t nydirty, double epsilon,
@@ -805,8 +818,45 @@ template<typename T> class GridderConfig
         }
 }
       }
-    return res_;
-    }
+      return res_;
+      }
+    pyarr_c<complex<T>> apply_wscreen_nonorm(const pyarr_c<complex<T>> &dirty_, double w, bool adjoint) const
+      {
+      checkArray(dirty_, "dirty", {nx_dirty, ny_dirty});
+      auto dirty = dirty_.data();
+      auto res_ = makeArray<complex<T>>({nx_dirty, ny_dirty});
+      auto res = res_.mutable_data();
+      double x0 = -0.5*nx_dirty*psx,
+             y0 = -0.5*ny_dirty*psy;
+      {
+      py::gil_scoped_release release;
+#pragma omp parallel num_threads(nthreads)
+{
+#pragma omp for schedule(static)
+      for (size_t i=0; i<=nx_dirty/2; ++i)
+        {
+        double fx = x0+i*psx;
+        fx *= fx;
+        for (size_t j=0; j<=ny_dirty/2; ++j)
+          {
+          double fy = y0+j*psy;
+          auto ws = wscreen_nonorm(fx, fy*fy, w, adjoint);
+          res[ny_dirty*i+j] = dirty[ny_dirty*i+j]*ws; // lower left
+          size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
+          if ((i>0)&&(i<i2))
+            {
+            res[ny_dirty*i2+j] = dirty[ny_dirty*i2+j]*ws; // lower right
+            if ((j>0)&&(j<j2))
+              res[ny_dirty*i2+j2] = dirty[ny_dirty*i2+j2]*ws; // upper right
+            }
+          if ((j>0)&&(j<j2))
+            res[ny_dirty*i+j2] = dirty[ny_dirty*i+j2]*ws; // upper left
+          }
+        }
+}
+      }
+      return res_;
+      }
   };
 
 template<typename T, typename T2=complex<T>> class Helper
@@ -1491,16 +1541,28 @@ template<typename T> pyarr_c<complex<T>> vis2dirty_wstack(const Baselines<T> &ba
 cout << "data w range: " << wmin << " to " << wmax << endl;
 
 //FIXME temporary
+double FCT;
 double dw;
 {
   double x0 = -0.5*nx_dirty*psx,
          y0 = -0.5*ny_dirty*psy;
-  double nmin = sqrt(1.-x0*x0-y0*y0)-1.;
+  double nmin = sqrt(max(1.-x0*x0-y0*y0,0.))-1.;
+  cout << abs(-0.5) << endl;
+  cout << nmin << endl;
+  FCT=0.25/nmin;
   dw = 0.25/abs(nmin);
+{
+double nwtmp=(wmax-wmin)/dw;
+double dntmp=abs(nmin)/nwtmp;
+cout << "NW: " << nwtmp << endl;
+cout << "DN: " << dntmp << endl;
+}
+  double du=abs(1./x0)*nx_dirty;
+  cout << "TEST: " << du << " " << dw << " " << du/dw<< endl;
 }
 cout << "delta w: " << dw << endl;
 
-  double w_eps=1e-7; // FIXME
+  double w_eps=1e-12; // FIXME
   auto w_supp = get_supp(w_eps);
 cout << "w_supp: " << w_supp << endl;
   auto beta=2.3*w_supp;
@@ -1512,9 +1574,11 @@ cout << "nplanes: " << nplanes << endl;
   vector<size_t> nvis_plane(nplanes,0);
   for (size_t ipart=0; ipart<nvis; ++ipart)
     {
-    int iplane = int(wval[ipart]-wmin)/dw;
-    for (int i=max<int>(0,iplane-w_supp); i<min<int>(nplanes,iplane+w_supp+1); ++i)
+    for (int i=0; i<nplanes; ++i)
       if (abs(wval[ipart]-(wmin+i*dw))<dwmax) ++nvis_plane[i];
+//    int iplane = int(wval[ipart]-wmin)/dw;
+//    for (int i=max<int>(0,iplane-w_supp); i<min<int>(nplanes,iplane+w_supp+1); ++i)
+//      if (abs(wval[ipart]-(wmin+i*dw))<dwmax) ++nvis_plane[i];
     }
 cout << "nvis/plane: ";
 for (auto nv:nvis_plane) cout << nv << " ";
@@ -1534,60 +1598,71 @@ cout << "working on w plane #" << iw << endl;
     pyarr<uint32_t> idx_loc_ = makeArray<uint32_t>({nvis_plane[iw]});
     auto idx_loc = idx_loc_.mutable_data();
     for (size_t ipart=0; ipart<nvis; ++ipart)
-      if (abs(wcur-wval[ipart]) < dwmax)
+{
+//cout << abs(wval[ipart]-wcur) - dwmax << endl;
+      if (abs(wval[ipart]-wcur) < dwmax)
         {
-        double x=2./(w_supp*dw)*abs(wcur-wval[ipart]);
+//cout << abs(wval[ipart]-wcur) << " " << dwmax<< endl;
 myassert(cnt<nvis_plane[iw],"must not happen");
+        double x=2./(w_supp*dw)*abs(wcur-wval[ipart]);
+        cout << x << " " <<exp(beta*(sqrt(1.-x*x)-1.)) <<  endl;
         vis_loc[cnt] = vis[ipart]*exp(beta*(sqrt(1.-x*x)-1.));
         idx_loc[cnt] = idx[ipart];
         ++cnt;
         }
+}
 myassert(cnt==nvis_plane[iw],"must not happen 2");
-    auto dirty_ = gconf.apply_wscreen(
+    auto dirty_ = gconf.apply_wscreen_nonorm(
       gconf.grid2dirty_c(vis2grid_c(baselines, gconf, idx_loc_, vis_loc_, None, None)), wcur, false);
     auto dirty = dirty_.data();
     for (ptrdiff_t i=0; i<accum_.size(); ++i)
       accum[i] += dirty[i];
     }
-
+//return accum_;
   // correct for w gridding
   {
+auto corr = correction_factors(1000,1000,w_supp);
+auto maxn = abs(sqrt(1.- (nx_dirty/2*psx)*(nx_dirty/2*psx))-1.);
+cout << "psx: " << psx << endl;
+cout << "maxn:" << maxn << " " << corr[0] << " " << corr[int(maxn*1000)] << endl;
   py::gil_scoped_release release;
 cout << "applying correction for gridding in w direction" << endl;
 
   constexpr double pi = 3.141592653589793238462643383279502884197;
-  double x0 = -0.5*nx_dirty*psx,
-         y0 = -0.5*ny_dirty*psy;
   auto beta = 2.3*w_supp;
-  auto p = int(1.5*w_supp+2);
+  auto p = int(3*w_supp+2);
   vector<double> x, wgt;
   legendre_prep(2*p,x,wgt);
   auto psi = x;
   for (auto &v:psi)
     v = exp(beta*(sqrt(1-v*v)-1.));
+  double x0 = -0.5*nx_dirty*psx,
+         y0 = -0.5*ny_dirty*psy;
 #pragma omp parallel num_threads(nthreads)
 {
 #pragma omp for schedule(static)
   for (size_t i=0; i<=nx_dirty/2; ++i)
     {
-//    double fx = x0+i*psx;
-    double fx = -0.5+i*1./nx_dirty;
+    double fx = (-0.25+i*.5/nx_dirty);
+fx = x0+i*psx;
     fx *= fx;
     for (size_t j=0; j<=ny_dirty/2; ++j)
       {
-//      double fy = y0+j*psy;
-      double fy = -0.5+j*1./ny_dirty;
+      double fy = (-0.25+j*.5/ny_dirty);
+fy = y0+j*psy;
       fy*=fy;
-  //    auto n=sqrt(1.-fx-fy)-1.;
-      auto n=cos(sqrt(fx+fy)) -1;
-//      cout << n << endl;
+if (i==0 && j==0)
+  cout << fx << " " << fy << " " << sqrt(1.-fx-fy)-1. << endl;
+if (i==nx_dirty/2 && j==ny_dirty/2)
+  cout << fx << " " << fy << endl;
+      auto n=sqrt(1.-fx-fy)-1.; // circle profile
+//      auto n=cos(sqrt(fx+fy)) -1; // cosine profile
+n*=FCT;//[4.03;4.05] for cosine profile
+//n*=xfct;//[3.8;3.9] for circle profile
       double fct = 0.;
-      for (int i=0; i<p; ++i)
-        fct += wgt[i]*psi[i]*cos(pi*w_supp*n*x[i]);
-//      cout << fct << endl;
+      for (int ix=0; ix<p; ++ix)
+        fct += wgt[ix]*psi[ix]*cos(pi*w_supp*n*x[ix]);
       fct = 1./(w_supp*fct);
-      if ((i==nx_dirty/2) &&(j==ny_dirty/2))
-        cout << i << " " << j << " " << fct << endl;
       size_t i2 = nx_dirty-i, j2 = ny_dirty-j;
       accum[ny_dirty*i+j]*=fct;
       if ((i>0)&&(i<i2))