diff --git a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
index 6302d105d6dd02c35460e3130648b4ee3322effa..8393c6815144278fde4250c08b0c68b0b940044f 100644
--- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
+++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
@@ -16,7 +16,7 @@ __global__
 void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ out,
     int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
-    // grid stride loop over output array to keep 
+    // grid stride loop over output array to keep
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
     {
       double sum = 0.0f;
@@ -30,36 +30,52 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
         double y = tmp.y * tmp.y;
         sum += x + y;
       }
-      size_t toff = out_offset * nchans + currentOutputSpectra * nchans *stride; 
+      size_t toff = out_offset * nchans + currentOutputSpectra * nchans *stride;
       out[toff + i] += (int8_t) ((sum - offset)/scale);
+      // no atomic add for int8, thus no optimized version here. Atomic add can be
+      // implemented using an int32 atomicAdd and bit shifting, but this needs more effort.
     }
 
 }
 
-
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
     int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
 {
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate); i += blockDim.x * gridDim.x)
+
+    const int nb = naccumulate / blockDim.x + 1;
+    const int bs = blockDim.x;
+    const int number_of_spectra = nsamps /( nchans * naccumulate);
+
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
     {
-      double sum = 0;
-      size_t currentOutputSpectra = i / nchans;
-      size_t currentChannel = i % nchans;
+      const size_t bn = i / nchans / number_of_spectra;
+      const size_t currentOutputSpectra = i / nchans;
+      const size_t currentChannel = i % nchans;
 
-      for (size_t j = 0; j < naccumulate; j++)
+      double sum = 0;
+      for (size_t k = 0; k < bs; k++)
       {
+        size_t j = k + bn * bs;
+        if (j >= naccumulate)
+            break;
+
         float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
         double x = tmp.x * tmp.x;
         double y = tmp.y * tmp.y;
         sum += x + y;
       }
       size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
-      out[i + toff] += sum;
+
+      atomicAdd(&out[toff + currentChannel], ((sum - offset)/scale));
     }
 }
 
+
+
+
+
 } // namespace kernels