diff --git a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
index 8393c6815144278fde4250c08b0c68b0b940044f..985bfaffb03508e56ba58399e18694cc1ee9ac24 100644
--- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
+++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
@@ -38,38 +38,59 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
 
 }
 
+
+/**
+ * @brief Integrates output power from complex voltage input.
+ *
+ * @param in            Pointer to input data of size nsamps.
+ * @param out           Pointer to output data of size nsamps / (naccumualte * nchans) * nchans
+ * @param nchans        Number of channels
+ * @param naccumulate   Number of input spectra to integrate into one output spectrum
+ * @param scale         Normalization constant for the utput spectrum
+ * @param offset        Output is shifted by this value
+ * @param stride        Stride of the output spectra. For 2, leave a gap of
+ * nchan between every output spetrum, for 1 no gap.
+ * @param out_offset
+ *
+ *
+ */
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
-    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
+    size_t nchans, size_t nsamps, size_t naccumulate, float scale, float offset, int stride, int out_offset)
 {
 
     const int nb = naccumulate / blockDim.x + 1;
     const int bs = blockDim.x;
-    const int number_of_spectra = nsamps /( nchans * naccumulate);
+    const int number_of_spectra = nsamps / (nchans * naccumulate);
 
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
-    {
-      const size_t bn = i / nchans / number_of_spectra;
-      const size_t currentOutputSpectra = i / nchans;
-      const size_t currentChannel = i % nchans;
+    for (size_t current_spectrum =0; current_spectrum < number_of_spectra; current_spectrum++)
+    { // I could just launch more blocks in a second dimension
+        for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb); i += blockDim.x * gridDim.x)
+        {
+          const size_t channel_number = i % nchans;
+          const size_t bn = i / nchans;
 
-      double sum = 0;
-      for (size_t k = 0; k < bs; k++)
-      {
-        size_t j = k + bn * bs;
-        if (j >= naccumulate)
-            break;
+          double sum = 0;
+          for (size_t k = 0; k < bs; k++)
+          {
+            int cidx = k + bn * bs;
 
-        float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
-        double x = tmp.x * tmp.x;
-        double y = tmp.y * tmp.y;
-        sum += x + y;
-      }
-      size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
+            if (cidx >= naccumulate)
+                break;
 
-      atomicAdd(&out[toff + currentChannel], ((sum - offset)/scale));
+            const float2 tmp = in[channel_number + cidx * nchans + current_spectrum * naccumulate * nchans];
+
+            double x = tmp.x * tmp.x;
+            double y = tmp.y * tmp.y;
+            sum += x + y;
+          }
+          size_t toff = out_offset * nchans + current_spectrum * nchans * stride;
+
+          atomicAdd(&out[toff + channel_number], ((sum - offset)/scale));
+        }
     }
+
 }
 
 
diff --git a/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu b/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu
index fdcacff20a361028cb758a81326e0bf79d33ba02..7fbd556e63c45e41fc4da0b75d715c5cdc4d929e 100644
--- a/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu
@@ -99,6 +99,112 @@ TEST_F(DetectorAccumulatorTester, noise_test)
     compare_against_host(gpu_output, host_output);
 }
 
+
+// parameters for the detector accumulator test
+struct detect_accumulate_params
+{
+    size_t nchan;       // umber of channels
+    size_t nspectra;    // number of output spectra
+    size_t naccumulate; // number of spectra to accumulate
+};
+
+
+class detect_and_accumulate32bit: public testing::TestWithParam<detect_accumulate_params> {
+// Test that the kernel executes in certain parameter
+// settings
+};
+
+
+TEST_P(detect_and_accumulate32bit, no_stride)
+{
+
+    thrust::device_vector<float2> input;
+    thrust::device_vector<float> output;
+
+    detect_accumulate_params params = GetParam();
+
+    input.resize(params.nspectra * params.nchan * params.naccumulate);
+    output.resize(params.nspectra * params.nchan);
+    float2 v;
+    v.x = 1.0;
+    v.y = 1.0;
+    thrust::fill(input.begin(), input.end(), v);
+    thrust::fill(output.begin(), output.end(), 0.);
+
+    kernels::detect_and_accumulate<float> <<<1024, 1024>>>(
+            thrust::raw_pointer_cast(input.data()),
+            thrust::raw_pointer_cast(output.data()),
+            params.nchan,
+            input.size(),
+            params.naccumulate,
+            1, 0., 1, 0);
+
+    cudaDeviceSynchronize();
+
+    thrust::host_vector<float> output_host = output;
+
+    for (size_t i =0; i < output.size(); i++)
+    {
+        ASSERT_FLOAT_EQ(output_host[i], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate;
+    }
+}
+
+TEST_P(detect_and_accumulate32bit, w_stride)
+{
+
+    thrust::device_vector<float2> input;
+    thrust::device_vector<float> output;
+
+    detect_accumulate_params params = GetParam();
+
+    input.resize(params.nspectra * params.nchan * params.naccumulate);
+    output.resize(params.nspectra * params.nchan * 2);
+    float2 v;
+    v.x = 1.0;
+    v.y = 1.0;
+    thrust::fill(input.begin(), input.end(), v);
+    thrust::fill(output.begin(), output.end(), 0.);
+
+    kernels::detect_and_accumulate<float> <<<1024, 1024>>>(
+            thrust::raw_pointer_cast(input.data()),
+            thrust::raw_pointer_cast(output.data()),
+            params.nchan,
+            input.size(),
+            params.naccumulate,
+            1, 0., 2, 0);
+
+    cudaDeviceSynchronize();
+
+    thrust::host_vector<float> output_host = output;
+
+    for (size_t i =0; i < params.nchan * params.nspectra; i++)
+    {
+        size_t current_spectrum = i / params.nchan;
+        ASSERT_FLOAT_EQ(output_host[i + current_spectrum * params.nchan], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate;
+    }
+}
+
+
+
+
+
+
+INSTANTIATE_TEST_CASE_P (DetectorAccumulatorTester,
+        detect_and_accumulate32bit,
+        testing::Values(
+          // nchan; nspectra; naccumulate;
+          detect_accumulate_params({1024, 1,  128}),
+          detect_accumulate_params({1024, 1,  1024}),
+          detect_accumulate_params({1024, 1,  2048}),
+          detect_accumulate_params({1024, 1,  16384}),
+          detect_accumulate_params({1024, 16,  1024}),
+          detect_accumulate_params({8388608, 2,  16}),
+          detect_accumulate_params({8388608, 1,  32})
+            )
+        );
+
+
+
 } //namespace test
 } //namespace edd
 } //namespace meerkat
diff --git a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
index 52b6f0d5e4b700041fc2852cc04eb85b1d31b563..db68c4ece82b084c51dc55c16f1cba7ab18268e9 100644
--- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
@@ -146,6 +146,7 @@ TEST(GatedSpectrometer, stokes_accumulate)
     testStokesAccumulateParam(8 * 1024 * 1024 + 1, 32);
     testStokesAccumulateParam(1024 + 1, 5);
     testStokesAccumulateParam(1024 + 1, 64);
+    testStokesAccumulateParam(1024, 65536);
 }