Merge branch 'detect_accumualte_fix' into 'devel'

Detect accumulate fix See merge request !14

Merge branch 'detect_accumualte_fix' into 'devel'
de4c3c1c · Tobias Winchen · c673c39a · 6ab413cc · de4c3c1c · de4c3c1c
Commit de4c3c1c authored Jun 28, 2021 by Tobias Winchen
--- a/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
+++ b/psrdada_cpp/effelsberg/edd/DetectorAccumulator.cuh
@@ -38,40 +38,61 @@ void detect_and_accumulate(float2 const* __restrict__ in, int8_t* __restrict__ o
 }
+/**
+ * @brief Integrates output power from complex voltage input.
+ *
+ * @param in            Pointer to input data of size nsamps.
+ * @param out           Pointer to output data of size nsamps / (naccumualte * nchans) * nchans
+ * @param nchans        Number of channels
+ * @param naccumulate   Number of input spectra to integrate into one output spectrum
+ * @param scale         Normalization constant for the utput spectrum
+ * @param offset        Output is shifted by this value
+ * @param stride        Stride of the output spectra. For 2, leave a gap of
+ * nchan between every output spetrum, for 1 no gap.
+ * @param out_offset
+ *
+ *
+ */
 template <typename T>
 __global__
 void detect_and_accumulate(float2 const* __restrict__ in, float* __restrict__ out,
-    int nchans, int nsamps, int naccumulate, float scale, float offset, int stride, int out_offset)
+    size_t nchans, size_t nsamps, size_t naccumulate, float scale, float offset, int stride, int out_offset)
 {
    const int nb = naccumulate / blockDim.x + 1;
    const int bs = blockDim.x;
    const int number_of_spectra = nsamps / (nchans * naccumulate);
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nsamps * nchans / naccumulate * nb); i += blockDim.x * gridDim.x)
+    for (size_t current_spectrum =0; current_spectrum < number_of_spectra; current_spectrum++)
+    { // I could just launch more blocks in a second dimension
+        for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb); i += blockDim.x * gridDim.x)
        {
-      const size_t bn = i / nchans / number_of_spectra;
+          const size_t channel_number = i % nchans;
-      const size_t currentOutputSpectra = i / nchans;
+          const size_t bn = i / nchans;
-      const size_t currentChannel = i % nchans;
          double sum = 0;
          for (size_t k = 0; k < bs; k++)
          {
-        size_t j = k + bn * bs;
+            int cidx = k + bn * bs;
-        if (j >= naccumulate)
+            if (cidx >= naccumulate)
                break;
-        float2 tmp = in[ j * nchans + currentOutputSpectra * nchans * naccumulate + currentChannel];
+            const float2 tmp = in[channel_number + cidx * nchans + current_spectrum * naccumulate * nchans];
            double x = tmp.x * tmp.x;
            double y = tmp.y * tmp.y;
            sum += x + y;
          }
-      size_t toff = out_offset * nchans + currentOutputSpectra * nchans * stride;
+          size_t toff = out_offset * nchans + current_spectrum * nchans * stride;
-      atomicAdd(&out[toff + currentChannel], ((sum - offset)/scale));
+          atomicAdd(&out[toff + channel_number], ((sum - offset)/scale));
        }
    }
+}


--- a/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/DetectorAccumulatorTester.cu
@@ -99,6 +99,112 @@ TEST_F(DetectorAccumulatorTester, noise_test)
    compare_against_host(gpu_output, host_output);
 }
+// parameters for the detector accumulator test
+struct detect_accumulate_params
+{
+    size_t nchan;       // umber of channels
+    size_t nspectra;    // number of output spectra
+    size_t naccumulate; // number of spectra to accumulate
+};
+class detect_and_accumulate32bit: public testing::TestWithParam<detect_accumulate_params> {
+// Test that the kernel executes in certain parameter
+// settings
+};
+TEST_P(detect_and_accumulate32bit, no_stride)
+{
+    thrust::device_vector<float2> input;
+    thrust::device_vector<float> output;
+    detect_accumulate_params params = GetParam();
+    input.resize(params.nspectra * params.nchan * params.naccumulate);
+    output.resize(params.nspectra * params.nchan);
+    float2 v;
+    v.x = 1.0;
+    v.y = 1.0;
+    thrust::fill(input.begin(), input.end(), v);
+    thrust::fill(output.begin(), output.end(), 0.);
+    kernels::detect_and_accumulate<float> <<<1024, 1024>>>(
+            thrust::raw_pointer_cast(input.data()),
+            thrust::raw_pointer_cast(output.data()),
+            params.nchan,
+            input.size(),
+            params.naccumulate,
+            1, 0., 1, 0);
+    cudaDeviceSynchronize();
+    thrust::host_vector<float> output_host = output;
+    for (size_t i =0; i < output.size(); i++)
+    {
+        ASSERT_FLOAT_EQ(output_host[i], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate;
+    }
+}
+TEST_P(detect_and_accumulate32bit, w_stride)
+{
+    thrust::device_vector<float2> input;
+    thrust::device_vector<float> output;
+    detect_accumulate_params params = GetParam();
+    input.resize(params.nspectra * params.nchan * params.naccumulate);
+    output.resize(params.nspectra * params.nchan * 2);
+    float2 v;
+    v.x = 1.0;
+    v.y = 1.0;
+    thrust::fill(input.begin(), input.end(), v);
+    thrust::fill(output.begin(), output.end(), 0.);
+    kernels::detect_and_accumulate<float> <<<1024, 1024>>>(
+            thrust::raw_pointer_cast(input.data()),
+            thrust::raw_pointer_cast(output.data()),
+            params.nchan,
+            input.size(),
+            params.naccumulate,
+            1, 0., 2, 0);
+    cudaDeviceSynchronize();
+    thrust::host_vector<float> output_host = output;
+    for (size_t i =0; i < params.nchan * params.nspectra; i++)
+    {
+        size_t current_spectrum = i / params.nchan;
+        ASSERT_FLOAT_EQ(output_host[i + current_spectrum * params.nchan], (v.x * v.x + v.y * v.y) * params.naccumulate) << "i = " << i << " for nchan = " << params.nchan << ", nspectra = " << params.nspectra << ", naccumulate = " << params.naccumulate;
+    }
+}
+INSTANTIATE_TEST_CASE_P (DetectorAccumulatorTester,
+        detect_and_accumulate32bit,
+        testing::Values(
+          // nchan; nspectra; naccumulate;
+          detect_accumulate_params({1024, 1,  128}),
+          detect_accumulate_params({1024, 1,  1024}),
+          detect_accumulate_params({1024, 1,  2048}),
+          detect_accumulate_params({1024, 1,  16384}),
+          detect_accumulate_params({1024, 16,  1024}),
+          detect_accumulate_params({8388608, 2,  16}),
+          detect_accumulate_params({8388608, 1,  32})
+            )
+        );
 } //namespace test
 } //namespace edd
 } //namespace meerkat


--- a/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
+++ b/psrdada_cpp/effelsberg/edd/test/src/GatedSpectrometerTest.cu
@@ -146,6 +146,7 @@ TEST(GatedSpectrometer, stokes_accumulate)
    testStokesAccumulateParam(8 * 1024 * 1024 + 1, 32);
    testStokesAccumulateParam(1024 + 1, 5);
    testStokesAccumulateParam(1024 + 1, 64);
+    testStokesAccumulateParam(1024, 65536);
 }