From c683d17fb858db959b606f916d0ff6473b01274a Mon Sep 17 00:00:00 2001 From: Tobias Winchen <tobias.winchen@rwth-aachen.de> Date: Wed, 14 Apr 2021 07:05:15 +0000 Subject: [PATCH] IMproved speed in stokes accumulate for low channel number --- .../effelsberg/edd/src/GatedSpectrometer.cu | 61 +++++++++++-------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu index cfac0913..016bce66 100644 --- a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu +++ b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu @@ -137,35 +137,42 @@ __host__ __device__ void stokes_IQUV(const float2 &p1, const float2 &p2, float & * @brief calculate stokes IQUV spectra pol1, pol2 are arrays of naccumulate * complex spectra for individual polarizations */ -__global__ void stokes_accumulate(float2 const __restrict__ *pol1, - float2 const __restrict__ *pol2, float *I, float* Q, float *U, float*V, - int nchans, int naccumulate) -{ +__global__ void stokes_accumulate(float2 const *__restrict__ pol1, + float2 const *__restrict__ pol2, float *I, + float *Q, float *U, float *V, int nchans, + int naccumulate) { + const int nb = naccumulate / blockDim.x + 1; + const int bs = blockDim.x; + + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb); + i += blockDim.x * gridDim.x) { + const size_t channel_number = i % nchans; + const size_t bn = i / nchans; + + float rI = 0; + float rQ = 0; + float rU = 0; + float rV = 0; + + for (int k = 0; k < bs; k++) { + int cidx = k + bn * bs; + if (cidx >= naccumulate) + break; + + const float2 p1 = pol1[channel_number + cidx * nchans]; + const float2 p2 = pol2[channel_number + cidx * nchans]; + + rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y); + rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y); + rU += 2.f * (p1.x * p2.x + p1.y * p2.y); + rV += -2.f * (p1.y * p2.x - p1.x * p2.y); + } - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans); - i += blockDim.x * gridDim.x) - { - float rI = 0; - float rQ = 0; - float rU = 0; - float rV = 0; - - for (int k=0; k < naccumulate; k++) - { - const float2 p1 = pol1[i + k * nchans]; - const float2 p2 = pol2[i + k * nchans]; - - rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y); - rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y); - rU += 2.f * (p1.x * p2.x + p1.y * p2.y); - rV += -2.f * (p1.y * p2.x - p1.x * p2.y); - } - I[i] += rI; - Q[i] += rQ; - U[i] += rU; - V[i] += rV; + atomicAdd(&I[channel_number], rI); + atomicAdd(&Q[channel_number], rQ); + atomicAdd(&U[channel_number], rU); + atomicAdd(&V[channel_number], rV); } - } -- GitLab