diff --git a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu index cfac0913f9b54a9e06c46d73a7c875416d0407db..016bce664e5f462aa8b4499c3c2b332a75312650 100644 --- a/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu +++ b/psrdada_cpp/effelsberg/edd/src/GatedSpectrometer.cu @@ -137,35 +137,42 @@ __host__ __device__ void stokes_IQUV(const float2 &p1, const float2 &p2, float & * @brief calculate stokes IQUV spectra pol1, pol2 are arrays of naccumulate * complex spectra for individual polarizations */ -__global__ void stokes_accumulate(float2 const __restrict__ *pol1, - float2 const __restrict__ *pol2, float *I, float* Q, float *U, float*V, - int nchans, int naccumulate) -{ +__global__ void stokes_accumulate(float2 const *__restrict__ pol1, + float2 const *__restrict__ pol2, float *I, + float *Q, float *U, float *V, int nchans, + int naccumulate) { + const int nb = naccumulate / blockDim.x + 1; + const int bs = blockDim.x; + + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb); + i += blockDim.x * gridDim.x) { + const size_t channel_number = i % nchans; + const size_t bn = i / nchans; + + float rI = 0; + float rQ = 0; + float rU = 0; + float rV = 0; + + for (int k = 0; k < bs; k++) { + int cidx = k + bn * bs; + if (cidx >= naccumulate) + break; + + const float2 p1 = pol1[channel_number + cidx * nchans]; + const float2 p2 = pol2[channel_number + cidx * nchans]; + + rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y); + rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y); + rU += 2.f * (p1.x * p2.x + p1.y * p2.y); + rV += -2.f * (p1.y * p2.x - p1.x * p2.y); + } - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans); - i += blockDim.x * gridDim.x) - { - float rI = 0; - float rQ = 0; - float rU = 0; - float rV = 0; - - for (int k=0; k < naccumulate; k++) - { - const float2 p1 = pol1[i + k * nchans]; - const float2 p2 = pol2[i + k * nchans]; - - rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y); - rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y); - rU += 2.f * (p1.x * p2.x + p1.y * p2.y); - rV += -2.f * (p1.y * p2.x - p1.x * p2.y); - } - I[i] += rI; - Q[i] += rQ; - U[i] += rU; - V[i] += rV; + atomicAdd(&I[channel_number], rI); + atomicAdd(&Q[channel_number], rQ); + atomicAdd(&U[channel_number], rU); + atomicAdd(&V[channel_number], rV); } - }