Commit c683d17f authored by Tobias Winchen's avatar Tobias Winchen
Browse files

IMproved speed in stokes accumulate for low channel number

parent 258f6127
......@@ -137,35 +137,42 @@ __host__ __device__ void stokes_IQUV(const float2 &p1, const float2 &p2, float &
* @brief calculate stokes IQUV spectra pol1, pol2 are arrays of naccumulate
* complex spectra for individual polarizations
*/
__global__ void stokes_accumulate(float2 const __restrict__ *pol1,
float2 const __restrict__ *pol2, float *I, float* Q, float *U, float*V,
int nchans, int naccumulate)
{
__global__ void stokes_accumulate(float2 const *__restrict__ pol1,
float2 const *__restrict__ pol2, float *I,
float *Q, float *U, float *V, int nchans,
int naccumulate) {
const int nb = naccumulate / blockDim.x + 1;
const int bs = blockDim.x;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb);
i += blockDim.x * gridDim.x) {
const size_t channel_number = i % nchans;
const size_t bn = i / nchans;
float rI = 0;
float rQ = 0;
float rU = 0;
float rV = 0;
for (int k = 0; k < bs; k++) {
int cidx = k + bn * bs;
if (cidx >= naccumulate)
break;
const float2 p1 = pol1[channel_number + cidx * nchans];
const float2 p2 = pol2[channel_number + cidx * nchans];
rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y);
rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y);
rU += 2.f * (p1.x * p2.x + p1.y * p2.y);
rV += -2.f * (p1.y * p2.x - p1.x * p2.y);
}
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans);
i += blockDim.x * gridDim.x)
{
float rI = 0;
float rQ = 0;
float rU = 0;
float rV = 0;
for (int k=0; k < naccumulate; k++)
{
const float2 p1 = pol1[i + k * nchans];
const float2 p2 = pol2[i + k * nchans];
rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y);
rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y);
rU += 2.f * (p1.x * p2.x + p1.y * p2.y);
rV += -2.f * (p1.y * p2.x - p1.x * p2.y);
}
I[i] += rI;
Q[i] += rQ;
U[i] += rU;
V[i] += rV;
atomicAdd(&I[channel_number], rI);
atomicAdd(&Q[channel_number], rQ);
atomicAdd(&U[channel_number], rU);
atomicAdd(&V[channel_number], rV);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment