#include "psrdada_cpp/effelsberg/edd/GatedSpectrometer.cuh"


namespace psrdada_cpp {
namespace effelsberg {
namespace edd {


__global__ void mergeSideChannels(uint64_t* __restrict__ A, uint64_t*
        __restrict__ B, size_t N)
{
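  // Merge the two side-channel streams by OR-ing them element-wise and write
  // the result back to both arrays, presumably so that both polarizations see
  // an identical set of flag bits (e.g. the lost-heap bit tested in gating()).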
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
       i += blockDim.x * gridDim.x)
  {
    uint64_t v = A[i] | B[i];
    A[i] = v;
    B[i] = v;
  }
}


__global__ void gating(float* __restrict__ G0,
        float* __restrict__ G1,
        const uint64_t* __restrict__ sideChannelData,
        size_t N, size_t heapSize, size_t bitpos,
        size_t noOfSideChannels, size_t selectedSideChannel,
        const float*  __restrict__ _baseLineG0,
        const float*  __restrict__ _baseLineG1,
        float* __restrict__ baseLineNG0,
        float* __restrict__ baseLineNG1,
        uint64_cu* stats_G0,
        uint64_cu* stats_G1)
{
  // statistics values for samples assigned to G0 and G1
  uint32_t _G0stats = 0;
  uint32_t _G1stats = 0;

  const float baseLineG0 = _baseLineG0[0];
  const float baseLineG1 = _baseLineG1[0];
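  // _baseLineG0/_baseLineG1 are single-element device arrays holding the mean
  // sample value of the previous block for each gate, as computed by
  // update_baselines().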

  float baselineUpdateG0 = 0;
  float baselineUpdateG1 = 0;

  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < N);
       i += blockDim.x * gridDim.x) {
    const float v = G0[i];

    const uint64_t sideChannelItem = sideChannelData[((i / heapSize) * (noOfSideChannels)) +
                        selectedSideChannel];

    const unsigned int bit_set = TEST_BIT(sideChannelItem, bitpos);
    const unsigned int heap_lost = TEST_BIT(sideChannelItem, 3);
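    // Branch-free gating: a sample is kept in the gate selected by the side
    // channel bit and replaced by that gate's baseline in the other gate (and
    // in both gates if the heap was lost).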
    G1[i] = (v - baseLineG1) * bit_set * (!heap_lost) + baseLineG1;
    G0[i] = (v - baseLineG0) * (!bit_set) *(!heap_lost) + baseLineG0;

    _G0stats += (!bit_set) *(!heap_lost);
    _G1stats += bit_set * (!heap_lost);

    baselineUpdateG1 += v * bit_set * (!heap_lost);
    baselineUpdateG0 += v * (!bit_set) *(!heap_lost);
  }

  __shared__ uint32_t x[1024];

  // Reduce G0, G1
  sum_reduce<uint32_t>(x, _G0stats);
  if(threadIdx.x == 0) {
    atomicAdd(stats_G0,  (uint64_cu) x[threadIdx.x]);
  }
  __syncthreads();

  sum_reduce<uint32_t>(x, _G1stats);
  if(threadIdx.x == 0) {
    atomicAdd(stats_G1,  (uint64_cu) x[threadIdx.x]);
  }
  __syncthreads();

  //reuse shared array
  float *y = (float*) x;
  //update the baseline array
  sum_reduce<float>(y, baselineUpdateG0);
  if(threadIdx.x == 0) {
    atomicAdd(baseLineNG0, y[threadIdx.x]);
  }
  __syncthreads();

  sum_reduce<float>(y, baselineUpdateG1);
  if(threadIdx.x == 0) {
    atomicAdd(baseLineNG1, y[threadIdx.x]);
  }
  __syncthreads();
}
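// A hedged usage sketch (illustrative only; the launch configuration and
// pointer names below are placeholders, not the pipeline's actual values):
//
//   gating<<<64, 1024, 0, proc_stream>>>(G0, G1, sideChannelData, nsamples,
//       heapSize, bitpos, noOfSideChannels, selectedSideChannel,
//       baseLineG0, baseLineG1, baseLineNG0, baseLineNG1,
//       stats_G0, stats_G1);
//
// Note that blockDim.x must not exceed 1024, as the reductions above use a
// fixed __shared__ array of 1024 elements.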






// Updates the gate baselines of the polarization set for the next block.
// There are only a few output blocks per input block, so execution on a
// single thread is sufficient; what matters is that the update runs
// asynchronously on the GPU.
__global__ void update_baselines(float*  __restrict__ baseLineG0,
        float*  __restrict__ baseLineG1,
        float* __restrict__ baseLineNG0,
        float* __restrict__ baseLineNG1,
        uint64_cu* stats_G0, uint64_cu* stats_G1,
        size_t N)
{
    size_t NG0 = 0;
    size_t NG1 = 0;

    for (size_t i =0; i < N; i++)
    {
       NG0 += stats_G0[i];
       NG1 += stats_G1[i];
    }
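    // The new baseline of each gate is the mean of the samples that were
    // routed to that gate in the previous block; the accumulators are reset
    // afterwards for the next block.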

    baseLineG0[0] = baseLineNG0[0] / NG0;
    baseLineG1[0] = baseLineNG1[0] / NG1;
    baseLineNG0[0] = 0;
    baseLineNG1[0] = 0;
}



/**
 * @brief Calculate Stokes IQUV from two complex values, one per polarization.
 */
__host__ __device__ void stokes_IQUV(const float2 &p1, const float2 &p2, float &I, float &Q, float &U, float &V)
{
    I = fabs(p1.x*p1.x + p1.y * p1.y) + fabs(p2.x*p2.x + p2.y * p2.y);
    Q = fabs(p1.x*p1.x + p1.y * p1.y) - fabs(p2.x*p2.x + p2.y * p2.y);
    U = 2 * (p1.x*p2.x + p1.y * p2.y);
    V = -2 * (p1.y*p2.x - p1.x * p2.y);
}




/**
 * @brief Calculate Stokes IQUV spectra. pol1 and pol2 are arrays of
 * naccumulate complex spectra, one array per polarization.
 */
__global__ void stokes_accumulate(float2 const *__restrict__ pol1,
                                  float2 const *__restrict__ pol2, float *I,
                                  float *Q, float *U, float *V, int nchans,
                                  int naccumulate) {
  const int nb = naccumulate / blockDim.x + 1;
  const int bs = blockDim.x;
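  // The naccumulate input spectra are processed in nb chunks of bs spectra
  // each. Every thread accumulates one channel of one chunk locally and then
  // adds its partial sums to the output spectra via atomicAdd.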

  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; (i < nchans * nb);
       i += blockDim.x * gridDim.x) {
    const size_t channel_number = i % nchans;
    const size_t bn = i / nchans;

    float rI = 0;
    float rQ = 0;
    float rU = 0;
    float rV = 0;

    for (int k = 0; k < bs; k++) {
      int cidx = k + bn * bs;
      if (cidx >= naccumulate)
        break;

      const float2 p1 = pol1[channel_number + cidx * nchans];
      const float2 p2 = pol2[channel_number + cidx * nchans];

      rI += fabs(p1.x * p1.x + p1.y * p1.y) + fabs(p2.x * p2.x + p2.y * p2.y);
      rQ += fabs(p1.x * p1.x + p1.y * p1.y) - fabs(p2.x * p2.x + p2.y * p2.y);
      rU += 2.f * (p1.x * p2.x + p1.y * p2.y);
      rV += -2.f * (p1.y * p2.x - p1.x * p2.y);
    }

    atomicAdd(&I[channel_number], rI);
    atomicAdd(&Q[channel_number], rQ);
    atomicAdd(&U[channel_number], rU);
    atomicAdd(&V[channel_number], rV);
  }
}
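// A hedged usage sketch (illustrative only; names and launch configuration
// are placeholders): accumulate naccumulate spectra of nchans channels per
// polarization into I/Q/U/V, which are assumed to be zero-initialised (the
// output swap() fills them with zeros):
//
//   stokes_accumulate<<<1024, 1024, 0, proc_stream>>>(pol0, pol1,
//       I, Q, U, V, nchans, naccumulate);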


void PolarizationData::resize(size_t rawVoltageBufferBytes, size_t nsidechannelitems, size_t channelized_samples)
{
    _raw_voltage.resize(rawVoltageBufferBytes / sizeof(uint64_t));
    BOOST_LOG_TRIVIAL(debug) << "  Input voltages size (in 64-bit words): " << _raw_voltage.size();

    _baseLineG0.resize(1);
    _baseLineG0_update.resize(1);
    _baseLineG1.resize(1);
    _baseLineG1_update.resize(1);
    _channelised_voltage_G0.resize(channelized_samples);
    _channelised_voltage_G1.resize(channelized_samples);
    _sideChannelData.resize(nsidechannelitems);
    _sideChannelData_h.resize(nsidechannelitems);
    BOOST_LOG_TRIVIAL(debug) << "  Channelised voltages size: " << _channelised_voltage_G0.size();
}


SinglePolarizationInput::SinglePolarizationInput(size_t fft_length, size_t nbits, const DadaBufferLayout
        &dadaBufferLayout) : PolarizationData(nbits), _fft_length(fft_length), _dadaBufferLayout(dadaBufferLayout)
{

    size_t nsamps_per_buffer = _dadaBufferLayout.sizeOfData() * 8 / nbits;
    size_t _batch = nsamps_per_buffer / _fft_length;
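    // Each FFT batch of _fft_length real samples yields _fft_length / 2 + 1
    // channels, as for a real-to-complex transform; this sets the size of the
    // channelised voltage buffers below.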

    resize(_dadaBufferLayout.sizeOfData(), _dadaBufferLayout.getNSideChannels() * _dadaBufferLayout.getNHeaps(), (_fft_length / 2 + 1) * _batch);
};


size_t SinglePolarizationInput::getSamplesPerInputPolarization()
{
    return _dadaBufferLayout.sizeOfData() * 8 / _nbits;
}


void PolarizationData::swap()
{
    _raw_voltage.swap();
    _sideChannelData.swap();
}


void SinglePolarizationInput::getFromBlock(RawBytes &block, cudaStream_t &_h2d_stream)
{
  BOOST_LOG_TRIVIAL(debug) << "   block.used_bytes() = " << block.used_bytes()
                           << ", dataBlockBytes = " << _dadaBufferLayout.sizeOfData() << "\n";

  CUDA_ERROR_CHECK(cudaMemcpyAsync(static_cast<void *>(_raw_voltage.a_ptr()),
                                   static_cast<void *>(block.ptr()),
                                   _dadaBufferLayout.sizeOfData() , cudaMemcpyHostToDevice,
                                   _h2d_stream));
  CUDA_ERROR_CHECK(cudaMemcpyAsync(
      static_cast<void *>(_sideChannelData.a_ptr()),
      static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()),
      _dadaBufferLayout.sizeOfSideChannelData(), cudaMemcpyHostToDevice, _h2d_stream));

    std::memcpy(static_cast<void *>(_sideChannelData_h.a_ptr()),
      static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()),
      _dadaBufferLayout.sizeOfSideChannelData());

  BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" <<   std::setw(16)
      << std::setfill('0') << std::hex <<
      (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData()
                                   + _dadaBufferLayout.sizeOfGap()))[0] <<
      std::dec;
}


DualPolarizationInput::DualPolarizationInput(size_t fft_length, size_t nbits, const DadaBufferLayout
        &dadaBufferLayout) : _fft_length(fft_length),
    polarization0(nbits),
    polarization1(nbits),
    _dadaBufferLayout(dadaBufferLayout)
{

  size_t nsamps_per_buffer = _dadaBufferLayout.sizeOfData() * 8 / nbits;
  size_t _batch = nsamps_per_buffer / _fft_length / 2;
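  // The input buffer holds both polarizations interleaved heap by heap (see
  // getFromBlock below), so each polarization receives half of the samples;
  // hence the additional factor of 2 in the divisions.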

    polarization0.resize(_dadaBufferLayout.sizeOfData() / 2, _dadaBufferLayout.getNSideChannels() * _dadaBufferLayout.getNHeaps() / 2, (_fft_length / 2 + 1) * _batch);
    polarization1.resize(_dadaBufferLayout.sizeOfData() / 2, _dadaBufferLayout.getNSideChannels() * _dadaBufferLayout.getNHeaps() / 2, (_fft_length / 2 + 1) * _batch);
};


void DualPolarizationInput::swap()
{
    polarization0.swap();
    polarization1.swap();
}


size_t DualPolarizationInput::getSamplesPerInputPolarization()
{
    return _dadaBufferLayout.sizeOfData() * 8 / polarization0._nbits / 2;
}


void DualPolarizationInput::getFromBlock(RawBytes &block, cudaStream_t &_h2d_stream)
{
// Copy the data with stride to the GPU:
// CPU: P1P2P1P2P1P2 ...
// GPU: P1P1P1 ... P2P2P2 ...
// If this is a bottleneck the gating kernel could sort the layout out
// during copy
int heapsize_bytes =  _dadaBufferLayout.getHeapSize();
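// cudaMemcpy2DAsync does the de-interleaving: the source pitch is two heaps,
// the destination pitch and row width are one heap, and the row count is the
// number of heaps per polarization. The same pattern, with a pitch of one
// uint64_t, splits the side channel items between the polarizations below.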
CUDA_ERROR_CHECK(cudaMemcpy2DAsync(
  static_cast<void *>(polarization0._raw_voltage.a_ptr()),
    heapsize_bytes,
    static_cast<void *>(block.ptr()),
    2 * heapsize_bytes,
    heapsize_bytes, _dadaBufferLayout.sizeOfData() / heapsize_bytes/ 2,
    cudaMemcpyHostToDevice, _h2d_stream));

CUDA_ERROR_CHECK(cudaMemcpy2DAsync(
  static_cast<void *>(polarization1._raw_voltage.a_ptr()),
    heapsize_bytes,
    static_cast<void *>(block.ptr() + heapsize_bytes),
    2 * heapsize_bytes,
    heapsize_bytes, _dadaBufferLayout.sizeOfData() / heapsize_bytes/ 2,
    cudaMemcpyHostToDevice, _h2d_stream));

CUDA_ERROR_CHECK(cudaMemcpy2DAsync(
    static_cast<void *>(polarization0._sideChannelData.a_ptr()),
    sizeof(uint64_t),
    static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap()),
    2 * sizeof(uint64_t),
    sizeof(uint64_t),
    _dadaBufferLayout.sizeOfSideChannelData() / 2 / sizeof(uint64_t),
    cudaMemcpyHostToDevice, _h2d_stream));


CUDA_ERROR_CHECK(cudaMemcpy2DAsync(
    static_cast<void *>(polarization1._sideChannelData.a_ptr()),
    sizeof(uint64_t),
    static_cast<void *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap() + sizeof(uint64_t)),
    2 * sizeof(uint64_t),
    sizeof(uint64_t),
    _dadaBufferLayout.sizeOfSideChannelData() / 2 / sizeof(uint64_t), cudaMemcpyHostToDevice, _h2d_stream));

    uint64_t * sci = reinterpret_cast<uint64_t *>(block.ptr() + _dadaBufferLayout.sizeOfData() + _dadaBufferLayout.sizeOfGap());
    for(size_t i =0; i < _dadaBufferLayout.sizeOfSideChannelData() / sizeof(uint64_t); i+=2)
    {
       polarization0._sideChannelData_h.a()[i / 2] = sci[i];
       polarization1._sideChannelData_h.a()[i / 2] = sci[i + 1];
    }


BOOST_LOG_TRIVIAL(debug) << "First side channel item: 0x" <<   std::setw(16)
    << std::setfill('0') << std::hex <<
    (reinterpret_cast<uint64_t*>(block.ptr() + _dadaBufferLayout.sizeOfData()
                                 + _dadaBufferLayout.sizeOfGap()))[0] << std::dec;
}



PowerSpectrumOutput::PowerSpectrumOutput(size_t size, size_t blocks)
{
    BOOST_LOG_TRIVIAL(debug) << "Setting size of power spectrum output size = " << size << ", blocks =  " << blocks;
   data.resize(size * blocks);
   _noOfBitSets.resize(blocks);
   _noOfOverflowed.resize(blocks);
}


void PowerSpectrumOutput::swap(cudaStream_t &_proc_stream)
{
    data.swap();
    _noOfBitSets.swap();
    _noOfOverflowed.swap();
    thrust::fill(thrust::cuda::par.on(_proc_stream), data.a().begin(), data.a().end(), 0.);
    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSets.a().begin(), _noOfBitSets.a().end(), 0L);
    thrust::fill(_noOfOverflowed.a().begin(), _noOfOverflowed.a().end(), 0L);
}


GatedPowerSpectrumOutput::GatedPowerSpectrumOutput(size_t nchans, size_t
        blocks) : OutputDataStream(nchans, blocks), G0(nchans, blocks),
G1(nchans, blocks)
{
  // On the host, the powers of both gates are stored in the same data buffer
  // together with the number of bit sets.
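  // Per output block the host buffer holds, in this order:
  //   [G0 spectrum][G1 spectrum][G0 nBitSets][G0 nOverflowed][G1 nBitSets][G1 nOverflowed]
  // matching the copies performed in data2Host().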
  _host_power.resize( 2 * ( _nchans * sizeof(IntegratedPowerType) + 2 * sizeof(size_t) ) * G0._noOfBitSets.size());
}


/// Swap output buffers
void GatedPowerSpectrumOutput::swap(cudaStream_t &_proc_stream)
{
    G0.swap(_proc_stream);
    G1.swap(_proc_stream);
    _host_power.swap();
}


void GatedPowerSpectrumOutput::data2Host(cudaStream_t &_d2h_stream)
{
    BOOST_LOG_TRIVIAL(debug) << "Copying data2Host for " << G0._noOfBitSets.size() << " blocks.";
  //CUDA_ERROR_CHECK(cudaStreamSynchronize(_d2h_stream));

  for (size_t i = 0; i < G0._noOfBitSets.size(); i++)
  {
    // size of an individual spectrum
    size_t memslicesize = (_nchans * sizeof(IntegratedPowerType));
    // offset of this output block (two gated spectra plus two counters each)
    size_t memOffset = 2 * i * (memslicesize + 2*sizeof(size_t));

    // copy 2x channel data
    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset) ,
                        static_cast<void *>(G0.data.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 1 * memslicesize) ,
                        static_cast<void *>(G1.data.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    // copy noOf bit set data
    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType)),
          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));

    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType) + sizeof(size_t)),
             static_cast<void *>(G0._noOfOverflowed.b_ptr() + i),
            1 * sizeof(size_t));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType) + 2 * sizeof(size_t)),
          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));


    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * _nchans * sizeof(IntegratedPowerType) + 3*sizeof(size_t)),
             static_cast<void *>(G1._noOfOverflowed.b_ptr() + i),
            1 * sizeof(size_t));
  }
}


void FullStokesOutput::swap(cudaStream_t &_proc_stream)
{
    I.swap();
    Q.swap();
    U.swap();
    V.swap();
    _noOfBitSets.swap();
    _noOfOverflowed.swap();
    thrust::fill(thrust::cuda::par.on(_proc_stream), I.a().begin(), I.a().end(), 0.);
    thrust::fill(thrust::cuda::par.on(_proc_stream), Q.a().begin(), Q.a().end(), 0.);
    thrust::fill(thrust::cuda::par.on(_proc_stream), U.a().begin(), U.a().end(), 0.);
    thrust::fill(thrust::cuda::par.on(_proc_stream), V.a().begin(), V.a().end(), 0.);
    thrust::fill(thrust::cuda::par.on(_proc_stream), _noOfBitSets.a().begin(), _noOfBitSets.a().end(), 0L);
    thrust::fill( _noOfOverflowed.a().begin(), _noOfOverflowed.a().end(), 0L);

}


FullStokesOutput::FullStokesOutput(size_t size, size_t blocks)
{
    I.resize(size * blocks);
    Q.resize(size * blocks);
    U.resize(size * blocks);
    V.resize(size * blocks);
    _noOfBitSets.resize(blocks);
   _noOfOverflowed.resize(blocks);
}



GatedFullStokesOutput::GatedFullStokesOutput(size_t nchans, size_t blocks): OutputDataStream(nchans, blocks), G0(nchans, blocks),
G1(nchans, blocks)
{
    BOOST_LOG_TRIVIAL(debug) << "Output with " << _blocks << " blocks a " << _nchans << " channels";
    _host_power.resize( 8 * ( _nchans * sizeof(IntegratedPowerType) + 2 * sizeof(size_t) ) * _blocks);
    BOOST_LOG_TRIVIAL(debug) << "Allocated " << _host_power.size() << " bytes.";
};


void GatedFullStokesOutput::swap(cudaStream_t &_proc_stream)
{
    G0.swap(_proc_stream);
    G1.swap(_proc_stream);
    _host_power.swap();
}


void GatedFullStokesOutput::data2Host(cudaStream_t &_d2h_stream)
{

BOOST_LOG_TRIVIAL(debug) << "Copying data2Host for " << G0._noOfBitSets.size() << " blocks.";

for (size_t i = 0; i < G0._noOfBitSets.size(); i++)
{
    size_t memslicesize = (_nchans * sizeof(IntegratedPowerType));
    size_t memOffset = 8 * i * (memslicesize + 2 * sizeof(size_t));
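    // Per output block the host buffer holds 8 spectra (G0 and G1 for each of
    // I, Q, U, V, in that order) followed by 16 size_t counters: for each of
    // the four Stokes parameters, the bit-set and overflow counts of G0 and
    // then of G1.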
    // Copy  II QQ UU VV
    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset) ,
                        static_cast<void *>(G0.I.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 1 * memslicesize) ,
                        static_cast<void *>(G1.I.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 2 * memslicesize) ,
                        static_cast<void *>(G0.Q.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 3 * memslicesize) ,
                        static_cast<void *>(G1.Q.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 4 * memslicesize) ,
                        static_cast<void *>(G0.U.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 5 * memslicesize) ,
                        static_cast<void *>(G1.U.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 6 * memslicesize) ,
                        static_cast<void *>(G0.V.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync(static_cast<void *>(_host_power.a_ptr() + memOffset + 7 * memslicesize) ,
                        static_cast<void *>(G1.V.b_ptr() + i * _nchans),
                        _nchans * sizeof(IntegratedPowerType),
                        cudaMemcpyDeviceToHost, _d2h_stream));

    // Copy SCI
    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize),
          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 1 * sizeof(size_t) ),
            static_cast<void *>(G0._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 2 * sizeof(size_t)),
          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 3 * sizeof(size_t) ),
            static_cast<void *>(G1._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 4 * sizeof(size_t)),
          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 5 * sizeof(size_t) ),
            static_cast<void *>(G0._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 6 * sizeof(size_t)),
          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 7 * sizeof(size_t) ),
            static_cast<void *>(G1._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 8 * sizeof(size_t)),
          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 9 * sizeof(size_t) ),
            static_cast<void *>(G0._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 10 * sizeof(size_t)),
          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 11 * sizeof(size_t) ),
            static_cast<void *>(G1._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 12 * sizeof(size_t)),
          static_cast<void *>(G0._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 13 * sizeof(size_t) ),
            static_cast<void *>(G0._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

    CUDA_ERROR_CHECK(
        cudaMemcpyAsync( static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 14 * sizeof(size_t)),
          static_cast<void *>(G1._noOfBitSets.b_ptr() + i ),
            1 * sizeof(size_t),
            cudaMemcpyDeviceToHost, _d2h_stream));
    std::memcpy(static_cast<void *>(_host_power.a_ptr() + memOffset + 8 * memslicesize + 15 * sizeof(size_t) ),
            static_cast<void *>(G1._noOfOverflowed.b_ptr() + i ), 1 * sizeof(size_t) );

  }
}


}}} // namespace