Commit 06971fc0 authored by Ewan Barr
Browse files

working version

parent 76ab9366
......@@ -62,8 +62,27 @@ bool SimpleFFTSpectrometer<HandlerType>::operator()(RawBytes& block)
_channelised.resize(nchans * batch);
_detected.resize(nchans * batch / _naccumulate);
_first_block = false;
//cudaMemcpy((char*) _edd_raw_ptr, block.ptr(), block.used_bytes(), cudaMemcpyHostToDevice);
//CUDA_ERROR_CHECK(cudaDeviceSynchronize());
return false;
}
// --- pass 2 ---
// Start async memcpy into next buffer in stream 0
// Process previous block in stream 1
// Return to the caller
// Sync stream 1
// Start async memcpy into output block in stream 2
// Sync stream 0
// Start async memcpy into next buffer in stream 0
// Process previous block in stream 1
//
if (_nsamps != nsamps_in_block)
{
throw std::runtime_error("Received incomplete block");
......@@ -92,9 +111,8 @@ bool SimpleFFTSpectrometer<HandlerType>::operator()(RawBytes& block)
cufftComplex* _channelised_ptr = thrust::raw_pointer_cast(_channelised.data());
CUFFT_ERROR_CHECK(cufftExecR2C(_fft_plan, (cufftReal*)_edd_unpacked_ptr, _channelised_ptr));
float* _detected_ptr = thrust::raw_pointer_cast(_detected.data());
kernels::detect_and_accumulate<<<1024, 1024>>>(_channelised_ptr, _detected_ptr, nchans, nsamps_in_block/_fft_length, 64);
kernels::detect_and_accumulate<<<1024, 1024>>>(_channelised_ptr, _detected_ptr, nchans, _nsamps/_fft_length, 64);
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
//thrust::copy(_edd_unpacked.begin(), _edd_unpacked.end(), block.ptr());
......
......@@ -94,18 +94,18 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
}
__global__
void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate)
void detect_and_accumulate(float2* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate)
{
for (int block_idx = blockIdx.x; block_idx < nsamps/naccumulate; ++block_idx)
for (int block_idx = blockIdx.x; block_idx < nsamps/naccumulate; block_idx+=gridDim.x)
{
int read_offset = block_idx * naccumulate * nchans;
int write_offset = block_idx * nchans;
for (int chan_idx = threadIdx.x; threadIdx.x < nchans; chan_idx += blockDim.x)
for (int chan_idx = threadIdx.x; chan_idx < nchans; chan_idx += blockDim.x)
{
float sum = 0.0f;
for (int ii=0; ii < naccumulate; ++ii)
{
cufftComplex tmp = in[read_offset + chan_idx];
float2 tmp = in[read_offset + chan_idx + ii*nchans];
float x = tmp.x * tmp.x;
float y = tmp.y * tmp.y;
sum += x + y;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment