Commit 3bbc6779 authored by Ewan Barr
Browse files

fixed the thread count for the unpacker

parent c06ed862
......@@ -6,6 +6,8 @@
#include "thrust/host_vector.h"
#include "cufft.h"
#define NTHREADS_UNPACK 512
namespace psrdada_cpp {
namespace effelsberg {
namespace edd {
......@@ -15,7 +17,7 @@ namespace kernels {
void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n);
__global__
void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate)
void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate);
} //kernels
......
......@@ -39,8 +39,8 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
* Note: This kernel will not work with more than 512 threads.
*/
__shared__ volatile float tmp_out[NTHREADS * 16];
__shared__ volatile uint64_t tmp_in[NTHREADS * 3];
__shared__ volatile float tmp_out[NTHREADS_UNPACK * 16];
__shared__ volatile uint64_t tmp_in[NTHREADS_UNPACK * 3];
int block_idx = blockIdx.x;
uint64_t val;
......@@ -51,10 +51,10 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
{
//Read to shared memory
int block_read_start = block_idx * NTHREADS * 3;
int block_read_start = block_idx * NTHREADS_UNPACK * 3;
tmp_in[threadIdx.x] = in[block_read_start + threadIdx.x];
tmp_in[NTHREADS + threadIdx.x] = in[block_read_start + NTHREADS + threadIdx.x];
tmp_in[NTHREADS * 2 + threadIdx.x] = in[block_read_start + NTHREADS * 2 + threadIdx.x];
tmp_in[NTHREADS_UNPACK + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK + threadIdx.x];
tmp_in[NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK * 2 + threadIdx.x];
__syncthreads();
......@@ -83,9 +83,9 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
__syncthreads();
int block_write_start = block_idx * NTHREADS * 16;
int block_write_start = block_idx * NTHREADS_UNPACK * 16;
for (int ii = threadIdx.x; ii < 16 * NTHREADS; ii+=blockDim.x)
for (int ii = threadIdx.x; ii < 16 * NTHREADS_UNPACK; ii+=blockDim.x)
{
out[block_write_start+ii] = tmp_out[ii];
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment