From 3bbc6779b0e071b58c20daaf96fc449b88ea828f Mon Sep 17 00:00:00 2001
From: Ewan Barr <ewan.d.barr@googlemail.com>
Date: Wed, 25 Apr 2018 15:35:15 +0200
Subject: [PATCH] Fix unpacker thread count (NTHREADS_UNPACK) and add missing semicolon

---
 psrdada_cpp/effelsberg/edd/eddfft.cuh    |  4 +++-
 psrdada_cpp/effelsberg/edd/src/eddfft.cu | 14 +++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/psrdada_cpp/effelsberg/edd/eddfft.cuh b/psrdada_cpp/effelsberg/edd/eddfft.cuh
index bc18907f..94dc6229 100644
--- a/psrdada_cpp/effelsberg/edd/eddfft.cuh
+++ b/psrdada_cpp/effelsberg/edd/eddfft.cuh
@@ -6,6 +6,8 @@
 #include "thrust/host_vector.h"
 #include "cufft.h"
 
+#define NTHREADS_UNPACK 512
+
 namespace psrdada_cpp {
 namespace effelsberg {
 namespace edd {
@@ -15,7 +17,7 @@ namespace kernels {
     void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__ out, int n);
 
     __global__
-    void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate)
+    void detect_and_accumulate(cufftComplex* __restrict__ in, float* __restrict__ out, int nchans, int nsamps, int naccumulate);
 
 
 } //kernels
diff --git a/psrdada_cpp/effelsberg/edd/src/eddfft.cu b/psrdada_cpp/effelsberg/edd/src/eddfft.cu
index 0c4ab129..4d8ddfba 100644
--- a/psrdada_cpp/effelsberg/edd/src/eddfft.cu
+++ b/psrdada_cpp/effelsberg/edd/src/eddfft.cu
@@ -39,8 +39,8 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
      * Note: This kernels will not work with more than 512 threads.
      */
 
-    __shared__ volatile float tmp_out[NTHREADS * 16];
-    __shared__ volatile uint64_t tmp_in[NTHREADS * 3];
+    __shared__ volatile float tmp_out[NTHREADS_UNPACK * 16];
+    __shared__ volatile uint64_t tmp_in[NTHREADS_UNPACK * 3];
     int block_idx = blockIdx.x;
 
     uint64_t val;
@@ -51,10 +51,10 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
     {
 
         //Read to shared memeory
-        int block_read_start = block_idx * NTHREADS * 3;
+        int block_read_start = block_idx * NTHREADS_UNPACK * 3;
         tmp_in[threadIdx.x]                = in[block_read_start + threadIdx.x];
-        tmp_in[NTHREADS + threadIdx.x]     = in[block_read_start + NTHREADS + threadIdx.x];
-        tmp_in[NTHREADS * 2 + threadIdx.x] = in[block_read_start + NTHREADS * 2 + threadIdx.x];
+        tmp_in[NTHREADS_UNPACK + threadIdx.x]     = in[block_read_start + NTHREADS_UNPACK + threadIdx.x];
+        tmp_in[NTHREADS_UNPACK * 2 + threadIdx.x] = in[block_read_start + NTHREADS_UNPACK * 2 + threadIdx.x];
 
         __syncthreads();
 
@@ -83,9 +83,9 @@ void unpack_edd_12bit_to_float32(uint64_t* __restrict__ in, float* __restrict__
 
         __syncthreads();
 
-        int block_write_start = block_idx * NTHREADS * 16;
+        int block_write_start = block_idx * NTHREADS_UNPACK * 16;
 
-        for (int ii = threadIdx.x; ii < 16 * NTHREADS; ii+=blockDim.x)
+        for (int ii = threadIdx.x; ii < 16 * NTHREADS_UNPACK; ii+=blockDim.x)
         {
             out[block_write_start+ii] = tmp_out[ii];
         }
-- 
GitLab