From 2d02b2bc3a64523833d50d2d055abb4e20f4dd45 Mon Sep 17 00:00:00 2001
From: Andreas Marek <amarek@rzg.mpg.de>
Date: Wed, 29 Mar 2017 12:11:45 +0200
Subject: [PATCH] Test blocksize

---
 src/cuUtils_template.Xcu                     | 62 +++++++++++++++++++-
 src/ev_tridi_band_gpu_c_v2_real_template.Xcu | 25 +++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/src/cuUtils_template.Xcu b/src/cuUtils_template.Xcu
index ff4c7601..4a091a8a 100644
--- a/src/cuUtils_template.Xcu
+++ b/src/cuUtils_template.Xcu
@@ -61,7 +61,7 @@
 #include <cuComplex.h>
 #endif
 
-#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define BLOCK_CYCLIC_BLOCKSIZE 64
 #define GLOBAL_STRIPE_WIDTH 256
 #define WARP_SIZE 32
 
@@ -161,6 +161,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
 
 #if REALCASE == 1
         // attention
+#if BLOCK_CYCLIC_BLOCKSIZE == 128
         if (t_idx < 32)
 	{
                 s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
@@ -171,8 +172,26 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
         if (t_idx < 1)
                 s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3];
 	}
+#else /* BLOCK_CYCLIC_BLOCKSIZE == 128 */
+        if (t_idx < 32)
+	{
+                s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32];
+        if (t_idx < 8)
+                s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24];
+        if (t_idx < 4)
+                s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4];
+        if (t_idx < 1)
+                s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3];
+	}
+
+#endif /* BLOCK_CYCLIC_BLOCKSIZE == 128 */
+
+
 #endif
 #if COMPLEXCASE == 1
+
+#if BLOCK_CYCLIC_BLOCKSSIZE == 128
+
         // attention
 	if (t_idx < 32)
         {
@@ -209,6 +228,47 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
         }
         }
 #endif
+
+#else /* BLOCK_CYCLIC_BLOCKSSIZE == 128 */
+	if (t_idx < 32)
+        {
+#ifdef DOUBLE_PRECISION_COMPLEX
+        s_block[t_idx] = (cuCadd(s_block[t_idx],s_block[t_idx + 32]) ) );
+        if (t_idx < 8)
+        {
+        s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) );
+
+        }
+        if (t_idx < 4)
+        {
+        s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ;
+        }
+        if (t_idx < 1)
+        {
+        s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) );
+        }
+        }
+#else
+        s_block[t_idx] = (cuCaddf(s_block[t_idx],s_block[t_idx + 32])  );
+        if (t_idx < 8)
+        {
+        s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) );
+
+        }
+        if (t_idx < 4)
+        {
+        s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ;
+        }
+        if (t_idx < 1)
+        {
+        s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) );
+        }
+        }
+#endif
+
+#endif /* BLOCK_CYCLIC_BLOCKSSIZE == 128 */
+
+
 #endif /* COMPLEXCASE == 1 */
 }
 
diff --git a/src/ev_tridi_band_gpu_c_v2_real_template.Xcu b/src/ev_tridi_band_gpu_c_v2_real_template.Xcu
index 42508922..a4ac7bc7 100644
--- a/src/ev_tridi_band_gpu_c_v2_real_template.Xcu
+++ b/src/ev_tridi_band_gpu_c_v2_real_template.Xcu
@@ -54,7 +54,7 @@
 #include <stdlib.h>
 #include "config-f90.h"
 
-#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define BLOCK_CYCLIC_BLOCKSIZE 64
 #define GLOBAL_STRIPE_WIDTH 256
 
 // Perform the equivalent of "__shfl_xor" on an 8-byte value
@@ -133,6 +133,8 @@ __device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, in
         // attention
         if (t_idx < 32)
         {
+
+#if BLOCK_CYCLIC_BLOCKSIZE == 128
 #ifdef DOUBLE_PRECISION_REAL
             dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
             dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
@@ -140,6 +142,18 @@ __device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, in
             dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
             dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
 #endif
+#else /* BLOCK_CYCLIC_BLOCKSIZE */
+#ifdef DOUBLE_PRECISION_REAL
+            dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
+            //dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
+#else
+            dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
+            //dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
+#endif
+
+#endif /* BLOCK_CYCLIC_BLOCKSIZE */
+
+
         }
     }
 }
@@ -279,7 +293,12 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
         // Now both threads in a pair can write to the same reduction buffer address without race-condition issues
         dotp_s[t_s] = my_r1;
 	//attention
+#if BLOCK_CYCLIC_BLOCKSIZE == 128
         dotp_s[t_s + 64] = my_r2;
+#else
+        dotp_s[t_s + 32] = my_r2;
+
+#endif
 
         // Ensure the reduction buffers are fully populated
         sync_real_threads<HAVE_2_WARPS>();
@@ -297,7 +316,11 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
         s_1 = dotp_s[0];
 
 	// attention
+#if BLOCK_CYCLIC_BLOCKSIZE == 128
         s_2 = dotp_s[64];
+#else
+        s_2 = dotp_s[32];
+#endif
 
         // Each thread updates its corresponding EV component
         q_v_2 = q_v_2 - hh_v_3 * s_1 - hh_v_2 * s_2 + tau_2 * hh_v_2 * s_1 * dot_p;
-- 
GitLab