From 2d02b2bc3a64523833d50d2d055abb4e20f4dd45 Mon Sep 17 00:00:00 2001 From: Andreas Marek <amarek@rzg.mpg.de> Date: Wed, 29 Mar 2017 12:11:45 +0200 Subject: [PATCH] Test blocksize --- src/cuUtils_template.Xcu | 62 +++++++++++++++++++- src/ev_tridi_band_gpu_c_v2_real_template.Xcu | 25 +++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/cuUtils_template.Xcu b/src/cuUtils_template.Xcu index ff4c7601..4a091a8a 100644 --- a/src/cuUtils_template.Xcu +++ b/src/cuUtils_template.Xcu @@ -61,7 +61,7 @@ #include <cuComplex.h> #endif -#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define BLOCK_CYCLIC_BLOCKSIZE 64 #define GLOBAL_STRIPE_WIDTH 256 #define WARP_SIZE 32 @@ -161,6 +161,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) #if REALCASE == 1 // attention +#if BLOCK_CYCLIC_BLOCKSIZE == 128 if (t_idx < 32) { s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ; @@ -171,8 +172,26 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) if (t_idx < 1) s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3]; } +#else /* BLOCK_CYCLIC_BLOCKSIZE == 128 */ + if (t_idx < 32) + { + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32]; + if (t_idx < 8) + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24]; + if (t_idx < 4) + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4]; + if (t_idx < 1) + s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3]; + } + +#endif /* BLOCK_CYCLIC_BLOCKSIZE == 128 */ + + #endif #if COMPLEXCASE == 1 + +#if BLOCK_CYCLIC_BLOCKSSIZE == 128 + // attention if (t_idx < 32) { @@ -209,6 +228,47 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) } } #endif + +#else /* BLOCK_CYCLIC_BLOCKSSIZE == 128 */ + if (t_idx < 32) + { +#ifdef DOUBLE_PRECISION_COMPLEX + s_block[t_idx] = (cuCadd(s_block[t_idx],s_block[t_idx + 32]) ) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#else + s_block[t_idx] = (cuCaddf(s_block[t_idx],s_block[t_idx + 32]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#endif + +#endif /* BLOCK_CYCLIC_BLOCKSSIZE == 128 */ + + #endif /* COMPLEXCASE == 1 */ } diff --git a/src/ev_tridi_band_gpu_c_v2_real_template.Xcu b/src/ev_tridi_band_gpu_c_v2_real_template.Xcu index 42508922..a4ac7bc7 100644 --- a/src/ev_tridi_band_gpu_c_v2_real_template.Xcu +++ b/src/ev_tridi_band_gpu_c_v2_real_template.Xcu @@ -54,7 +54,7 @@ #include <stdlib.h> #include "config-f90.h" -#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define BLOCK_CYCLIC_BLOCKSIZE 64 #define GLOBAL_STRIPE_WIDTH 256 // Perform the equivalent of "__shfl_xor" on an 8-byte value @@ -133,6 +133,8 @@ __device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, in // attention if (t_idx < 32) { + +#if BLOCK_CYCLIC_BLOCKSIZE == 128 #ifdef DOUBLE_PRECISION_REAL dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]); dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]); @@ -140,6 +142,18 @@ __device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, in dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]); dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]); #endif +#else /* BLOCK_CYCLIC_BLOCKSIZE */ +#ifdef DOUBLE_PRECISION_REAL + dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]); + //dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]); +#else + dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]); + //dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]); +#endif + +#endif /* BLOCK_CYCLIC_BLOCKSIZE */ + + } } } @@ -279,7 +293,12 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k // Now both threads in a pair can write to the same reduction buffer address without race-condition issues dotp_s[t_s] = my_r1; //attention +#if BLOCK_CYCLIC_BLOCKSIZE == 128 dotp_s[t_s + 64] = my_r2; +#else + dotp_s[t_s + 32] = my_r2; + +#endif // Ensure the reduction buffers are fully populated sync_real_threads<HAVE_2_WARPS>(); @@ -297,7 +316,11 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k s_1 = dotp_s[0]; // attention +#if BLOCK_CYCLIC_BLOCKSIZE == 128 s_2 = dotp_s[64]; +#else + s_2 = dotp_s[32]; +#endif // Each thread updates its corresponding EV component q_v_2 = q_v_2 - hh_v_3 * s_1 - hh_v_2 * s_2 + tau_2 * hh_v_2 * s_1 * dot_p; -- GitLab