From 358fde9802d417392c9623175b659602d1ab0806 Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Tue, 28 Mar 2017 10:42:46 +0200 Subject: [PATCH] Introduce some C-macros for GPU kernels --- src/cuUtils_template.Xcu | 26 ++++++++----- ...v_tridi_band_gpu_c_v2_complex_template.Xcu | 37 ++++++++++++------- src/ev_tridi_band_gpu_c_v2_real_template.Xcu | 25 +++++++++---- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/cuUtils_template.Xcu b/src/cuUtils_template.Xcu index df28b75c..3862961c 100644 --- a/src/cuUtils_template.Xcu +++ b/src/cuUtils_template.Xcu @@ -61,6 +61,10 @@ #include #endif +#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define GLOBAL_STRIPE_WIDTH 256 +#define WARP_SIZE 32 + // Reset a reduction block // Limitation: the thread-block size must be a divider of the reduction block's size @@ -156,6 +160,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) __syncthreads(); #if REALCASE == 1 + // attention if (t_idx < 32) { s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ; @@ -168,6 +173,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) } #endif #if COMPLEXCASE == 1 + // attention if (t_idx < 32) { #ifdef DOUBLE_PRECISION_COMPLEX @@ -330,24 +336,24 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFlo __global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n) { - __shared__ double hh_s[128] ; + __shared__ double hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; #else __global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n) { - __shared__ float hh_s[128] ; + __shared__ float hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; #endif #endif #if COMPLEXCASE == 1 #ifdef DOUBLE_PRECISION_COMPLEX __global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n) { - __shared__ cuDoubleComplex hh_s[128] ; + __shared__ 
cuDoubleComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; #else __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n) { - __shared__ cuFloatComplex hh_s[128] ; + __shared__ cuFloatComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ; #endif #endif int t_idx, v_idx; @@ -359,7 +365,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF t_idx = threadIdx.x ; // // The contents of the shared memory must be fully reset -// reset_shared_block_c(hh_s, 128); +// reset_shared_block_c(hh_s, BLOCK_CYCLIC_BLOCKSIZE); // Initialize the contents of the shared buffer (preparing for reduction) if (t_idx > 0) @@ -513,22 +519,22 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bc #endif { int grid_size; - grid_size = 1 + (n - 1) / 256; + grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH; cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err); #if REALCASE == 1 #ifdef DOUBLE_PRECISION_REAL - extract_hh_tau_c_kernel_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); + extract_hh_tau_c_kernel_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); #else - extract_hh_tau_c_kernel_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); + extract_hh_tau_c_kernel_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); #endif #endif #if COMPLEXCASE == 1 #ifdef DOUBLE_PRECISION_COMPLEX - extract_hh_tau_c_kernel_complex_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); + extract_hh_tau_c_kernel_complex_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); #else - extract_hh_tau_c_kernel_complex_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); + extract_hh_tau_c_kernel_complex_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); #endif #endif err = cudaGetLastError(); diff --git a/src/ev_tridi_band_gpu_c_v2_complex_template.Xcu 
b/src/ev_tridi_band_gpu_c_v2_complex_template.Xcu index 78698a9e..f648e401 100644 --- a/src/ev_tridi_band_gpu_c_v2_complex_template.Xcu +++ b/src/ev_tridi_band_gpu_c_v2_complex_template.Xcu @@ -54,6 +54,11 @@ #include #include #include "config-f90.h" + + +#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define GLOBAL_STRIPE_WIDTH 256 + // =========================================================================================================== // Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on // devices with compute capability 3.x; for older devices, please use the Fortran kernel version @@ -136,7 +141,7 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block) int t_idx ; t_idx = threadIdx.x; __syncthreads(); - + // attention #ifdef DOUBLE_PRECISION_COMPLEX if (t_idx < 32) { @@ -187,7 +192,7 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block) int t_idx ; t_idx = threadIdx.x; __syncthreads(); - + // attention #ifdef DOUBLE_PRECISION_COMPLEX if(t_idx < 64) { @@ -312,6 +317,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o if (HAVE_2_WARPS) { // In this case, we have 2 warps, each doing 1 reduction + // attention if (t_idx < 64) { #ifdef DOUBLE_PRECISION_COMPLEX @@ -324,6 +330,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o else { // In this case we have 1 warp that performs both reductions + // attention if (t_idx < 32) { #ifdef DOUBLE_PRECISION_COMPLEX @@ -349,6 +356,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex if (HAVE_2_WARPS) { // In this case, we have 2 warps, each doing 1 reduction + //attention if (t_idx < 64) { #ifdef DOUBLE_PRECISION_COMPLEX @@ -361,6 +369,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex else { // In this case we have 1 warp that performs both reductions + // attention if (t_idx < 32) { #ifdef 
DOUBLE_PRECISION_COMPLEX @@ -393,6 +402,7 @@ __device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __r __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block) #endif { + // attention if (blockDim.x >= 64) { int t_idx = threadIdx.x; @@ -406,7 +416,7 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re } else { - int s_chunk = 128 / blockDim.x; + int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x; #ifdef DOUBLE_PRECISION_COMPLEX int s_chunk_size = s_chunk * sizeof(cuDoubleComplex); #else @@ -425,20 +435,20 @@ __device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const _ __device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block) #endif { - if (blockDim.x >= 128) + if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE) { int t_idx = threadIdx.x; - if (t_idx < 128) + if (t_idx < BLOCK_CYCLIC_BLOCKSIZE) { - s_block[t_idx].x = s_block[t_idx + 128].x = 0.0; - s_block[t_idx].y = s_block[t_idx + 128].y = 0.0; + s_block[t_idx].x = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].x = 0.0; + s_block[t_idx].y = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].y = 0.0; } } else { - int s_chunk = 256 / blockDim.x; + int s_chunk = GLOBAL_STRIPE_WIDTH / blockDim.x; #ifdef DOUBLE_PRECISION_COMPLEX int s_chunk_size = s_chunk * sizeof(cuDoubleComplex); #else @@ -462,13 +472,13 @@ template__global__ void compute_hh_trafo_c_ker #endif { #ifdef DOUBLE_PRECISION_COMPLEX - __shared__ cuDoubleComplex q_s[128]; - __shared__ cuDoubleComplex dotp_s[128]; + __shared__ cuDoubleComplex q_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ cuDoubleComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; cuDoubleComplex q_v2, tau ; #else - __shared__ cuFloatComplex q_s[128]; - __shared__ cuFloatComplex dotp_s[128]; + __shared__ cuFloatComplex q_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ cuFloatComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; cuFloatComplex q_v2, tau ; #endif @@ -565,6 +575,7 @@ extern "C" void 
launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* switch (nb) { + // attention case 256: case 128: case 64: @@ -616,7 +627,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* #endif break; default: - printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n"); + printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and %d.\n", BLOCK_CYCLIC_BLOCKSIZE); } cudaDeviceSynchronize(); diff --git a/src/ev_tridi_band_gpu_c_v2_real_template.Xcu b/src/ev_tridi_band_gpu_c_v2_real_template.Xcu index a964decd..a65c2bbb 100644 --- a/src/ev_tridi_band_gpu_c_v2_real_template.Xcu +++ b/src/ev_tridi_band_gpu_c_v2_real_template.Xcu @@ -54,6 +54,9 @@ #include #include "config-f90.h" +#define BLOCK_CYCLIC_BLOCKSIZE 128 +#define GLOBAL_STRIPE_WIDTH 256 + #if 0 static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask) { @@ -96,6 +99,7 @@ __device__ __forceinline__ void reset_dotp_buffers_double(double * const __restr __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block) #endif { + // attention if (blockDim.x >= 64) { int t_idx = threadIdx.x; @@ -107,7 +111,7 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri } else { - int s_chunk = 128 / blockDim.x; + int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x; #ifdef DOUBLE_PRECISION_REAL int s_chunk_size = s_chunk * sizeof(double); #else @@ -125,20 +129,20 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri // We use templates here to avoid additional branching based on the actual size of the thread-block template #ifdef DOUBLE_PRECISION_REAL -__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot, +__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE )
compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot, const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) #else -__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot, +__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot, const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) #endif { #ifdef DOUBLE_PRECISION_REAL - __shared__ double dotp_s[128]; - __shared__ double q_s[129]; + __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1]; #else - __shared__ float dotp_s[128]; - __shared__ float q_s[129]; + __shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE]; + __shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1]; #endif int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta; @@ -222,6 +226,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * // Now both threads in a pair can write to the same reduction buffer address without race-condition issues dotp_s[t_s] = my_r1; + //attention dotp_s[t_s + 64] = my_r2; // Ensure the reduction buffers are fully populated @@ -238,6 +243,8 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * // Each thread collects the reduction results s_1 = dotp_s[0]; + + // attention s_2 = dotp_s[64]; // Each thread updates its corresponding EV component @@ -299,6 +306,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * sync_threads(); // We perform the reduction using the first warp only + // attention if (t_idx < 32) { #ifdef DOUBLE_PRECISION_REAL @@ -324,6 
+332,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * { switch (nb) { + // attention case 128: case 64: #ifdef DOUBLE_PRECISION_REAL @@ -375,7 +384,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * break; default: - printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n"); + printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and %d.\n", BLOCK_CYCLIC_BLOCKSIZE); } } -- GitLab