Commit 358fde98 authored by Andreas Marek's avatar Andreas Marek

Introduce some C-macros for GPU kernels

parent d03a603a
...@@ -61,6 +61,10 @@ ...@@ -61,6 +61,10 @@
#include <cuComplex.h> #include <cuComplex.h>
#endif #endif
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
#define WARP_SIZE 32
// Reset a reduction block // Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size // Limitation: the thread-block size must be a divider of the reduction block's size
...@@ -156,6 +160,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) ...@@ -156,6 +160,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
__syncthreads(); __syncthreads();
#if REALCASE == 1 #if REALCASE == 1
// attention
if (t_idx < 32) if (t_idx < 32)
{ {
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ; s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
...@@ -168,6 +173,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block) ...@@ -168,6 +173,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
} }
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
// attention
if (t_idx < 32) if (t_idx < 32)
{ {
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
...@@ -330,24 +336,24 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFlo ...@@ -330,24 +336,24 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFlo
__global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n) __global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n)
{ {
__shared__ double hh_s[128] ; __shared__ double hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#else #else
__global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n) __global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n)
{ {
__shared__ float hh_s[128] ; __shared__ float hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#endif #endif
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
__global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n) __global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n)
{ {
__shared__ cuDoubleComplex hh_s[128] ; __shared__ cuDoubleComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#else #else
__global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n) __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n)
{ {
__shared__ cuFloatComplex hh_s[128] ; __shared__ cuFloatComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#endif #endif
#endif #endif
int t_idx, v_idx; int t_idx, v_idx;
...@@ -359,7 +365,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF ...@@ -359,7 +365,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
t_idx = threadIdx.x ; t_idx = threadIdx.x ;
// // The contents of the shared memory must be fully reset // // The contents of the shared memory must be fully reset
// reset_shared_block_c(hh_s, 128); // reset_shared_block_c(hh_s, BLOCK_CYCLIC_BLOCKSIZE);
// Initialize the contents of the shared buffer (preparing for reduction) // Initialize the contents of the shared buffer (preparing for reduction)
if (t_idx > 0) if (t_idx > 0)
...@@ -513,22 +519,22 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bc ...@@ -513,22 +519,22 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bc
#endif #endif
{ {
int grid_size; int grid_size;
grid_size = 1 + (n - 1) / 256; grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH;
cudaDeviceSynchronize(); cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err); if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err);
#if REALCASE == 1 #if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
extract_hh_tau_c_kernel_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); extract_hh_tau_c_kernel_double<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else #else
extract_hh_tau_c_kernel_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); extract_hh_tau_c_kernel_single<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif #endif
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
extract_hh_tau_c_kernel_complex_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); extract_hh_tau_c_kernel_complex_double<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else #else
extract_hh_tau_c_kernel_complex_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); extract_hh_tau_c_kernel_complex_single<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif #endif
#endif #endif
err = cudaGetLastError(); err = cudaGetLastError();
......
...@@ -54,6 +54,11 @@ ...@@ -54,6 +54,11 @@
#include <stdlib.h> #include <stdlib.h>
#include <cuComplex.h> #include <cuComplex.h>
#include "config-f90.h" #include "config-f90.h"
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
// =========================================================================================================== // ===========================================================================================================
// Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on // Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on
// devices with compute capability 3.x; for older devices, please use the Fortran kernel version // devices with compute capability 3.x; for older devices, please use the Fortran kernel version
...@@ -136,7 +141,7 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block) ...@@ -136,7 +141,7 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block)
int t_idx ; int t_idx ;
t_idx = threadIdx.x; t_idx = threadIdx.x;
__syncthreads(); __syncthreads();
// attention
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
if (t_idx < 32) if (t_idx < 32)
{ {
...@@ -187,7 +192,7 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block) ...@@ -187,7 +192,7 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block)
int t_idx ; int t_idx ;
t_idx = threadIdx.x; t_idx = threadIdx.x;
__syncthreads(); __syncthreads();
// attention
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
if(t_idx < 64) if(t_idx < 64)
{ {
...@@ -312,6 +317,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o ...@@ -312,6 +317,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
if (HAVE_2_WARPS) if (HAVE_2_WARPS)
{ {
// In this case, we have 2 warps, each doing 1 reduction // In this case, we have 2 warps, each doing 1 reduction
// attention
if (t_idx < 64) if (t_idx < 64)
{ {
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
...@@ -324,6 +330,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o ...@@ -324,6 +330,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
else else
{ {
// In this case we have 1 warp that performs both reductions // In this case we have 1 warp that performs both reductions
// attention
if (t_idx < 32) if (t_idx < 32)
{ {
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
...@@ -349,6 +356,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex ...@@ -349,6 +356,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
if (HAVE_2_WARPS) if (HAVE_2_WARPS)
{ {
// In this case, we have 2 warps, each doing 1 reduction // In this case, we have 2 warps, each doing 1 reduction
//attention
if (t_idx < 64) if (t_idx < 64)
{ {
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
...@@ -361,6 +369,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex ...@@ -361,6 +369,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
else else
{ {
// In this case we have 1 warp that performs both reductions // In this case we have 1 warp that performs both reductions
// attention
if (t_idx < 32) if (t_idx < 32)
{ {
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
...@@ -393,6 +402,7 @@ __device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __r ...@@ -393,6 +402,7 @@ __device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __r
__device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block) __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block)
#endif #endif
{ {
// attention
if (blockDim.x >= 64) if (blockDim.x >= 64)
{ {
int t_idx = threadIdx.x; int t_idx = threadIdx.x;
...@@ -406,7 +416,7 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re ...@@ -406,7 +416,7 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re
} }
else else
{ {
int s_chunk = 128 / blockDim.x; int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
int s_chunk_size = s_chunk * sizeof(cuDoubleComplex); int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else #else
...@@ -425,20 +435,20 @@ __device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const _ ...@@ -425,20 +435,20 @@ __device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const _
__device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block) __device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block)
#endif #endif
{ {
if (blockDim.x >= 128) if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE)
{ {
int t_idx = threadIdx.x; int t_idx = threadIdx.x;
if (t_idx < 128) if (t_idx < BLOCK_CYCLIC_BLOCKSIZE)
{ {
s_block[t_idx].x = s_block[t_idx + 128].x = 0.0; s_block[t_idx].x = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].x = 0.0;
s_block[t_idx].y = s_block[t_idx + 128].y = 0.0; s_block[t_idx].y = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].y = 0.0;
} }
} }
else else
{ {
int s_chunk = 256 / blockDim.x; int s_chunk = GLOBAL_STRIPE_WIDTH / blockDim.x;
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
int s_chunk_size = s_chunk * sizeof(cuDoubleComplex); int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else #else
...@@ -462,13 +472,13 @@ template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_ker ...@@ -462,13 +472,13 @@ template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_ker
#endif #endif
{ {
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
__shared__ cuDoubleComplex q_s[128]; __shared__ cuDoubleComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
__shared__ cuDoubleComplex dotp_s[128]; __shared__ cuDoubleComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
cuDoubleComplex q_v2, tau ; cuDoubleComplex q_v2, tau ;
#else #else
__shared__ cuFloatComplex q_s[128]; __shared__ cuFloatComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
__shared__ cuFloatComplex dotp_s[128]; __shared__ cuFloatComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
cuFloatComplex q_v2, tau ; cuFloatComplex q_v2, tau ;
#endif #endif
...@@ -565,6 +575,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* ...@@ -565,6 +575,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
switch (nb) switch (nb)
{ {
// attention
case 256: case 256:
case 128: case 128:
case 64: case 64:
...@@ -616,7 +627,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* ...@@ -616,7 +627,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
#endif #endif
break; break;
default: default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n"); printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n");
} }
cudaDeviceSynchronize(); cudaDeviceSynchronize();
......
...@@ -54,6 +54,9 @@ ...@@ -54,6 +54,9 @@
#include <stdlib.h> #include <stdlib.h>
#include "config-f90.h" #include "config-f90.h"
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
#if 0 #if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask) static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
{ {
...@@ -96,6 +99,7 @@ __device__ __forceinline__ void reset_dotp_buffers_double(double * const __restr ...@@ -96,6 +99,7 @@ __device__ __forceinline__ void reset_dotp_buffers_double(double * const __restr
__device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block) __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block)
#endif #endif
{ {
// attention
if (blockDim.x >= 64) if (blockDim.x >= 64)
{ {
int t_idx = threadIdx.x; int t_idx = threadIdx.x;
...@@ -107,7 +111,7 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri ...@@ -107,7 +111,7 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
} }
else else
{ {
int s_chunk = 128 / blockDim.x; int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
int s_chunk_size = s_chunk * sizeof(double); int s_chunk_size = s_chunk * sizeof(double);
#else #else
...@@ -125,20 +129,20 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri ...@@ -125,20 +129,20 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
// We use templates here to avoid additional branching based on the actual size of the thread-block // We use templates here to avoid additional branching based on the actual size of the thread-block
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS> template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot, __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else #else
__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot, __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols) const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif #endif
{ {
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
__shared__ double dotp_s[128]; __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
__shared__ double q_s[129]; __shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
#else #else
__shared__ float dotp_s[128]; __shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
__shared__ float q_s[129]; __shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
#endif #endif
int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta; int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta;
...@@ -222,6 +226,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * ...@@ -222,6 +226,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
// Now both threads in a pair can write to the same reduction buffer address without race-condition issues // Now both threads in a pair can write to the same reduction buffer address without race-condition issues
dotp_s[t_s] = my_r1; dotp_s[t_s] = my_r1;
//attention
dotp_s[t_s + 64] = my_r2; dotp_s[t_s + 64] = my_r2;
// Ensure the reduction buffers are fully populated // Ensure the reduction buffers are fully populated
...@@ -238,6 +243,8 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * ...@@ -238,6 +243,8 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
// Each thread collects the reduction results // Each thread collects the reduction results
s_1 = dotp_s[0]; s_1 = dotp_s[0];
// attention
s_2 = dotp_s[64]; s_2 = dotp_s[64];
// Each thread updates its corresponding EV component // Each thread updates its corresponding EV component
...@@ -299,6 +306,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * ...@@ -299,6 +306,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
sync_threads<HAVE_2_WARPS>(); sync_threads<HAVE_2_WARPS>();
// We perform the reduction using the first warp only // We perform the reduction using the first warp only
// attention
if (t_idx < 32) if (t_idx < 32)
{ {
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
...@@ -324,6 +332,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * ...@@ -324,6 +332,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
{ {
switch (nb) switch (nb)
{ {
// attention
case 128: case 128:
case 64: case 64:
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
...@@ -375,7 +384,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * ...@@ -375,7 +384,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
break; break;
default: default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n"); printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE .\n");
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment