Commit 358fde98 authored by Andreas Marek

Introduce some C-macros for GPU kernels

parent d03a603a
@@ -61,6 +61,10 @@
#include <cuComplex.h>
#endif
+#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define GLOBAL_STRIPE_WIDTH 256
+#define WARP_SIZE 32
// Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size
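A minimal sketch of the divider-limited reset this comment describes (hypothetical helper, not code from this commit; it assumes blockDim.x divides BLOCK_CYCLIC_BLOCKSIZE):

__device__ void reset_reduction_block_sketch(double *s_block)
{
    // Each thread clears every blockDim.x-th slot; the strided loop covers the
    // whole buffer exactly when blockDim.x divides BLOCK_CYCLIC_BLOCKSIZE.
    for (int i = threadIdx.x; i < BLOCK_CYCLIC_BLOCKSIZE; i += blockDim.x)
        s_block[i] = 0.0;
    __syncthreads();
}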
@@ -156,6 +160,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
__syncthreads();
#if REALCASE == 1
+// attention: hard-coded warp-size offsets (32, 64, 96) below; not yet expressed via WARP_SIZE
if (t_idx < 32)
{
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
@@ -168,6 +173,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
}
#endif
#if COMPLEXCASE == 1
+// attention: hard-coded warp-size offsets below; not yet expressed via WARP_SIZE
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
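With the new WARP_SIZE macro, the flagged real-case step above could presumably be written as follows (an illustration of the intent behind the marker, not a change this commit makes):

if (t_idx < WARP_SIZE)
{
    // Sum the four WARP_SIZE-wide stripes of the 128-element reduction block.
    s_block[t_idx] = s_block[t_idx]
                   + s_block[t_idx +     WARP_SIZE]
                   + s_block[t_idx + 2 * WARP_SIZE]
                   + s_block[t_idx + 3 * WARP_SIZE];
}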
@@ -330,24 +336,24 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFlo
__global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n)
{
-__shared__ double hh_s[128] ;
+__shared__ double hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#else
__global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n)
{
-__shared__ float hh_s[128] ;
+__shared__ float hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n)
{
-__shared__ cuDoubleComplex hh_s[128] ;
+__shared__ cuDoubleComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#else
__global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n)
{
-__shared__ cuFloatComplex hh_s[128] ;
+__shared__ cuFloatComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
#endif
#endif
int t_idx, v_idx;
@@ -359,7 +365,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
t_idx = threadIdx.x ;
// // The contents of the shared memory must be fully reset
-// reset_shared_block_c(hh_s, 128);
+// reset_shared_block_c(hh_s, BLOCK_CYCLIC_BLOCKSIZE);
// Initialize the contents of the shared buffer (preparing for reduction)
if (t_idx > 0)
@@ -513,22 +519,22 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bc
#endif
{
int grid_size;
-grid_size = 1 + (n - 1) / 256;
+grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH;
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err);
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
-extract_hh_tau_c_kernel_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+extract_hh_tau_c_kernel_double<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else
-extract_hh_tau_c_kernel_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+extract_hh_tau_c_kernel_single<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
-extract_hh_tau_c_kernel_complex_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+extract_hh_tau_c_kernel_complex_double<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else
-extract_hh_tau_c_kernel_complex_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+extract_hh_tau_c_kernel_complex_single<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif
#endif
err = cudaGetLastError();
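For reference, grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH is the usual integer ceiling division: with GLOBAL_STRIPE_WIDTH = 256, n = 256 still yields one block while n = 257 yields two, so ceil(n / GLOBAL_STRIPE_WIDTH) blocks of GLOBAL_STRIPE_WIDTH threads cover all n elements (valid for n >= 1).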
@@ -54,6 +54,11 @@
#include <stdlib.h>
#include <cuComplex.h>
#include "config-f90.h"
+#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define GLOBAL_STRIPE_WIDTH 256
// ===========================================================================================================
// Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on
// devices with compute capability 3.x; for older devices, please use the Fortran kernel version
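A minimal sketch of the shuffle-based warp reduction this comment alludes to, using the __shfl_xor intrinsic available from compute capability 3.x (illustration only; under CUDA 9 and later this would be __shfl_xor_sync with an explicit lane mask):

__device__ __forceinline__ float warp_reduce_sum_sketch(float v)
{
    // Butterfly exchange across the 32 lanes of a warp; unlike the shared-memory
    // reductions elsewhere in this file, it needs no buffer and no __syncthreads().
    for (int mask = 16; mask > 0; mask >>= 1)
        v += __shfl_xor(v, mask);
    return v;
}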
@@ -136,7 +141,7 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block)
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
+// attention: hard-coded warp-size limit (32) below
#ifdef DOUBLE_PRECISION_COMPLEX
if (t_idx < 32)
{
@@ -187,7 +192,7 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block)
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
+// attention: hard-coded thread limit (64 = two warps) below
#ifdef DOUBLE_PRECISION_COMPLEX
if(t_idx < 64)
{
@@ -312,6 +317,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
if (HAVE_2_WARPS)
{
// In this case, we have 2 warps, each doing 1 reduction
+// attention: hard-coded two-warp limit (64) below
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -324,6 +330,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
else
{
// In this case we have 1 warp that performs both reductions
+// attention: hard-coded warp-size limit (32) below
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -349,6 +356,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
if (HAVE_2_WARPS)
{
// In this case, we have 2 warps, each doing 1 reduction
+// attention: hard-coded two-warp limit (64) below
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -361,6 +369,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
else
{
// In this case we have 1 warp that performs both reductions
+// attention: hard-coded warp-size limit (32) below
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -393,6 +402,7 @@ __device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __r
__device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block)
#endif
{
+// attention: hard-coded size (64) below
if (blockDim.x >= 64)
{
int t_idx = threadIdx.x;
@@ -406,7 +416,7 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re
}
else
{
-int s_chunk = 128 / blockDim.x;
+int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
#ifdef DOUBLE_PRECISION_COMPLEX
int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else
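The remainder of this branch is elided from the hunk; judging from the variable names, each thread presumably zeroes its own contiguous s_chunk-element slice, along the lines of this sketch (an assumption, not the commit's code):

// Hypothetical continuation: thread t clears its own contiguous slice.
memset(&s_block[threadIdx.x * s_chunk], 0, s_chunk_size);
__syncthreads();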
@@ -425,20 +435,20 @@ __device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const _
__device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block)
#endif
{
-if (blockDim.x >= 128)
+if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE)
{
int t_idx = threadIdx.x;
-if (t_idx < 128)
+if (t_idx < BLOCK_CYCLIC_BLOCKSIZE)
{
-s_block[t_idx].x = s_block[t_idx + 128].x = 0.0;
-s_block[t_idx].y = s_block[t_idx + 128].y = 0.0;
+s_block[t_idx].x = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].x = 0.0;
+s_block[t_idx].y = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].y = 0.0;
}
}
else
{
-int s_chunk = 256 / blockDim.x;
+int s_chunk = GLOBAL_STRIPE_WIDTH / blockDim.x;
#ifdef DOUBLE_PRECISION_COMPLEX
int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else
@@ -462,13 +472,13 @@ template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_ker
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
-__shared__ cuDoubleComplex q_s[128];
-__shared__ cuDoubleComplex dotp_s[128];
+__shared__ cuDoubleComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
+__shared__ cuDoubleComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
cuDoubleComplex q_v2, tau ;
#else
-__shared__ cuFloatComplex q_s[128];
-__shared__ cuFloatComplex dotp_s[128];
+__shared__ cuFloatComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
+__shared__ cuFloatComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
cuFloatComplex q_v2, tau ;
#endif
@@ -565,6 +575,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
switch (nb)
{
+// attention: hard-coded block-size case labels below
case 256:
case 128:
case 64:
@@ -616,7 +627,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
#endif
break;
default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n");
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n");
}
cudaDeviceSynchronize();
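The case labels select one template instantiation per power-of-2 block size, so REDUCE_START_OFFSET is a compile-time constant inside the kernel. A self-contained miniature of that dispatch pattern (hypothetical kernel and offset values, not the ELPA code):

template<unsigned int REDUCE_START_OFFSET>
__global__ void reduce_sketch(double *data)
{
    // The offset is a template parameter, so the loop trip count is known at
    // compile time and no runtime branching on the block size remains.
    for (unsigned int s = REDUCE_START_OFFSET; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
        __syncthreads();
    }
}

void launch_reduce_sketch(double *data_dev, int nb)
{
    switch (nb)
    {
        case 128: reduce_sketch<64><<<1, 128>>>(data_dev); break;
        case  64: reduce_sketch<32><<<1,  64>>>(data_dev); break;
        // ... one case per supported power of 2, down to 1 ...
        default:  printf("Error: please use a power-of-2 SCALAPACK block size between 1 and %d.\n", BLOCK_CYCLIC_BLOCKSIZE);
    }
}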
@@ -54,6 +54,9 @@
#include <stdlib.h>
#include "config-f90.h"
+#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define GLOBAL_STRIPE_WIDTH 256
#if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
{
@@ -96,6 +99,7 @@ __device__ __forceinline__ void reset_dotp_buffers_double(double * const __restr
__device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block)
#endif
{
+// attention: hard-coded size (64) below
if (blockDim.x >= 64)
{
int t_idx = threadIdx.x;
@@ -107,7 +111,7 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
}
else
{
-int s_chunk = 128 / blockDim.x;
+int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
#ifdef DOUBLE_PRECISION_REAL
int s_chunk_size = s_chunk * sizeof(double);
#else
@@ -125,20 +129,20 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
// We use templates here to avoid additional branching based on the actual size of the thread-block
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
-__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
+__global__ void __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else
-__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
+__global__ void __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif
{
#ifdef DOUBLE_PRECISION_REAL
-__shared__ double dotp_s[128];
-__shared__ double q_s[129];
+__shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
+__shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
#else
-__shared__ float dotp_s[128];
-__shared__ float q_s[129];
+__shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
+__shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
#endif
int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta;
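Two details in the declarations above are worth noting: q_s carries one element of padding (BLOCK_CYCLIC_BLOCKSIZE+1), a common trick for avoiding shared-memory bank conflicts under strided access, which is presumably its purpose here; and __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE) promises the compiler a maximum thread-block size so it can budget registers accordingly.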
@@ -222,6 +226,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
// Now both threads in a pair can write to the same reduction buffer address without race-condition issues
dotp_s[t_s] = my_r1;
+// attention: hard-coded offset (64 = half the reduction buffer) below
dotp_s[t_s + 64] = my_r2;
// Ensure the reduction buffers are fully populated
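The flagged offset is half the reduction buffer: dotp_s holds two 64-element sets of partial results, my_r1 in the lower half and my_r2 in the upper half, which the reduction collapses into dotp_s[0] and dotp_s[64] below.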
@@ -238,6 +243,8 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
// Each thread collects the reduction results
s_1 = dotp_s[0];
+// attention: hard-coded offset (64 = half the reduction buffer) below
s_2 = dotp_s[64];
// Each thread updates its corresponding EV component
@@ -299,6 +306,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
sync_threads<HAVE_2_WARPS>();
// We perform the reduction using the first warp only
+// attention: hard-coded warp-size limit (32) below
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_REAL
@@ -324,6 +332,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
{
switch (nb)
{
+// attention: hard-coded block-size case labels below
case 128:
case 64:
#ifdef DOUBLE_PRECISION_REAL
@@ -375,7 +384,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
break;
default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n");
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE .\n");
}
}
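Since each of the three files now defines these macros independently, a compile-time guard like the following (not part of the commit) could catch inconsistent edits; it assumes the apparent invariants that the block size is a multiple of the warp size and that the stripe width is twice the block size:

#if (BLOCK_CYCLIC_BLOCKSIZE % WARP_SIZE) != 0
#error "BLOCK_CYCLIC_BLOCKSIZE must be a multiple of WARP_SIZE"
#endif
#if GLOBAL_STRIPE_WIDTH != (2 * BLOCK_CYCLIC_BLOCKSIZE)
#error "GLOBAL_STRIPE_WIDTH is expected to be twice BLOCK_CYCLIC_BLOCKSIZE"
#endif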