Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
358fde98
Commit
358fde98
authored
Mar 28, 2017
by
Andreas Marek
Browse files
Introduce some C-macros for GPU kernels
parent
d03a603a
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/cuUtils_template.Xcu
View file @
358fde98
...
...
@@ -61,6 +61,10 @@
#include <cuComplex.h>
#endif
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
#define WARP_SIZE 32
// Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size
...
...
@@ -156,6 +160,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
__syncthreads();
#if REALCASE == 1
// attention
if (t_idx < 32)
{
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
...
...
@@ -168,6 +173,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
}
#endif
#if COMPLEXCASE == 1
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -330,24 +336,24 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFlo
__global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n)
{
__shared__ double hh_s[
128
] ;
__shared__ double hh_s[
BLOCK_CYCLIC_BLOCKSIZE
] ;
#else
__global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n)
{
__shared__ float hh_s[
128
] ;
__shared__ float hh_s[
BLOCK_CYCLIC_BLOCKSIZE
] ;
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n)
{
__shared__ cuDoubleComplex hh_s[
128
] ;
__shared__ cuDoubleComplex hh_s[
BLOCK_CYCLIC_BLOCKSIZE
] ;
#else
__global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n)
{
__shared__ cuFloatComplex hh_s[
128
] ;
__shared__ cuFloatComplex hh_s[
BLOCK_CYCLIC_BLOCKSIZE
] ;
#endif
#endif
int t_idx, v_idx;
...
...
@@ -359,7 +365,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
t_idx = threadIdx.x ;
// // The contents of the shared memory must be fully reset
// reset_shared_block_c(hh_s,
128
);
// reset_shared_block_c(hh_s,
BLOCK_CYCLIC_BLOCKSIZE
);
// Initialize the contents of the shared buffer (preparing for reduction)
if (t_idx > 0)
...
...
@@ -513,22 +519,22 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bc
#endif
{
int grid_size;
grid_size = 1 + (n - 1) /
256
;
grid_size = 1 + (n - 1) /
GLOBAL_STRIPE_WIDTH
;
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err);
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extract_hh_tau_c_kernel_double<<<grid_size,
256
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
extract_hh_tau_c_kernel_double<<<grid_size,
GLOBAL_STRIPE_WIDTH
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else
extract_hh_tau_c_kernel_single<<<grid_size,
256
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
extract_hh_tau_c_kernel_single<<<grid_size,
GLOBAL_STRIPE_WIDTH
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extract_hh_tau_c_kernel_complex_double<<<grid_size,
256
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
extract_hh_tau_c_kernel_complex_double<<<grid_size,
GLOBAL_STRIPE_WIDTH
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else
extract_hh_tau_c_kernel_complex_single<<<grid_size,
256
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
extract_hh_tau_c_kernel_complex_single<<<grid_size,
GLOBAL_STRIPE_WIDTH
>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif
#endif
err = cudaGetLastError();
...
...
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu
View file @
358fde98
...
...
@@ -54,6 +54,11 @@
#include <stdlib.h>
#include <cuComplex.h>
#include "config-f90.h"
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
// ===========================================================================================================
// Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on
// devices with compute capability 3.x; for older devices, please use the Fortran kernel version
...
...
@@ -136,7 +141,7 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block)
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
// attention
#ifdef DOUBLE_PRECISION_COMPLEX
if (t_idx < 32)
{
...
...
@@ -187,7 +192,7 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block)
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
// attention
#ifdef DOUBLE_PRECISION_COMPLEX
if(t_idx < 64)
{
...
...
@@ -312,6 +317,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
if (HAVE_2_WARPS)
{
// In this case, we have 2 warps, each doing 1 reduction
// attention
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -324,6 +330,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
else
{
// In this case we have 1 warp that performs both reductions
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -349,6 +356,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
if (HAVE_2_WARPS)
{
// In this case, we have 2 warps, each doing 1 reduction
//attention
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -361,6 +369,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
else
{
// In this case we have 1 warp that performs both reductions
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -393,6 +402,7 @@ __device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __r
__device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block)
#endif
{
// attention
if (blockDim.x >= 64)
{
int t_idx = threadIdx.x;
...
...
@@ -406,7 +416,7 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re
}
else
{
int s_chunk =
128
/ blockDim.x;
int s_chunk =
BLOCK_CYCLIC_BLOCKSIZE
/ blockDim.x;
#ifdef DOUBLE_PRECISION_COMPLEX
int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else
...
...
@@ -425,20 +435,20 @@ __device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const _
__device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block)
#endif
{
if (blockDim.x >=
128
)
if (blockDim.x >=
BLOCK_CYCLIC_BLOCKSIZE
)
{
int t_idx = threadIdx.x;
if (t_idx <
128
)
if (t_idx <
BLOCK_CYCLIC_BLOCKSIZE
)
{
s_block[t_idx].x = s_block[t_idx +
128
].x = 0.0;
s_block[t_idx].y = s_block[t_idx +
128
].y = 0.0;
s_block[t_idx].x = s_block[t_idx +
BLOCK_CYCLIC_BLOCKSIZE
].x = 0.0;
s_block[t_idx].y = s_block[t_idx +
BLOCK_CYCLIC_BLOCKSIZE
].y = 0.0;
}
}
else
{
int s_chunk =
256
/ blockDim.x;
int s_chunk =
GLOBAL_STRIPE_WIDTH
/ blockDim.x;
#ifdef DOUBLE_PRECISION_COMPLEX
int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else
...
...
@@ -462,13 +472,13 @@ template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_ker
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
__shared__ cuDoubleComplex q_s[
128
];
__shared__ cuDoubleComplex dotp_s[
128
];
__shared__ cuDoubleComplex q_s[
BLOCK_CYCLIC_BLOCKSIZE
];
__shared__ cuDoubleComplex dotp_s[
BLOCK_CYCLIC_BLOCKSIZE
];
cuDoubleComplex q_v2, tau ;
#else
__shared__ cuFloatComplex q_s[
128
];
__shared__ cuFloatComplex dotp_s[
128
];
__shared__ cuFloatComplex q_s[
BLOCK_CYCLIC_BLOCKSIZE
];
__shared__ cuFloatComplex dotp_s[
BLOCK_CYCLIC_BLOCKSIZE
];
cuFloatComplex q_v2, tau ;
#endif
...
...
@@ -565,6 +575,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
switch (nb)
{
// attention
case 256:
case 128:
case 64:
...
...
@@ -616,7 +627,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
#endif
break;
default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and
128
.\n");
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and
BLOCK_CYCLIC_BLOCKSIZE
.\n");
}
cudaDeviceSynchronize();
...
...
src/ev_tridi_band_gpu_c_v2_real_template.Xcu
View file @
358fde98
...
...
@@ -54,6 +54,9 @@
#include <stdlib.h>
#include "config-f90.h"
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
#if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
{
...
...
@@ -96,6 +99,7 @@ __device__ __forceinline__ void reset_dotp_buffers_double(double * const __restr
__device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block)
#endif
{
// attention
if (blockDim.x >= 64)
{
int t_idx = threadIdx.x;
...
...
@@ -107,7 +111,7 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
}
else
{
int s_chunk =
128
/ blockDim.x;
int s_chunk =
BLOCK_CYCLIC_BLOCKSIZE
/ blockDim.x;
#ifdef DOUBLE_PRECISION_REAL
int s_chunk_size = s_chunk * sizeof(double);
#else
...
...
@@ -125,20 +129,20 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
// We use templates here to avoid additional branching based on the actual size of the thread-block
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
__global__ void __launch_bounds__(
128
) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
__global__ void __launch_bounds__(
BLOCK_CYCLIC_BLOCKSIZE
) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else
__global__ void __launch_bounds__(
128
) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
__global__ void __launch_bounds__(
BLOCK_CYCLIC_BLOCKSIZE
) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif
{
#ifdef DOUBLE_PRECISION_REAL
__shared__ double dotp_s[
128
];
__shared__ double q_s[
129
];
__shared__ double dotp_s[
BLOCK_CYCLIC_BLOCKSIZE
];
__shared__ double q_s[
BLOCK_CYCLIC_BLOCKSIZE+1
];
#else
__shared__ float dotp_s[
128
];
__shared__ float q_s[
129
];
__shared__ float dotp_s[
BLOCK_CYCLIC_BLOCKSIZE
];
__shared__ float q_s[
BLOCK_CYCLIC_BLOCKSIZE+1
];
#endif
int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta;
...
...
@@ -222,6 +226,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
// Now both threads in a pair can write to the same reduction buffer address without race-condition issues
dotp_s[t_s] = my_r1;
//attention
dotp_s[t_s + 64] = my_r2;
// Ensure the reduction buffers are fully populated
...
...
@@ -238,6 +243,8 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
// Each thread collects the reduction results
s_1 = dotp_s[0];
// attention
s_2 = dotp_s[64];
// Each thread updates its corresponding EV component
...
...
@@ -299,6 +306,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
sync_threads<HAVE_2_WARPS>();
// We perform the reduction using the first warp only
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -324,6 +332,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
{
switch (nb)
{
// attention
case 128:
case 64:
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -375,7 +384,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
break;
default:
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and
128
.\n");
printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and
BLOCK_CYCLIC_BLOCKSIZE
.\n");
}
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment