Commit 358fde98 authored Mar 28, 2017 by Andreas Marek
Introduce some C-macros for GPU kernels
parent d03a603a
Showing 3 changed files with 57 additions and 31 deletions
src/cuUtils_template.Xcu (+16 -10)
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu (+24 -13)
src/ev_tridi_band_gpu_c_v2_real_template.Xcu (+17 -8)
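For readers skimming the diff: the change replaces the hard-coded sizes 128, 256 and (in cuUtils) 32 with the named macros BLOCK_CYCLIC_BLOCKSIZE, GLOBAL_STRIPE_WIDTH and WARP_SIZE. A minimal sketch of the pattern, assuming those macro values; the kernel below is illustrative and not part of the commit:

// Illustrative only: the macro names and values are from the diff, the kernel is not.
#define BLOCK_CYCLIC_BLOCKSIZE 128   // shared-memory/block size used by the trafo kernels
#define GLOBAL_STRIPE_WIDTH    256   // block size used by the extract-kernel launchers
#define WARP_SIZE              32    // threads per warp

__global__ void example_copy_kernel(const double *in, double *out, int n)
{
    // Before this commit the shared array would have been declared as double s[128].
    __shared__ double s[BLOCK_CYCLIC_BLOCKSIZE];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIdx.x < BLOCK_CYCLIC_BLOCKSIZE && i < n) s[threadIdx.x] = in[i];
    __syncthreads();
    if (threadIdx.x < BLOCK_CYCLIC_BLOCKSIZE && i < n) out[i] = s[threadIdx.x];
}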
src/cuUtils_template.Xcu
...
@@ -61,6 +61,10 @@
 #include <cuComplex.h>
 #endif
+#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define GLOBAL_STRIPE_WIDTH 256
+#define WARP_SIZE 32
 // Reset a reduction block
 // Limitation: the thread-block size must be a divider of the reduction block's size
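An aside on the limitation noted above: each thread clears a contiguous chunk of size reduction_size / blockDim.x, so the whole buffer is covered only when blockDim.x divides the reduction block's size exactly. A hedged sketch of that scheme, assuming the BLOCK_CYCLIC_BLOCKSIZE macro introduced above; the helper name is illustrative, not the file's reset_shared_block_c:

// Sketch only: correctness relies on blockDim.x dividing BLOCK_CYCLIC_BLOCKSIZE,
// exactly as the limitation comment above states.
__device__ void reset_reduction_block_sketch(double *s_block)
{
    int chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;  // elements handled per thread
    int start = threadIdx.x * chunk;                  // this thread's slice
    for (int i = 0; i < chunk; i++)
        s_block[start + i] = 0.0;
    __syncthreads();                                  // buffer fully zeroed afterwards
}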
...
@@ -156,6 +160,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
   __syncthreads();
 #if REALCASE == 1
+  // attention
   if (t_idx < 32)
   {
     s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
...
@@ -168,6 +173,7 @@ __device__ void warp_reduce_complex_single( cuFloatComplex *s_block)
   }
 #endif
 #if COMPLEXCASE == 1
+  // attention
   if (t_idx < 32)
   {
 #ifdef DOUBLE_PRECISION_COMPLEX
...
@@ -330,24 +336,24 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFlo
 __global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n)
 {
-  __shared__ double hh_s[128] ;
+  __shared__ double hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
 #else
 __global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n)
 {
-  __shared__ float hh_s[128] ;
+  __shared__ float hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
 #endif
 #endif
 #if COMPLEXCASE == 1
 #ifdef DOUBLE_PRECISION_COMPLEX
 __global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n)
 {
-  __shared__ cuDoubleComplex hh_s[128] ;
+  __shared__ cuDoubleComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
 #else
 __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n)
 {
-  __shared__ cuFloatComplex hh_s[128] ;
+  __shared__ cuFloatComplex hh_s[BLOCK_CYCLIC_BLOCKSIZE] ;
 #endif
 #endif
   int t_idx, v_idx;
...
@@ -359,7 +365,7 @@ __global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuF
   t_idx = threadIdx.x ;
   // // The contents of the shared memory must be fully reset
-  // reset_shared_block_c(hh_s, 128);
+  // reset_shared_block_c(hh_s, BLOCK_CYCLIC_BLOCKSIZE);
   // Initialize the contents of the shared buffer (preparing for reduction)
   if (t_idx > 0)
...
@@ -513,22 +519,22 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bc
 #endif
 {
   int grid_size;
-  grid_size = 1 + (n - 1) / 256;
+  grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH;
   cudaDeviceSynchronize();
   cudaError_t err = cudaGetLastError();
   if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err);
 #if REALCASE == 1
 #ifdef DOUBLE_PRECISION_REAL
-  extract_hh_tau_c_kernel_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+  extract_hh_tau_c_kernel_double<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
 #else
-  extract_hh_tau_c_kernel_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+  extract_hh_tau_c_kernel_single<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
 #endif
 #endif
 #if COMPLEXCASE == 1
 #ifdef DOUBLE_PRECISION_COMPLEX
-  extract_hh_tau_c_kernel_complex_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+  extract_hh_tau_c_kernel_complex_double<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
 #else
-  extract_hh_tau_c_kernel_complex_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
+  extract_hh_tau_c_kernel_complex_single<<<grid_size,GLOBAL_STRIPE_WIDTH>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
 #endif
 #endif
   err = cudaGetLastError();
...
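For context on the launcher hunk above: grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH is the usual integer ceiling division, so n elements are covered by blocks of GLOBAL_STRIPE_WIDTH threads. A minimal host-side sketch of that idiom, assuming GLOBAL_STRIPE_WIDTH as defined above; the kernel and wrapper names are illustrative, not the commit's API:

#include <stdio.h>

// Stand-in for the extract_hh_tau_c kernels: each thread handles one element.
__global__ void extract_sketch_kernel(double *buf, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) buf[i] = 0.0;   // guard: the last block may be only partially used
}

extern "C" void launch_extract_sketch(double *buf_dev, int n)
{
    int grid_size = 1 + (n - 1) / GLOBAL_STRIPE_WIDTH;   // ceil(n / GLOBAL_STRIPE_WIDTH)
    extract_sketch_kernel<<<grid_size, GLOBAL_STRIPE_WIDTH>>>(buf_dev, n);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) printf("sketch launch failed: %s\n", cudaGetErrorString(err));
}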
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu
...
@@ -54,6 +54,11 @@
 #include <stdlib.h>
 #include <cuComplex.h>
 #include "config-f90.h"
+#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define GLOBAL_STRIPE_WIDTH 256
 // ===========================================================================================================
 // Important: due to the use of warp shuffling, the C version of the backtransformation kernel only works on
 // devices with compute capability 3.x; for older devices, please use the Fortran kernel version
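The compute-capability remark above refers to warp shuffle intrinsics, which exchange registers between lanes of a warp without going through shared memory. As a hedged illustration only (written with the CUDA 9+ __shfl_xor_sync variant; code of this file's vintage used __shfl_xor without a mask), a butterfly sum over one 32-thread warp looks roughly like this:

// Illustrative warp-level reduction via shuffles; not code from this commit.
__device__ __forceinline__ double warp_sum_sketch(double v)
{
    for (int offset = 16; offset > 0; offset >>= 1)      // 16, 8, 4, 2, 1
        v += __shfl_xor_sync(0xffffffffu, v, offset);    // add the value from lane ^ offset
    return v;                                            // every lane ends up with the full sum
}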
...
@@ -136,7 +141,7 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block)
   int t_idx ;
   t_idx = threadIdx.x;
   __syncthreads();
+  // attention
 #ifdef DOUBLE_PRECISION_COMPLEX
   if (t_idx < 32)
   {
...
@@ -187,7 +192,7 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block)
   int t_idx ;
   t_idx = threadIdx.x;
   __syncthreads();
+  // attention
 #ifdef DOUBLE_PRECISION_COMPLEX
   if(t_idx < 64)
   {
...
@@ -312,6 +317,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
   if (HAVE_2_WARPS)
   {
     // In this case, we have 2 warps, each doing 1 reduction
+    // attention
     if (t_idx < 64)
     {
 #ifdef DOUBLE_PRECISION_COMPLEX
...
@@ -324,6 +330,7 @@ __device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_o
   else
   {
     // In this case we have 1 warp that performs both reductions
+    // attention
     if (t_idx < 32)
     {
 #ifdef DOUBLE_PRECISION_COMPLEX
...
@@ -349,6 +356,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
   if (HAVE_2_WARPS)
   {
     // In this case, we have 2 warps, each doing 1 reduction
+    //attention
     if (t_idx < 64)
     {
 #ifdef DOUBLE_PRECISION_COMPLEX
...
@@ -361,6 +369,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
   else
   {
     // In this case we have 1 warp that performs both reductions
+    // attention
     if (t_idx < 32)
     {
 #ifdef DOUBLE_PRECISION_COMPLEX
...
@@ -393,6 +402,7 @@ __device__ void reset_dotp_buffers_complex_double( cuDoubleComplex * const __r
 __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __restrict__ s_block)
 #endif
 {
+  // attention
   if (blockDim.x >= 64)
   {
     int t_idx = threadIdx.x;
...
@@ -406,7 +416,7 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re
   }
   else
   {
-    int s_chunk = 128 / blockDim.x;
+    int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
 #ifdef DOUBLE_PRECISION_COMPLEX
     int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
 #else
...
@@ -425,20 +435,20 @@ __device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const _
 __device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block)
 #endif
 {
-  if (blockDim.x >= 128)
+  if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE)
   {
     int t_idx = threadIdx.x;
-    if (t_idx < 128)
+    if (t_idx < BLOCK_CYCLIC_BLOCKSIZE)
     {
-      s_block[t_idx].x = s_block[t_idx + 128].x = 0.0;
+      s_block[t_idx].x = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].x = 0.0;
-      s_block[t_idx].y = s_block[t_idx + 128].y = 0.0;
+      s_block[t_idx].y = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].y = 0.0;
     }
   }
   else
   {
-    int s_chunk = 256 / blockDim.x;
+    int s_chunk = GLOBAL_STRIPE_WIDTH / blockDim.x;
 #ifdef DOUBLE_PRECISION_COMPLEX
     int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
 #else
...
@@ -462,13 +472,13 @@ template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_ker
 #endif
 {
 #ifdef DOUBLE_PRECISION_COMPLEX
-  __shared__ cuDoubleComplex q_s[128];
+  __shared__ cuDoubleComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
-  __shared__ cuDoubleComplex dotp_s[128];
+  __shared__ cuDoubleComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
   cuDoubleComplex q_v2, tau ;
 #else
-  __shared__ cuFloatComplex q_s[128];
+  __shared__ cuFloatComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
-  __shared__ cuFloatComplex dotp_s[128];
+  __shared__ cuFloatComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
   cuFloatComplex q_v2, tau ;
 #endif
...
@@ -565,6 +575,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
   switch (nb)
   {
+    // attention
     case 256:
     case 128:
     case 64:
...
@@ -616,7 +627,7 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
 #endif
       break;
     default:
-      printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n");
+      printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n");
   }
   cudaDeviceSynchronize();
...
src/ev_tridi_band_gpu_c_v2_real_template.Xcu
...
@@ -54,6 +54,9 @@
 #include <stdlib.h>
 #include "config-f90.h"
+#define BLOCK_CYCLIC_BLOCKSIZE 128
+#define GLOBAL_STRIPE_WIDTH 256
 #if 0
 static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
 {
...
@@ -96,6 +99,7 @@ __device__ __forceinline__ void reset_dotp_buffers_double(double * const __restr
 __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block)
 #endif
 {
+  // attention
   if (blockDim.x >= 64)
   {
     int t_idx = threadIdx.x;
...
@@ -107,7 +111,7 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
   }
   else
   {
-    int s_chunk = 128 / blockDim.x;
+    int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
 #ifdef DOUBLE_PRECISION_REAL
     int s_chunk_size = s_chunk * sizeof(double);
 #else
...
@@ -125,20 +129,20 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
 // We use templates here to avoid additional branching based on the actual size of the thread-block
 template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
 #ifdef DOUBLE_PRECISION_REAL
-__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
+__global__ void __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
   const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
 #else
-__global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
+__global__ void __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
   const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
 #endif
 {
 #ifdef DOUBLE_PRECISION_REAL
-  __shared__ double dotp_s[128];
+  __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
-  __shared__ double q_s[129];
+  __shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
 #else
-  __shared__ float dotp_s[128];
+  __shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
-  __shared__ float q_s[129];
+  __shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
 #endif
   int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta;
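For context on the hunk above: __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE) promises the compiler that the kernel is never launched with more than BLOCK_CYCLIC_BLOCKSIZE threads per block, and the REDUCE_START_OFFSET / HAVE_2_WARPS template parameters let the block-size-dependent branches be resolved at compile time, as the comment in the hunk says. A hedged sketch of that combination, assuming the macro defined above; the kernel body is illustrative, not the commit's:

// Sketch: compile-time specialisation instead of runtime branching on block size.
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
__global__ void __launch_bounds__(BLOCK_CYCLIC_BLOCKSIZE)
trafo_sketch_kernel(double * const __restrict__ q)
{
    __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
    int t_idx = threadIdx.x;
    dotp_s[t_idx] = q[blockIdx.x * blockDim.x + t_idx];   // assumes the grid exactly covers q
    __syncthreads();

    // Tree reduction; the start offset is a template constant, so the loop unrolls
    // and the HAVE_2_WARPS branch disappears at compile time.
    for (int offset = REDUCE_START_OFFSET; offset > 0; offset >>= 1)
    {
        if (t_idx < offset) dotp_s[t_idx] += dotp_s[t_idx + offset];
        if (HAVE_2_WARPS) __syncthreads();    // more than one warp: block-wide barrier
        else              __syncwarp();       // single warp: a warp barrier is enough
    }
    if (t_idx == 0) q[blockIdx.x * blockDim.x] = dotp_s[0];   // block sum written back
}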
...
@@ -222,6 +226,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
     // Now both threads in a pair can write to the same reduction buffer address without race-condition issues
     dotp_s[t_s] = my_r1;
+    //attention
     dotp_s[t_s + 64] = my_r2;
     // Ensure the reduction buffers are fully populated
...
@@ -238,6 +243,8 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
     // Each thread collects the reduction results
     s_1 = dotp_s[0];
+    // attention
     s_2 = dotp_s[64];
     // Each thread updates its corresponding EV component
...
@@ -299,6 +306,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
     sync_threads<HAVE_2_WARPS>();
     // We perform the reduction using the first warp only
+    // attention
     if (t_idx < 32)
     {
 #ifdef DOUBLE_PRECISION_REAL
...
@@ -324,6 +332,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
 {
   switch (nb)
   {
+    // attention
     case 128:
     case 64:
 #ifdef DOUBLE_PRECISION_REAL
...
@@ -375,7 +384,7 @@ __global__ void __launch_bounds__(128) compute_hh_trafo_c_kernel_single(float *
       break;
     default:
-      printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and 128.\n");
+      printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n");
   }
 }
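The switch (nb) dispatch above is why only power-of-2 block sizes up to BLOCK_CYCLIC_BLOCKSIZE are accepted: each case instantiates the templated kernel with compile-time constants matched to that block size. A hedged sketch of the pattern, reusing the illustrative trafo_sketch_kernel from the previous sketch rather than the commit's launchers (assumes <stdio.h> is included for printf):

// Sketch: map the runtime SCALAPACK block size nb onto a template instantiation.
extern "C" void launch_trafo_sketch(double *q_dev, int nb, int grid_size)
{
    switch (nb)
    {
        case 128: trafo_sketch_kernel<64, true ><<<grid_size, nb>>>(q_dev); break;
        case  64: trafo_sketch_kernel<32, true ><<<grid_size, nb>>>(q_dev); break;
        case  32: trafo_sketch_kernel<16, false><<<grid_size, nb>>>(q_dev); break;
        default:
            printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and %d.\n",
                   BLOCK_CYCLIC_BLOCKSIZE);
    }
    cudaDeviceSynchronize();
}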