Commit 22a12154 authored by Andreas Marek

Rename cuda functions

- the functions now contain the appropriate real/complex in their
  name
- unused functions have been removed as cleanup
parent 358fde98
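For illustration, the renaming scheme of this commit, with example names taken from the hunks below (the Fortran &-continuation lines concatenate MATH_DATATYPE and PRECISION into the full kernel name):

// Illustration only, not part of the commit:
//   launch_compute_hh_trafo_c_kernel_complex_double    (Fortran call sites)
//     -> launch_compute_hh_trafo_gpu_kernel_complex_double
//   warp_reduce_complex_1_double       -> warp_reduce_1_complex_double
//   shfl_xor_double / shfl_down_double -> shfl_xor_real_double / shfl_down_real_double
//   reset_dotp_buffers_double          -> reset_dotp_buffers_real_double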
@@ -267,7 +267,7 @@
&PRECISION&
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_c_kernel_&
call launch_compute_hh_trafo_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
@@ -292,7 +292,7 @@
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_c_kernel_&
call launch_compute_hh_trafo_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
......
@@ -64,27 +64,27 @@
// devices with compute capability 3.x; for older devices, please use the Fortran kernel version
// ===========================================================================================================
// Perform the equivalent of "__shfl_xor" on an 8-byte value
#ifdef DOUBLE_PRECISION_COMPLEX
static __device__ __forceinline__ double shfl_xor_double(double r, int mask)
#else
static __device__ __forceinline__ float shfl_xor_single(float r, int mask)
#endif
{
//// Perform the equivalent of "__shfl_xor" on an 8-byte value
//#ifdef DOUBLE_PRECISION_COMPLEX
int hi = __shfl_xor(__double2hiint(r), mask);
int lo = __shfl_xor(__double2loint(r), mask);
return __hiloint2double(hi, lo);
//static __device__ __forceinline__ double shfl_xor_double(double r, int mask)
//#else
// int hi = __shfl_xor(__float2hiint(r), mask);
// int lo = __shfl_xor(__float2loint(r), mask);
// int hi;
// int lo;
//exit(1);
//return __hiloint2float(hi, lo);
//static __device__ __forceinline__ float shfl_xor_single(float r, int mask)
//#endif
}
//{
////#ifdef DOUBLE_PRECISION_COMPLEX
// int hi = __shfl_xor(__double2hiint(r), mask);
// int lo = __shfl_xor(__double2loint(r), mask);
//
// return __hiloint2double(hi, lo);
////#else
//// int hi = __shfl_xor(__float2hiint(r), mask);
//// int lo = __shfl_xor(__float2loint(r), mask);
//// int hi;
//// int lo;
// //exit(1);
// //return __hiloint2float(hi, lo);
////#endif
//}
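// For reference, a minimal self-contained sketch (not commit code) of the
// hi/lo split the double-precision branch above performs; the single-precision
// branch remains commented out in this revision. CC 3.x warp shuffles operate
// on 32-bit values, so an 8-byte double is split into two 32-bit halves, each
// half is shuffled independently, and the halves are recombined afterwards.
static __device__ __forceinline__ double shfl_xor_f64_sketch(double r, int mask)
{
    int hi = __shfl_xor(__double2hiint(r), mask);  // shuffle upper 32 bits
    int lo = __shfl_xor(__double2loint(r), mask);  // shuffle lower 32 bits
    return __hiloint2double(hi, lo);               // reassemble the double
}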
#if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
@@ -133,9 +133,9 @@ static __device__ __forceinline__ float shfl_down_single(float r, int offset)
}
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ void warp_reduce_complex_1_double( cuDoubleComplex *s_block)
__device__ void warp_reduce_1_complex_double( cuDoubleComplex *s_block)
#else
__device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block)
__device__ void warp_reduce_1_complex_single( cuFloatComplex *s_block)
#endif
{
int t_idx ;
@@ -184,15 +184,15 @@ __device__ void warp_reduce_complex_1_single( cuFloatComplex *s_block)
}
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ void warp_reduce_complex_2_double( cuDoubleComplex *s_block)
__device__ void warp_reduce_2_complex_double( cuDoubleComplex *s_block)
#else
__device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block)
__device__ void warp_reduce_2_complex_single( cuFloatComplex *s_block)
#endif
{
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
// attention
#ifdef DOUBLE_PRECISION_COMPLEX
if(t_idx < 64)
{
@@ -241,31 +241,11 @@ __device__ void warp_reduce_complex_2_single( cuFloatComplex *s_block)
#endif
}
// Perform a reduction on a warp or the first part of it
template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ __forceinline__ double warp_reduce_double(double r)
__device__ __forceinline__ cuDoubleComplex warp_reduce_c_complex_double( cuDoubleComplex r)
#else
__device__ __forceinline__ float warp_reduce_single(float r)
#endif
{
#pragma unroll
for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
{
#ifdef DOUBLE_PRECISION_COMPLEX
r += shfl_down_double(r, i);
#else
r += shfl_down_single(r, i);
#endif
}
return r;
}
template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ __forceinline__ cuDoubleComplex warp_reduce_c_double( cuDoubleComplex r)
#else
__device__ __forceinline__ cuFloatComplex warp_reduce_c_single( cuFloatComplex r)
__device__ __forceinline__ cuFloatComplex warp_reduce_c_complex_single( cuFloatComplex r)
#endif
{
@@ -303,47 +283,7 @@ __device__ __forceinline__ cuFloatComplex warp_reduce_c_single( cuFloatComplex r
#endif
}
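// A hypothetical usage sketch of the renamed complex warp reduction (buffer
// and index names invented for illustration; the reduction body itself is
// elided in this hunk). Each of the 32 lanes of a warp contributes one
// complex value; lane 0 ends up holding the total.
cuDoubleComplex v = dotp_s[threadIdx.x];
v = warp_reduce_c_complex_double<16>(v);  // shuffle offsets 16, 8, 4, 2, 1
if (threadIdx.x == 0) dotp_s[0] = v;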
// Perform 2 reductions, using either 1 or 2 warps
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ __forceinline__ void double_warp_reduce_double(double * dotp_s, int w_off)
#else
__device__ __forceinline__ void float_warp_reduce_single(float * dotp_s, int w_off)
#endif
{
int t_idx = threadIdx.x;
if (HAVE_2_WARPS)
{
// In this case, we have 2 warps, each doing 1 reduction
// attention
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_COMPLEX
dotp_s[w_off + t_idx] = warp_reduce_double<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#else
dotp_s[w_off + t_idx] = warp_reduce_single<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#endif
}
}
else
{
// In this case we have 1 warp that performs both reductions
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
dotp_s[t_idx] = warp_reduce_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
dotp_s[t_idx + 64] = warp_reduce_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#else
dotp_s[t_idx] = warp_reduce_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
dotp_s[t_idx + 64] = warp_reduce_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#endif
}
}
}
#if 0
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ __forceinline__ void double_warp_reduce_complex_double(cuDoubleComplex * dotp_s, int w_off)
@@ -360,9 +300,9 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_COMPLEX
dotp_s[w_off + t_idx] = warp_reduce_c_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32]));
dotp_s[w_off + t_idx] = warp_reduce_c_complex_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32]));
#else
dotp_s[w_off + t_idx] = warp_reduce_c_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32]));
dotp_s[w_off + t_idx] = warp_reduce_c_complex_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32]));
#endif
}
}
@@ -373,20 +313,22 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_COMPLEX
dotp_s[t_idx] = warp_reduce_c_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[t_idx] , dotp_s[t_idx + 32]));
dotp_s[t_idx + 64] = warp_reduce_c_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[t_idx + 64] , dotp_s[t_idx + 96]));
dotp_s[t_idx] = warp_reduce_c_complex_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[t_idx] , dotp_s[t_idx + 32]));
dotp_s[t_idx + 64] = warp_reduce_c_complex_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[t_idx + 64] , dotp_s[t_idx + 96]));
#else
dotp_s[t_idx] = warp_reduce_c_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[t_idx] , dotp_s[t_idx + 32]));
dotp_s[t_idx + 64] = warp_reduce_c_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[t_idx + 64] , dotp_s[t_idx + 96]));
dotp_s[t_idx] = warp_reduce_c_complex_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[t_idx] , dotp_s[t_idx + 32]));
dotp_s[t_idx + 64] = warp_reduce_c_complex_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[t_idx + 64] , dotp_s[t_idx + 96]));
#endif
}
}
}
#endif
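// The disabled helper above assumes a fixed shared-buffer layout. The
// following reading is inferred from the index arithmetic, not stated in
// the source: entries dotp_s[0..63] hold the first dot product and entries
// dotp_s[64..127] the second. Each half is first folded once
// (x[t] + x[t + 32]) and then shuffle-reduced warp-wide, either by two
// warps (one reduction each, selected via w_off) or by a single warp that
// performs both reductions.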
#ifndef ALREADY_DEFINED_SYNC
// Synchronization wrapper, removing explicit synchronization when the thread-block is at most 32 threads (1 warp) in size
template <bool MUST_SYNC>
__device__ __forceinline__ void sync_threads()
__device__ __forceinline__ void sync_real_threads()
{
if (MUST_SYNC)
{
@@ -430,9 +372,9 @@ __device__ void reset_dotp_buffers_complex_single( cuFloatComplex * const __re
}
}
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ void reset_dotp_buffers_complex_2_double( cuDoubleComplex * const __restrict__ s_block)
__device__ void reset_dotp_buffers_2_complex_double( cuDoubleComplex * const __restrict__ s_block)
#else
__device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __restrict__ s_block)
__device__ void reset_dotp_buffers_2_complex_single( cuFloatComplex * const __restrict__ s_block)
#endif
{
if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE)
@@ -466,9 +408,9 @@ __device__ void reset_dotp_buffers_complex_2_single( cuFloatComplex * const __
// Backtransformation kernel
// =========================
#ifdef DOUBLE_PRECISION_COMPLEX
template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_kernel_complex_2_2_double(cuDoubleComplex * const __restrict__ q, const cuDoubleComplex * const __restrict__ hh, const cuDoubleComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_kernel_2_2_complex_double(cuDoubleComplex * const __restrict__ q, const cuDoubleComplex * const __restrict__ hh, const cuDoubleComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else
template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_kernel_complex_2_2_single(cuFloatComplex * const __restrict__ q, const cuFloatComplex * const __restrict__ hh, const cuFloatComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_kernel_2_2_complex_single(cuFloatComplex * const __restrict__ q, const cuFloatComplex * const __restrict__ hh, const cuFloatComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -524,9 +466,9 @@ template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_c_ker
#endif
}
#ifdef DOUBLE_PRECISION_COMPLEX
warp_reduce_complex_1_double( dotp_s);
warp_reduce_1_complex_double( dotp_s);
#else
warp_reduce_complex_1_single( dotp_s);
warp_reduce_1_complex_single( dotp_s);
#endif
__syncthreads();
@@ -580,50 +522,50 @@ extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex*
case 128:
case 64:
#ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_complex_2_2_double<16><<<n_block, n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_double<16><<<n_block, n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_complex_2_2_single<16><<<n_block, n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_single<16><<<n_block, n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 32:
#ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_complex_2_2_double<8><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_double<8><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_complex_2_2_single<8><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_single<8><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 16:
#ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_complex_2_2_double<4><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_double<4><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_complex_2_2_single<4><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_single<4><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 8:
#ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_complex_2_2_double<2><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_double<2><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_complex_2_2_single<2><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_single<2><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 4:
#ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_complex_2_2_double<1><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_double<1><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_complex_2_2_single<1><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_single<1><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 2:
case 1:
#ifdef DOUBLE_PRECISION_COMPLEX
compute_hh_trafo_c_kernel_complex_2_2_double<0><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_double<0><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_complex_2_2_single<0><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_2_2_complex_single<0><<<n_block ,n_thread>>>(q, hh, hh_tau, nb, ldq, off, ncols);
#endif
break;
default:
......
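// A note on the dispatch above, inferred from the case labels: the template
// argument is nb/4, clamped to 16 for nb >= 64 and to 0 for nb <= 2.
// Illustration only (the exact wrapper signature is elided above): for
// nb = 32 the switch launches the <8> instantiation,
//   compute_hh_trafo_c_kernel_2_2_complex_double<8><<<n_block, n_thread>>>(...);
// and for nb <= 2 the <0> instantiation, whose reduction loop unrolls away
// entirely.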
@@ -57,6 +57,114 @@
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256
// Perform the equivalent of "__shfl_xor" on an 8-byte value
#ifdef DOUBLE_PRECISION_REAL
static __device__ __forceinline__ double shfl_xor_real_double(double r, int mask)
#else
static __device__ __forceinline__ float shfl_xor_real_single(float r, int mask)
#endif
{
//#ifdef DOUBLE_PRECISION_REAL
int hi = __shfl_xor(__double2hiint(r), mask);
int lo = __shfl_xor(__double2loint(r), mask);
return __hiloint2double(hi, lo);
//#else
// int hi = __shfl_xor(__float2hiint(r), mask);
// int lo = __shfl_xor(__float2loint(r), mask);
// int hi;
// int lo;
//exit(1);
//return __hiloint2float(hi, lo);
//#endif
}
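// A minimal sketch (not commit code) of the pairing behaviour this primitive
// provides, as used later in the kernel; variable names are invented. With
// mask = 1, lanes 2k and 2k+1 exchange values, so after the add both threads
// of a pair hold the same sum and can later store to the same reduction slot
// without a race.
double my_r1 = partial;                    // one partial result per lane
my_r1 += shfl_xor_real_double(my_r1, 1);   // pairwise exchange-and-add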
// Perform the equivalent of "__shfl_down" on an 8-byte value
#ifdef DOUBLE_PRECISION_REAL
static __device__ __forceinline__ double shfl_down_real_double(double r, int offset)
#else
static __device__ __forceinline__ float shfl_down_real_single(float r, int offset)
#endif
{
//#ifdef DOUBLE_PRECISION_REAL
int hi = __shfl_down(__double2hiint(r), offset);
int lo = __shfl_down(__double2loint(r), offset);
return __hiloint2double(hi, lo);
//#else
// //int hi = __shfl_down(__float2hiint(r), offset);
// //int lo = __shfl_down(__float2loint(r), offset);
// //return __hiloint2float(hi, lo);
// int hi;
// int lo;
// //exit(1);
//#endif
}
// Perform a reduction on a warp or the first part of it
template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ double warp_reduce_real_double(double r)
#else
__device__ __forceinline__ float warp_reduce_real_single(float r)
#endif
{
#pragma unroll
for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
{
#ifdef DOUBLE_PRECISION_REAL
r += shfl_down_real_double(r, i);
#else
r += shfl_down_real_single(r, i);
#endif
}
return r;
}
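// For illustration, a hypothetical kernel built on this helper, assuming a
// single 32-thread block (in/out are invented names; not part of the commit).
// Each lane contributes one value; the unrolled shfl_down ladder
// (offsets 16, 8, 4, 2, 1) leaves the warp-wide sum in lane 0.
__global__ void warp_sum_sketch(const double * in, double * out)
{
    double v = warp_reduce_real_double<16>(in[threadIdx.x]);
    if (threadIdx.x == 0)
        *out = v;
}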
// Perform 2 reductions, using either 1 or 2 warps
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ void double_warp_reduce_real_double(double * dotp_s, int w_off)
#else
__device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, int w_off)
#endif
{
int t_idx = threadIdx.x;
if (HAVE_2_WARPS)
{
// In this case, we have 2 warps, each doing 1 reduction
// attention
if (t_idx < 64)
{
#ifdef DOUBLE_PRECISION_REAL
dotp_s[w_off + t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#else
dotp_s[w_off + t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#endif
}
}
else
{
// In this case we have 1 warp that performs both reductions
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_REAL
dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#else
dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#endif
}
}
}
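// A hedged reading of the template parameters above (inferred, not stated in
// the source): HAVE_2_WARPS is a compile-time flag, so the if/else it guards
// costs nothing at run time. The host wrapper further down pairs it with the
// block size:
//   nb >= 64 -> <16, true>    (two warps, one reduction each)
//   nb <= 32 -> <8..0, false> (one warp folds both reductions)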
#if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
{
@@ -94,9 +202,9 @@ static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComp
// Reset the entire contents of a shared reduction block; the thread block size must be a power-of-2
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ void reset_dotp_buffers_double(double * const __restrict__ s_block)
__device__ __forceinline__ void reset_dotp_buffers_real_double(double * const __restrict__ s_block)
#else
__device__ __forceinline__ void reset_dotp_buffers_single(float * const __restrict__ s_block)
__device__ __forceinline__ void reset_dotp_buffers_real_single(float * const __restrict__ s_block)
#endif
{
// attention
@@ -129,10 +237,10 @@ __device__ __forceinline__ void reset_dotp_buffers_single(float * const __restri
// We use templates here to avoid additional branching based on the actual size of the thread-block
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_real_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_kernel_real_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif
@@ -163,9 +271,9 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
// The entire contents of the shared reduction buffers must be reset
#ifdef DOUBLE_PRECISION_REAL
reset_dotp_buffers_double(dotp_s);
reset_dotp_buffers_real_double(dotp_s);
#else
reset_dotp_buffers_single(dotp_s);
reset_dotp_buffers_real_single(dotp_s);
#endif
// Compute initial access indices
@@ -183,7 +291,7 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
}
// Ensure the ring buffer and reduction buffers are initialized
sync_threads<HAVE_2_WARPS>();
sync_real_threads<HAVE_2_WARPS>();
while (j >= off + 1)
{
@@ -217,11 +325,11 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
// After using "shfl_xor", both threads in a pair will hold the same values
#ifdef DOUBLE_PRECISION_REAL
my_r1 += shfl_xor_double(my_r1, 1);
my_r2 += shfl_xor_double(my_r2, 1);
my_r1 += shfl_xor_real_double(my_r1, 1);
my_r2 += shfl_xor_real_double(my_r2, 1);
#else
my_r1 += shfl_xor_single(my_r1, 1);
my_r2 += shfl_xor_single(my_r2, 1);
my_r1 += shfl_xor_real_single(my_r1, 1);
my_r2 += shfl_xor_real_single(my_r2, 1);
#endif
// Now both threads in a pair can write to the same reduction buffer address without race-condition issues
@@ -230,16 +338,16 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
dotp_s[t_s + 64] = my_r2;
// Ensure the reduction buffers are fully populated
sync_threads<HAVE_2_WARPS>();
sync_real_threads<HAVE_2_WARPS>();
// Perform the 2 reductions using only the first warp (we assume the warp size is 32, valid up to CC 3.x)
#ifdef DOUBLE_PRECISION_REAL
double_warp_reduce_double<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
double_warp_reduce_real_double<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
#else
float_warp_reduce_single<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
float_warp_reduce_real_single<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
#endif
// Ensure every thread will have access to the reduction results
sync_threads<HAVE_2_WARPS>();
sync_real_threads<HAVE_2_WARPS>();
// Each thread collects the reduction results
s_1 = dotp_s[0];
@@ -262,7 +370,7 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
q_s[t_idx + 2] = q_v_2;
}
sync_threads<HAVE_2_WARPS>();
sync_real_threads<HAVE_2_WARPS>();
// Update access indices
q_off -= q_delta;
@@ -297,26 +405,26 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
// We prepare the reduction buffer
my_r1 = q_v_1 * hh_v_1 * tau_1;
#ifdef DOUBLE_PRECISION_REAL
my_r1 += shfl_xor_double(my_r1, 1);
my_r1 += shfl_xor_real_double(my_r1, 1);
#else
my_r1 += shfl_xor_single(my_r1, 1);
my_r1 += shfl_xor_real_single(my_r1, 1);
#endif
dotp_s[t_s] = my_r1;
sync_threads<HAVE_2_WARPS>();
sync_real_threads<HAVE_2_WARPS>();
// We perform the reduction using the first warp only
// attention
if (t_idx < 32)
{
#ifdef DOUBLE_PRECISION_REAL
dotp_s[t_idx] = warp_reduce_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
#else
dotp_s[t_idx] = warp_reduce_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
#endif
}
sync_threads<HAVE_2_WARPS>();
sync_real_threads<HAVE_2_WARPS>();
// The last EV components are written to the EV matrix
q[q_off] = q_v_1 - hh_v_1 * dotp_s[0];
@@ -325,9 +433,9 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
// This is a host wrapper for calling the appropriate back-transformation kernel, based on the SCALAPACK block size
#ifdef DOUBLE_PRECISION_REAL
extern "C" void launch_compute_hh_trafo_c_kernel_double(double * const q, const double * const hh, const double * const hh_dot, const double * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
extern "C" void launch_compute_hh_trafo_c_kernel_real_double(double * const q, const double * const hh, const double * const hh_dot, const double * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
#else
extern "C" void launch_compute_hh_trafo_c_kernel_single(float * const q, const float * const hh, const float * const hh_dot, const float * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
extern "C" void launch_compute_hh_trafo_c_kernel_real_single(float * const q, const float * const hh, const float * const hh_dot, const float * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
#endif
{
switch (nb)
@@ -336,50 +444,50 @@ __global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_c_k
case 128:
case 64:
#ifdef DOUBLE_PRECISION_REAL
compute_hh_trafo_c_kernel_double<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_double<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_single<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_single<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 32:
#ifdef DOUBLE_PRECISION_REAL
compute_hh_trafo_c_kernel_double<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_double<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_single<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_single<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 16:
#ifdef DOUBLE_PRECISION_REAL
compute_hh_trafo_c_kernel_double<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_double<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_single<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_single<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 8:
#ifdef DOUBLE_PRECISION_REAL
compute_hh_trafo_c_kernel_double<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_double<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_single<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_single<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 4:
#ifdef DOUBLE_PRECISION_REAL
compute_hh_trafo_c_kernel_double<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_double<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_single<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_single<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#endif
break;
case 2:
case 1:
#ifdef DOUBLE_PRECISION_REAL
compute_hh_trafo_c_kernel_double<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_double<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#else
compute_hh_trafo_c_kernel_single<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
compute_hh_trafo_c_kernel_real_single<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
#endif
break;
......
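// Finally, a hypothetical host-side call of the renamed real launcher
// (device pointer names invented; the signature itself is shown above).
// With nb = 64 the wrapper dispatches
// compute_hh_trafo_c_kernel_real_double<16, true> on an <<<nev, nb>>> grid,
// i.e. presumably one thread block per eigenvector.
launch_compute_hh_trafo_c_kernel_real_double(q_dev, hh_dev, hh_dot_dev, hh_tau_dev,
                                             nev, 64, ldq, off, ncols);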
@@ -75,7 +75,7 @@
! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
call launch_my_pack_c_kernel_&
call launch_my_pack_gpu_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION &
@@ -156,7 +156,7 @@
! Use one kernel call to pack the entire row group
! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
call launch_my_unpack_