Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
1ca304fb
Commit
1ca304fb
authored
Mar 29, 2017
by
Andreas Marek
Browse files
Cleanup of cuda kernel implementatios
parent
d8499a48
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu
View file @
1ca304fb
...
...
@@ -64,27 +64,6 @@
// devices with compute capability 3.x; for older devices, please use the Fortran kernel version
// ===========================================================================================================
//// Perform the equivalent of "__shfl_xor" on an 8-byte value
//#ifdef DOUBLE_PRECISION_COMPLEX
//static __device__ __forceinline__ double shfl_xor_double(double r, int mask)
//#else
//static __device__ __forceinline__ float shfl_xor_single(float r, int mask)
//#endif
//{
////#ifdef DOUBLE_PRECISION_COMPLEX
// int hi = __shfl_xor(__double2hiint(r), mask);
// int lo = __shfl_xor(__double2loint(r), mask);
//
// return __hiloint2double(hi, lo);
////#else
//// int hi = __shfl_xor(__float2hiint(r), mask);
//// int lo = __shfl_xor(__float2loint(r), mask);
//// int hi;
//// int lo;
// //exit(1);
// //return __hiloint2float(hi, lo);
////#endif
//}
#if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
...
...
@@ -111,25 +90,15 @@ static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComp
// Perform the equivalent of "__shfl_down" on an 8-byte value
#ifdef DOUBLE_PRECISION_COMPLEX
static __device__ __forceinline__ double shfl_down_double(double r, int offset)
static __device__ __forceinline__ double shfl_down_
complex_
double(double r, int offset)
#else
static __device__ __forceinline__ float shfl_down_single(float r, int offset)
static __device__ __forceinline__ float shfl_down_
complex_
single(float r, int offset)
#endif
{
//#ifdef DOUBLE_PRECISION_COMPLEX
int hi = __shfl_down(__double2hiint(r), offset);
int lo = __shfl_down(__double2loint(r), offset);
return __hiloint2double(hi, lo);
//#else
// //int hi = __shfl_down(__float2hiint(r), offset);
// //int lo = __shfl_down(__float2loint(r), offset);
// //return __hiloint2float(hi, lo);
// int hi;
// int lo;
// //exit(1);
//#endif
}
#ifdef DOUBLE_PRECISION_COMPLEX
...
...
@@ -261,18 +230,18 @@ __device__ __forceinline__ cuFloatComplex warp_reduce_c_complex_single( cuFloatC
for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
{
#ifdef DOUBLE_PRECISION_COMPLEX
real += shfl_down_double(real, i);
real += shfl_down_
complex_
double(real, i);
#else
real += shfl_down_single(real, i);
real += shfl_down_
complex_
single(real, i);
#endif
}
#pragma unroll
for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
{
#ifdef DOUBLE_PRECISION_COMPLEX
imag += shfl_down_double(imag, i);
imag += shfl_down_
complex_
double(imag, i);
#else
imag += shfl_down_single(imag, i);
imag += shfl_down_
complex_
single(imag, i);
#endif
}
...
...
@@ -283,7 +252,7 @@ __device__ __forceinline__ cuFloatComplex warp_reduce_c_complex_single( cuFloatC
#endif
}
#if 0
#if 0
/* not used anywhere */
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_COMPLEX
__device__ __forceinline__ void double_warp_reduce_complex_double(cuDoubleComplex * dotp_s, int w_off)
...
...
@@ -322,7 +291,7 @@ __device__ __forceinline__ void float_warp_reduce_complex_single(cuFloatComplex
}
}
}
#endif
#endif
/* not used anywhere */
#ifndef ALREADY_DEFINED_SYNC
...
...
src/ev_tridi_band_gpu_c_v2_real_template.Xcu
View file @
1ca304fb
...
...
@@ -64,19 +64,10 @@ static __device__ __forceinline__ double shfl_xor_real_double(double r, int mask
static __device__ __forceinline__ float shfl_xor_real_single(float r, int mask)
#endif
{
//#ifdef DOUBLE_PRECISION_REAL
int hi = __shfl_xor(__double2hiint(r), mask);
int lo = __shfl_xor(__double2loint(r), mask);
return __hiloint2double(hi, lo);
//#else
// int hi = __shfl_xor(__float2hiint(r), mask);
// int lo = __shfl_xor(__float2loint(r), mask);
// int hi;
// int lo;
//exit(1);
//return __hiloint2float(hi, lo);
//#endif
}
// Perform the equivalent of "__shfl_down" on an 8-byte value
...
...
@@ -86,24 +77,12 @@ static __device__ __forceinline__ double shfl_down_real_double(double r, int off
static __device__ __forceinline__ float shfl_down_real_single(float r, int offset)
#endif
{
//#ifdef DOUBLE_PRECISION_REAL
int hi = __shfl_down(__double2hiint(r), offset);
int lo = __shfl_down(__double2loint(r), offset);
return __hiloint2double(hi, lo);
//#else
// //int hi = __shfl_down(__float2hiint(r), offset);
// //int lo = __shfl_down(__float2loint(r), offset);
// //return __hiloint2float(hi, lo);
// int hi;
// int lo;
// //exit(1);
//#endif
}
// Perform a reduction on a warp or the first part of it
template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -165,41 +144,6 @@ __device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, in
}
}
#if 0
static __device__ __forceinline__ cuDoubleComplex shfl_xor_complex(cuDoubleComplex r, int mask)
{
double real = cuCreal(r) ;
double imag = cuCimag(r);
int hr = __shfl_xor(__double2hiint(real), mask);
int lr = __shfl_xor(__double2loint(real), mask);
int hi = __shfl_xor(__double2hiint(imag), mask);
int li = __shfl_xor(__double2loint(imag), mask);
real = __hiloint2double(hr, lr);
imag = __hiloint2double(hi, li);
return make_cuDoubleComplex(real, imag);
}
#endif
//#ifndef ALREADY_DEFINED_SYNC
//// Synchronization wrapper, removing explicit synchronization when the thread-block is at most 32 threads (1 warp) in size
//template <bool MUST_SYNC>
//__device__ __forceinline__ void sync_threads()
//{
// if (MUST_SYNC)
// {
// __syncthreads();
// }
//}
//#define ALREADY_DEFINED_SYNC 1
//#endif
// Reset the entire contents of a shared reduction block; the thread block size must be a power-of-2
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ void reset_dotp_buffers_real_double(double * const __restrict__ s_block)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment