Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
ee776ac8
Commit
ee776ac8
authored
Mar 05, 2021
by
Andreas Marek
Browse files
Merge branch 'ELPA_ROCm' into oneAPI
parents
cc1bf575
b0232235
Changes
21
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
ee776ac8
...
...
@@ -143,7 +143,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
endif
if
WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_gpu_complex.cu
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_
nvidia_
gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_
nvidia_
gpu_complex.cu
endif
if
WITH_INTEL_GPU_VERSION
...
...
@@ -151,7 +151,7 @@ if WITH_INTEL_GPU_VERSION
endif
if
WITH_AMD_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/
CUDA
/ev_tridi_band_gpu_real.c
u
src/elpa2/GPU/
CUDA
/ev_tridi_band_gpu_complex.c
u
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/
ROCm
/ev_tridi_band_
amd_
gpu_real.c
pp
src/elpa2/GPU/
ROCm
/ev_tridi_band_
amd_
gpu_complex.c
pp
endif
if
!WITH_MPI
...
...
configure.ac
View file @
ee776ac8
...
...
@@ -881,11 +881,14 @@ m4_define(elpa_m4_bgq_kernels, [
m4_define(elpa_m4_nvidia_gpu_kernels, [
real_nvidia_gpu
complex_nvidia_gpu
])
m4_define(elpa_m4_amd_gpu_kernels, [
real_amd_gpu
complex_amd_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu
amd_gpu
])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -930,6 +933,7 @@ ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([amd_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
...
...
@@ -971,17 +975,28 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi
])
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
[Compile and always use the GPU version])],
[],[with_gpu_support_only=no])
if test x"$with_gpu_support_only" = x"yes" ; then
AC_ARG_WITH(
nvidia-
gpu-support-only, [AS_HELP_STRING([--with-
nvidia-
gpu-support-only],
[Compile and always use the
NVIDIA
GPU version])],
[],[with_
nvidia_
gpu_support_only=no])
if test x"$with_
nvidia_
gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_nvida_gpu=yes
use_real_nvid
i
a_gpu=yes
use_complex_nvidia_gpu=yes
fi
AC_ARG_WITH(amd-gpu-support-only, [AS_HELP_STRING([--with-amd-gpu-support-only],
[Compile and always use the AMD GPU version])],
[],[with_amd_gpu_support_only=no])
if test x"$with_amd_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_amd_gpu=yes
use_complex_amd_gpu=yes
fi
dnl
dnl ELPA_KERNEL_DEPENDS([kernel],[other kernels])
...
...
@@ -1056,7 +1071,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels
elpa_m4_amd_gpu_kernels
,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -1477,6 +1492,11 @@ AC_ARG_ENABLE([gpu],
fi],
[use_gpu=no])
AC_MSG_RESULT([${use_gpu}])
if test x"${use_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
fi
AC_MSG_CHECKING(whether Nvidia-GPU version should be used)
AC_ARG_ENABLE([Nvidia-gpu],
...
...
@@ -1494,11 +1514,6 @@ if test x"${use_nvidia_gpu}" = x"yes" ; then
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
fi
if test x"${use_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
fi
...
...
@@ -1532,11 +1547,8 @@ AC_ARG_ENABLE([AMD-gpu],
AC_MSG_RESULT([${use_amd_gpu}])
if test x"${use_amd_gpu}" = x"yes" ; then
need_amd_gpu=yes
########################################
# must be changed
#######################################
use_real_amd_gpu=no
use_complex_amd_gpu=no
use_real_amd_gpu=yes
use_complex_amd_gpu=yes
fi
...
...
@@ -1569,8 +1581,7 @@ fi
if test x"${need_amd_gpu}" = x"yes" ; then
echo "no amd gpu yet"
#AC_LANG_PUSH([C])
AC_LANG_PUSH([C])
#CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
#LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
#NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
...
...
@@ -1578,18 +1589,18 @@ if test x"${need_amd_gpu}" = x"yes" ; then
#AC_SUBST(NVCC)
#AC_SUBST(NVCCFLAGS)
#
dnl check whether
nv
cc compiler is found
#
AC_CHECK_PROG(
nv
cc_found,
nv
cc,yes,no)
#
if test x"${
nv
cc_found}" = x"no" ; then
#
AC_MSG_ERROR([
nv
cc not found; try to set the
cuda
-path or disable
Nvidia
GPU support])
#
fi
dnl check whether
hip
cc compiler is found
AC_CHECK_PROG(
hip
cc_found,
hip
cc,yes,no)
if test x"${
hip
cc_found}" = x"no" ; then
AC_MSG_ERROR([
hip
cc not found; try to set the
hip
-path or disable
AMD
GPU support])
fi
#dnl check whether we find c
u
blas
#
AC_SEARCH_LIBS([c
u
blas
D
gemm],[c
u
blas],[have_c
u
blas=yes],[have_c
u
blas=no])
#
if test x"${have_c
u
blas}" = x"no"; then
#
AC_MSG_ERROR([Could not link c
u
blas; try to set the
cuda
-path or disable
Nvidia
GPU support])
#
fi
#AC_SEARCH_LIBS([
cuda
Memcpy],[cudart],[have_cudart=yes],[have_cudart=no])
#dnl check whether we find
ro
cblas
AC_SEARCH_LIBS([
ro
cblas
_d
gemm],[
ro
cblas],[have_
ro
cblas=yes],[have_
ro
cblas=no])
if test x"${have_
ro
cblas}" = x"no"; then
AC_MSG_ERROR([Could not link
ro
cblas; try to set the
hip
-path or disable
AMD
GPU support])
fi
#AC_SEARCH_LIBS([
hip
Memcpy],[cudart],[have_cudart=yes],[have_cudart=no])
#if test x"${have_cudart}" = x"no"; then
# AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable Nvidia GPU support])
#fi
...
...
@@ -1624,8 +1635,8 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kernel)[_COMPILED])
])
AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_
nvidia_
gpu" = x"yes" -o x"$use_complex_
nvidia_
gpu" = x"yes"])
if test x"$use_real_
nvidia_
gpu" = x"yes" -o x"$use_complex_
nvidia_
gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
...
...
@@ -1657,7 +1668,13 @@ else
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED])
echo "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
echo "$use_real_amd_gpu"
echo "$use_complex_amd_gpu"
AM_CONDITIONAL([WITH_AMD_GPU_VERSION],[test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes"])
if test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes" ; then
...
...
elpa/elpa_constants.h.in
View file @
ee776ac8
...
...
@@ -51,26 +51,27 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 31, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 34, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 37, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_AMD_GPU, 19, @ELPA_2STAGE_REAL_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 26, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 27, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 28, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
...
@@ -104,7 +105,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
src/GPU/CUDA/cuUtils_template.cu
View file @
ee776ac8
...
...
@@ -64,16 +64,16 @@
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
my_pack_c_kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
#else
__global__
void
my_pack_c_kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
my_pack_c_kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
#else
__global__
void
my_pack_c_kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
#endif
#endif
{
...
...
@@ -97,16 +97,16 @@ __global__ void my_pack_c_kernel_complex_single(const int n_offset, const int ma
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
my_unpack_c_kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
#else
__global__
void
my_unpack_c_kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
my_unpack_c_kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
#else
__global__
void
my_unpack_c_kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
#endif
#endif
{
...
...
@@ -128,16 +128,16 @@ __global__ void my_unpack_c_kernel_complex_single(const int n_offset, const int
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
extract_hh_tau_c_kernel_real_double
(
double
*
hh
,
double
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_real_double
(
double
*
hh
,
double
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#else
__global__
void
extract_hh_tau_c_kernel_real_single
(
float
*
hh
,
float
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_real_single
(
float
*
hh
,
float
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
extract_hh_tau_c_kernel_complex_double
(
cuDoubleComplex
*
hh
,
cuDoubleComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_complex_double
(
cuDoubleComplex
*
hh
,
cuDoubleComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#else
__global__
void
extract_hh_tau_c_kernel_complex_single
(
cuFloatComplex
*
hh
,
cuFloatComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_complex_single
(
cuFloatComplex
*
hh
,
cuFloatComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#endif
#endif
{
...
...
@@ -181,16 +181,16 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex *hh, cuFlo
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extern
"C"
void
launch_my_pack_c_kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
a_dev
,
double
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
a_dev
,
double
*
row_group_dev
)
#else
extern
"C"
void
launch_my_pack_c_kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
a_dev
,
float
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
a_dev
,
float
*
row_group_dev
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extern
"C"
void
launch_my_pack_c_kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
a_dev
,
cuDoubleComplex
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
a_dev
,
cuDoubleComplex
*
row_group_dev
)
#else
extern
"C"
void
launch_my_pack_c_kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
a_dev
,
cuFloatComplex
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
a_dev
,
cuFloatComplex
*
row_group_dev
)
#endif
#endif
{
...
...
@@ -202,16 +202,16 @@ extern "C" void launch_my_pack_c_kernel_complex_single(const int row_count, cons
{
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
my_pack_c_kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#else
my_pack_c_kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
my_pack_c_kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#else
my_pack_c_kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#endif
#endif
}
...
...
@@ -219,22 +219,22 @@ extern "C" void launch_my_pack_c_kernel_complex_single(const int row_count, cons
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"
\n
my pack_kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"
\n
my pack_
cuda_
kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extern
"C"
void
launch_extract_hh_tau_c_kernel_real_double
(
double
*
bcast_buffer_dev
,
double
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_real_double
(
double
*
bcast_buffer_dev
,
double
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#else
extern
"C"
void
launch_extract_hh_tau_c_kernel_real_single
(
float
*
bcast_buffer_dev
,
float
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_real_single
(
float
*
bcast_buffer_dev
,
float
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extern
"C"
void
launch_extract_hh_tau_c_kernel_complex_double
(
cuDoubleComplex
*
bcast_buffer_dev
,
cuDoubleComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_complex_double
(
cuDoubleComplex
*
bcast_buffer_dev
,
cuDoubleComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#else
extern
"C"
void
launch_extract_hh_tau_c_kernel_complex_single
(
cuFloatComplex
*
bcast_buffer_dev
,
cuFloatComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_complex_single
(
cuFloatComplex
*
bcast_buffer_dev
,
cuFloatComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#endif
#endif
{
...
...
@@ -243,38 +243,38 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex *bc
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extract_hh_tau_c_kernel_real_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_real_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#else
extract_hh_tau_c_kernel_real_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_real_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extract_hh_tau_c_kernel_complex_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_complex_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#else
extract_hh_tau_c_kernel_complex_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_complex_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#endif
#endif
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"
\n
extract _kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"
\n
extract
_cuda
_kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extern
"C"
void
launch_my_unpack_c_kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
row_group_dev
,
double
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
row_group_dev
,
double
*
a_dev
)
#else
extern
"C"
void
launch_my_unpack_c_kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
row_group_dev
,
float
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
row_group_dev
,
float
*
a_dev
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extern
"C"
void
launch_my_unpack_c_kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
row_group_dev
,
cuDoubleComplex
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
row_group_dev
,
cuDoubleComplex
*
a_dev
)
#else
extern
"C"
void
launch_my_unpack_c_kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
row_group_dev
,
cuFloatComplex
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
row_group_dev
,
cuFloatComplex
*
a_dev
)
#endif
#endif
{
...
...
@@ -286,16 +286,16 @@ extern "C" void launch_my_unpack_c_kernel_complex_single(const int row_count, co
{
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
my_unpack_c_kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#else
my_unpack_c_kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
my_unpack_c_kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#else
my_unpack_c_kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#endif
#endif
}
...
...
@@ -303,7 +303,7 @@ extern "C" void launch_my_unpack_c_kernel_complex_single(const int row_count, co
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"
\n
my_unpack_c_kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"
\n
my_unpack_c_
cuda_
kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
...
...
src/GPU/CUDA/elpa_index_nvidia_gpu.cu
View file @
ee776ac8
// Copyright 2021, A. Marek MPCDF
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
extern
"C"
{
int
nvidia_gpu_count
()
{
int
count
;
...
...
src/GPU/ROCm/elpa_index_amd_gpu.cpp
View file @
ee776ac8
// Copyright 2021, A. Marek MPCDF
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
#include <hip/hip_runtime.h>
extern
"C"
{
int
amd_gpu_count
()
{
int
count
;
...
...
src/GPU/ROCm/hipUtils.cpp
View file @
ee776ac8
// Copyright 2021, A. Marek MPCDF
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
...
...
src/GPU/ROCm/hipUtils_template.cpp
View file @
ee776ac8
...
...
@@ -51,30 +51,32 @@
// written by A. Marek, MPCDF
#endif
#include "config-f90.h"
#include "hip/hip_runtime.h"
//#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#if COMPLEXCASE == 1
//
#include <
cuC
omplex.h>
#include <
hip/hip_c
omplex.h>
#endif
#define MAX_BLOCK_SIZE 1024
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
my_pack_c_kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
hip_
kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
#else
__global__
void
my_pack_c_kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
hip_
kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
my_pack_c_kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cu
DoubleComplex
*
src
,
cu
DoubleComplex
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
hip_
kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
hip
DoubleComplex
*
src
,
hip
DoubleComplex
*
dst
,
int
i_off
)
#else
__global__
void
my_pack_c_kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cu
FloatComplex
*
src
,
cu
FloatComplex
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
hip_
kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
hip
FloatComplex
*
src
,
hip
FloatComplex
*
dst
,
int
i_off
)
#endif
#endif
{
...
...
@@ -98,16 +100,16 @@ __global__ void my_pack_c_kernel_complex_single(const int n_offset, cons