Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
69cc6fc5
Commit
69cc6fc5
authored
Mar 04, 2021
by
Andreas Marek
Browse files
Make ELPA AMD GPU version compile
parent
fc6fb7ad
Changes
21
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
69cc6fc5
...
...
@@ -142,11 +142,11 @@ libelpa@SUFFIX@_private_la_SOURCES += \
endif
if
WITH_NVIDIA_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_gpu_complex.cu
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/CUDA/elpa_index_nvidia_gpu.cu src/GPU/CUDA/cudaFunctions.cu src/GPU/CUDA/cuUtils.cu src/elpa2/GPU/CUDA/ev_tridi_band_
nvidia_
gpu_real.cu src/elpa2/GPU/CUDA/ev_tridi_band_
nvidia_
gpu_complex.cu
endif
if
WITH_AMD_GPU_VERSION
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/
CUDA
/ev_tridi_band_gpu_real.c
u
src/elpa2/GPU/
CUDA
/ev_tridi_band_gpu_complex.c
u
libelpa@SUFFIX@
_private_la_SOURCES
+=
src/GPU/ROCm/elpa_index_amd_gpu.cpp src/GPU/ROCm/rocmFunctions.cpp src/GPU/ROCm/hipUtils.cpp src/elpa2/GPU/
ROCm
/ev_tridi_band_
amd_
gpu_real.c
pp
src/elpa2/GPU/
ROCm
/ev_tridi_band_
amd_
gpu_complex.c
pp
endif
if
!WITH_MPI
...
...
configure.ac
View file @
69cc6fc5
...
...
@@ -878,12 +878,17 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
])
m4_define(elpa_m4_gpu_kernels, [
real_gpu
complex_gpu
m4_define(elpa_m4_
nvidia_
gpu_kernels, [
real_
nvidia_
gpu
complex_
nvidia_
gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_amd_gpu_kernels, [
real_amd_gpu
complex_amd_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu amd_gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -927,7 +932,8 @@ ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([amd_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
...
...
@@ -969,15 +975,26 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
fi
])
AC_ARG_WITH(gpu-support-only, [AS_HELP_STRING([--with-gpu-support-only],
[Compile and always use the GPU version])],
[],[with_gpu_support_only=no])
if test x"$with_gpu_support_only" = x"yes" ; then
AC_ARG_WITH(nvidia-gpu-support-only, [AS_HELP_STRING([--with-nvidia-gpu-support-only],
[Compile and always use the NVIDIA GPU version])],
[],[with_nvidia_gpu_support_only=no])
if test x"$with_nvidia_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_nvidia_gpu=yes
use_complex_nvidia_gpu=yes
fi
AC_ARG_WITH(amd-gpu-support-only, [AS_HELP_STRING([--with-amd-gpu-support-only],
[Compile and always use the AMD GPU version])],
[],[with_amd_gpu_support_only=no])
if test x"$with_amd_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_gpu=yes
use_complex_gpu=yes
use_real_
amd_
gpu=yes
use_complex_
amd_
gpu=yes
fi
...
...
@@ -1054,7 +1071,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_
nvidia_gpu_kernels elpa_m4_amd_
gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -1477,8 +1494,8 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_gpu=yes
use_complex_gpu=yes
use_real_
nvidia_
gpu=yes
use_complex_
nvidia_
gpu=yes
fi
...
...
@@ -1495,11 +1512,8 @@ AC_ARG_ENABLE([AMD-gpu],
AC_MSG_RESULT([${use_amd_gpu}])
if test x"${use_amd_gpu}" = x"yes" ; then
need_amd_gpu=yes
########################################
# must be changed
#######################################
use_real_gpu=no
use_complex_gpu=no
use_real_amd_gpu=yes
use_complex_amd_gpu=yes
fi
...
...
@@ -1532,8 +1546,7 @@ fi
if test x"${need_amd_gpu}" = x"yes" ; then
echo "no amd gpu yet"
#AC_LANG_PUSH([C])
AC_LANG_PUSH([C])
#CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
#LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
#NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
...
...
@@ -1541,18 +1554,18 @@ if test x"${need_amd_gpu}" = x"yes" ; then
#AC_SUBST(NVCC)
#AC_SUBST(NVCCFLAGS)
#
dnl check whether
nv
cc compiler is found
#
AC_CHECK_PROG(
nv
cc_found,
nv
cc,yes,no)
#
if test x"${
nv
cc_found}" = x"no" ; then
#
AC_MSG_ERROR([
nv
cc not found; try to set the
cuda
-path or disable
Nvidia
GPU support])
#
fi
dnl check whether
hip
cc compiler is found
AC_CHECK_PROG(
hip
cc_found,
hip
cc,yes,no)
if test x"${
hip
cc_found}" = x"no" ; then
AC_MSG_ERROR([
hip
cc not found; try to set the
hip
-path or disable
AMD
GPU support])
fi
#dnl check whether we find c
u
blas
#
AC_SEARCH_LIBS([c
u
blas
D
gemm],[c
u
blas],[have_c
u
blas=yes],[have_c
u
blas=no])
#
if test x"${have_c
u
blas}" = x"no"; then
#
AC_MSG_ERROR([Could not link c
u
blas; try to set the
cuda
-path or disable
Nvidia
GPU support])
#
fi
#AC_SEARCH_LIBS([
cuda
Memcpy],[cudart],[have_cudart=yes],[have_cudart=no])
#dnl check whether we find
ro
cblas
AC_SEARCH_LIBS([
ro
cblas
_d
gemm],[
ro
cblas],[have_
ro
cblas=yes],[have_
ro
cblas=no])
if test x"${have_
ro
cblas}" = x"no"; then
AC_MSG_ERROR([Could not link
ro
cblas; try to set the
hip
-path or disable
AMD
GPU support])
fi
#AC_SEARCH_LIBS([
hip
Memcpy],[cudart],[have_cudart=yes],[have_cudart=no])
#if test x"${have_cudart}" = x"no"; then
# AC_MSG_ERROR([Could not link cudart; try to set the cuda-path or disable Nvidia GPU support])
#fi
...
...
@@ -1587,12 +1600,12 @@ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
AC_SUBST([ELPA_2STAGE_]m4_toupper(elpa_m4_kernel)[_COMPILED])
])
AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes"])
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_
nvidia_
gpu" = x"yes" -o x"$use_complex_
nvidia_
gpu" = x"yes"])
if test x"$use_real_
nvidia_
gpu" = x"yes" -o x"$use_complex_
nvidia_
gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED=1
ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED=1
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
...
...
@@ -1617,11 +1630,17 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_LANG_POP([C])
fi
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED=0
ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED])
echo "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
echo "$use_real_amd_gpu"
echo "$use_complex_amd_gpu"
AM_CONDITIONAL([WITH_AMD_GPU_VERSION],[test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes"])
if test x"$use_real_amd_gpu" = x"yes" -o x"$use_complex_amd_gpu" = x"yes" ; then
AC_DEFINE([WITH_AMD_GPU_VERSION],[1],[enable AMD GPU support])
...
...
@@ -1655,10 +1674,8 @@ else
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
...
...
elpa/elpa_constants.h.in
View file @
69cc6fc5
...
...
@@ -50,27 +50,28 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 28, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 31, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 34, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 37, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AMD_GPU, 19, @ELPA_2STAGE_REAL_AMD_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 22, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 25, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 26, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 27, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 28, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK2, 29, @ELPA_2STAGE_REAL_SVE128_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK4, 30, @ELPA_2STAGE_REAL_SVE128_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE128_BLOCK6, 31, @ELPA_2STAGE_REAL_SVE128_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK2, 32, @ELPA_2STAGE_REAL_SVE256_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK4, 33, @ELPA_2STAGE_REAL_SVE256_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE256_BLOCK6, 34, @ELPA_2STAGE_REAL_SVE256_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK2, 35, @ELPA_2STAGE_REAL_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK4, 36, @ELPA_2STAGE_REAL_SVE512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SVE512_BLOCK6, 37, @ELPA_2STAGE_REAL_SVE512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 38, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 39, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
...
...
@@ -104,7 +105,8 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_AMD_GPU, 23, @ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
src/GPU/CUDA/cuUtils_template.cu
View file @
69cc6fc5
...
...
@@ -64,16 +64,16 @@
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
my_pack_c_kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
#else
__global__
void
my_pack_c_kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
my_pack_c_kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
#else
__global__
void
my_pack_c_kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
cuda_
kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
#endif
#endif
{
...
...
@@ -97,16 +97,16 @@ __global__ void my_pack_c_kernel_complex_single(const int n_offset, const int ma
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
my_unpack_c_kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
#else
__global__
void
my_unpack_c_kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_real_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
float
*
src
,
float
*
dst
,
int
i_off
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
my_unpack_c_kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_complex_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuDoubleComplex
*
src
,
cuDoubleComplex
*
dst
,
int
i_off
)
#else
__global__
void
my_unpack_c_kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
__global__
void
my_unpack_c_
cuda_
kernel_complex_single
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
cuFloatComplex
*
src
,
cuFloatComplex
*
dst
,
int
i_off
)
#endif
#endif
{
...
...
@@ -128,16 +128,16 @@ __global__ void my_unpack_c_kernel_complex_single(const int n_offset, const int
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
extract_hh_tau_c_kernel_real_double
(
double
*
hh
,
double
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_real_double
(
double
*
hh
,
double
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#else
__global__
void
extract_hh_tau_c_kernel_real_single
(
float
*
hh
,
float
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_real_single
(
float
*
hh
,
float
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
__global__
void
extract_hh_tau_c_kernel_complex_double
(
cuDoubleComplex
*
hh
,
cuDoubleComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_complex_double
(
cuDoubleComplex
*
hh
,
cuDoubleComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#else
__global__
void
extract_hh_tau_c_kernel_complex_single
(
cuFloatComplex
*
hh
,
cuFloatComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
__global__
void
extract_hh_tau_c_
cuda_
kernel_complex_single
(
cuFloatComplex
*
hh
,
cuFloatComplex
*
hh_tau
,
const
int
nbw
,
const
int
n
,
int
val
)
#endif
#endif
{
...
...
@@ -181,16 +181,16 @@ __global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex *hh, cuFlo
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extern
"C"
void
launch_my_pack_c_kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
a_dev
,
double
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
a_dev
,
double
*
row_group_dev
)
#else
extern
"C"
void
launch_my_pack_c_kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
a_dev
,
float
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
a_dev
,
float
*
row_group_dev
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extern
"C"
void
launch_my_pack_c_kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
a_dev
,
cuDoubleComplex
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
a_dev
,
cuDoubleComplex
*
row_group_dev
)
#else
extern
"C"
void
launch_my_pack_c_kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
a_dev
,
cuFloatComplex
*
row_group_dev
)
extern
"C"
void
launch_my_pack_c_
cuda_
kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
a_dev
,
cuFloatComplex
*
row_group_dev
)
#endif
#endif
{
...
...
@@ -202,16 +202,16 @@ extern "C" void launch_my_pack_c_kernel_complex_single(const int row_count, cons
{
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
my_pack_c_kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#else
my_pack_c_kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
my_pack_c_kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#else
my_pack_c_kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
my_pack_c_
cuda_
kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
a_dev
,
row_group_dev
,
i_off
);
#endif
#endif
}
...
...
@@ -219,22 +219,22 @@ extern "C" void launch_my_pack_c_kernel_complex_single(const int row_count, cons
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"
\n
my pack_kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"
\n
my pack_
cuda_
kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extern
"C"
void
launch_extract_hh_tau_c_kernel_real_double
(
double
*
bcast_buffer_dev
,
double
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_real_double
(
double
*
bcast_buffer_dev
,
double
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#else
extern
"C"
void
launch_extract_hh_tau_c_kernel_real_single
(
float
*
bcast_buffer_dev
,
float
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_real_single
(
float
*
bcast_buffer_dev
,
float
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extern
"C"
void
launch_extract_hh_tau_c_kernel_complex_double
(
cuDoubleComplex
*
bcast_buffer_dev
,
cuDoubleComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_complex_double
(
cuDoubleComplex
*
bcast_buffer_dev
,
cuDoubleComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#else
extern
"C"
void
launch_extract_hh_tau_c_kernel_complex_single
(
cuFloatComplex
*
bcast_buffer_dev
,
cuFloatComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
extern
"C"
void
launch_extract_hh_tau_c_
cuda_
kernel_complex_single
(
cuFloatComplex
*
bcast_buffer_dev
,
cuFloatComplex
*
hh_tau_dev
,
const
int
nbw
,
const
int
n
,
const
int
is_zero
)
#endif
#endif
{
...
...
@@ -243,38 +243,38 @@ extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex *bc
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extract_hh_tau_c_kernel_real_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_real_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#else
extract_hh_tau_c_kernel_real_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_real_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extract_hh_tau_c_kernel_complex_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_complex_double
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#else
extract_hh_tau_c_kernel_complex_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
extract_hh_tau_c_
cuda_
kernel_complex_single
<<<
grid_size
,
MAX_BLOCK_SIZE
>>>
(
bcast_buffer_dev
,
hh_tau_dev
,
nbw
,
n
,
is_zero
);
#endif
#endif
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"
\n
extract _kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"
\n
extract
_cuda
_kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
extern
"C"
void
launch_my_unpack_c_kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
row_group_dev
,
double
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_real_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
double
*
row_group_dev
,
double
*
a_dev
)
#else
extern
"C"
void
launch_my_unpack_c_kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
row_group_dev
,
float
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_real_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
float
*
row_group_dev
,
float
*
a_dev
)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
extern
"C"
void
launch_my_unpack_c_kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
row_group_dev
,
cuDoubleComplex
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_complex_double
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuDoubleComplex
*
row_group_dev
,
cuDoubleComplex
*
a_dev
)
#else
extern
"C"
void
launch_my_unpack_c_kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
row_group_dev
,
cuFloatComplex
*
a_dev
)
extern
"C"
void
launch_my_unpack_c_
cuda_
kernel_complex_single
(
const
int
row_count
,
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
stripe_count
,
const
int
l_nev
,
cuFloatComplex
*
row_group_dev
,
cuFloatComplex
*
a_dev
)
#endif
#endif
{
...
...
@@ -286,16 +286,16 @@ extern "C" void launch_my_unpack_c_kernel_complex_single(const int row_count, co
{
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
my_unpack_c_kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_real_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#else
my_unpack_c_kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_real_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
my_unpack_c_kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_complex_double
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#else
my_unpack_c_kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
my_unpack_c_
cuda_
kernel_complex_single
<<<
grid_size
,
blocksize
>>>
(
n_offset
,
max_idx
,
stripe_width
,
a_dim2
,
l_nev
,
row_group_dev
,
a_dev
,
i_off
);
#endif
#endif
}
...
...
@@ -303,7 +303,7 @@ extern "C" void launch_my_unpack_c_kernel_complex_single(const int row_count, co
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"
\n
my_unpack_c_kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"
\n
my_unpack_c_
cuda_
kernel failed %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
...
...
src/GPU/CUDA/elpa_index_nvidia_gpu.cu
View file @
69cc6fc5
// Copyright 2021, A. Marek MPCDF
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
extern
"C"
{
int
nvidia_gpu_count
()
{
int
count
;
...
...
src/GPU/ROCm/elpa_index_amd_gpu.cpp
View file @
69cc6fc5
// Copyright 2021, A. Marek MPCDF
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
#include
<hip/hip_runtime.h>
extern
"C"
{
int
amd_gpu_count
()
{
int
count
;
...
...
src/GPU/ROCm/hipUtils.cpp
View file @
69cc6fc5
// Copyright 2021, A. Marek MPCDF
//
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
...
...
src/GPU/ROCm/hipUtils_template.cpp
View file @
69cc6fc5
...
...
@@ -51,30 +51,32 @@
// written by A. Marek, MPCDF
#endif
#include
"config-f90.h"
#include
"hip/hip_runtime.h"
//#include <cuda_runtime.h>
#include
<stdlib.h>
#include
<stdio.h>
#if COMPLEXCASE == 1
//
#include <
cuC
omplex.h>
#include
<
hip/hip_c
omplex.h>
#endif
#define MAX_BLOCK_SIZE 1024
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
__global__
void
my_pack_c_kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double
*
src
,
double
*
dst
,
int
i_off
)
__global__
void
my_pack_c_
hip_
kernel_real_double
(
const
int
n_offset
,
const
int
max_idx
,
const
int
stripe_width
,
const
int
a_dim2
,
const
int
l_nev
,
double