Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
9b6dcccf
Commit
9b6dcccf
authored
Mar 03, 2021
by
Andreas Marek
Browse files
Some changes to make compile again with oneAPI compiler
parent
db0c1416
Changes
21
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
configure.ac
View file @
9b6dcccf
...
@@ -878,12 +878,14 @@ m4_define(elpa_m4_bgq_kernels, [
...
@@ -878,12 +878,14 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
complex_bgq
])
])
m4_define(elpa_m4_gpu_kernels, [
m4_define(elpa_m4_nvidia_gpu_kernels, [
real_gpu
real_nvidia_gpu
complex_gpu
complex_nvidia_gpu
real_amd_gpu
complex_amd_gpu
])
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq
nvidia_
gpu])
m4_define(elpa_m4_all_kernels,
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
m4_foreach_w([elpa_m4_type],
...
@@ -927,7 +929,7 @@ ELPA_SELECT_KERNELS([avx512],[enable])
...
@@ -927,7 +929,7 @@ ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([
nvidia_
gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
...
@@ -976,8 +978,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then
...
@@ -976,8 +978,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
use_[]elpa_m4_kernel[]=no
])
])
use_real_gpu=yes
use_real_
nvidia_
gpu=yes
use_complex_gpu=yes
use_complex_
nvidia_
gpu=yes
fi
fi
...
@@ -1054,7 +1056,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
...
@@ -1054,7 +1056,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_
nvidia_
gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
@@ -1477,8 +1479,8 @@ AC_ARG_ENABLE([Nvidia-gpu],
...
@@ -1477,8 +1479,8 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}])
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
need_nvidia_gpu=yes
use_real_gpu=yes
use_real_
nvidia_
gpu=yes
use_complex_gpu=yes
use_complex_
nvidia_
gpu=yes
fi
fi
AC_MSG_CHECKING(whether INTEL GPU version should be used)
AC_MSG_CHECKING(whether INTEL GPU version should be used)
...
@@ -1514,8 +1516,8 @@ if test x"${use_amd_gpu}" = x"yes" ; then
...
@@ -1514,8 +1516,8 @@ if test x"${use_amd_gpu}" = x"yes" ; then
########################################
########################################
# must be changed
# must be changed
#######################################
#######################################
use_real_gpu=no
use_real_
amd_
gpu=no
use_complex_gpu=no
use_complex_
amd_
gpu=no
fi
fi
...
@@ -1607,8 +1609,8 @@ AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$u
...
@@ -1607,8 +1609,8 @@ AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$u
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED=1
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
AC_ARG_ENABLE([nvtx],
...
@@ -1633,8 +1635,8 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
...
@@ -1633,8 +1635,8 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_LANG_POP([C])
AC_LANG_POP([C])
fi
fi
else
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED=0
fi
fi
...
@@ -1671,10 +1673,8 @@ else
...
@@ -1671,10 +1673,8 @@ else
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
fi
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
...
@@ -2021,7 +2021,7 @@ AC_CONFIG_FILES([
...
@@ -2021,7 +2021,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=
no
need_manual_cpp=
yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
ACTUAL_FC="$FC"
...
...
elpa/elpa_constants.h.in
View file @
9b6dcccf
...
@@ -50,7 +50,7 @@ enum ELPA_SOLVERS {
...
@@ -50,7 +50,7 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_
NVIDIA_
GPU, 18, @ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
...
@@ -104,7 +104,7 @@ enum ELPA_REAL_KERNELS {
...
@@ -104,7 +104,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU, 22, @ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
generate_automake_test_programs.py
View file @
9b6dcccf
...
@@ -21,8 +21,9 @@ solver_flag = {
...
@@ -21,8 +21,9 @@ solver_flag = {
"scalapack_part"
:
"-DTEST_SCALAPACK_PART"
,
"scalapack_part"
:
"-DTEST_SCALAPACK_PART"
,
}
}
gpu_flag
=
{
gpu_flag
=
{
0
:
"-DTEST_GPU=0"
,
"GPU_OFF"
:
"-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0"
,
1
:
"-DTEST_GPU=1"
,
"NVIDIA_GPU_ON"
:
"-DTEST_NVIDIA_GPU=1"
,
"INTEL_GPU_ON"
:
"-DTEST_INTEL_GPU=1"
,
}
}
gpu_id_flag
=
{
gpu_id_flag
=
{
0
:
"-DTEST_GPU_SET_ID=0"
,
0
:
"-DTEST_GPU_SET_ID=0"
,
...
@@ -86,14 +87,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
...
@@ -86,14 +87,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
# analytic tests only for "eigenvectors" and not on GPU
if
(
m
==
"analytic"
and
(
g
==
1
or
t
!=
"eigenvectors"
)):
if
(
m
==
"analytic"
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
t
!=
"eigenvectors"
)):
continue
continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if
(
m
==
"frank"
and
((
t
!=
"eigenvectors"
or
t
!=
"eigenvalues"
)
and
(
d
!=
"real"
or
p
!=
"double"
))):
if
(
m
==
"frank"
and
((
t
!=
"eigenvectors"
or
t
!=
"eigenvalues"
)
and
(
d
!=
"real"
or
p
!=
"double"
))):
continue
continue
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
1
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
continue
continue
# do not test single-precision scalapack
# do not test single-precision scalapack
...
@@ -127,7 +128,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
...
@@ -127,7 +128,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue
continue
# qr only for 2stage real
# qr only for 2stage real
if
(
q
==
1
and
(
s
!=
"2stage"
or
d
!=
"real"
or
t
!=
"eigenvectors"
or
g
==
1
or
m
!=
"random"
)):
if
(
q
==
1
and
(
s
!=
"2stage"
or
d
!=
"real"
or
t
!=
"eigenvectors"
or
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
m
!=
"random"
)):
continue
continue
if
(
spl
==
"myself"
and
(
d
!=
"real"
or
p
!=
"double"
or
q
!=
0
or
m
!=
"random"
or
(
t
!=
"eigenvectors"
and
t
!=
"cholesky"
)
or
lang
!=
"Fortran"
or
lay
!=
"square"
)):
if
(
spl
==
"myself"
and
(
d
!=
"real"
or
p
!=
"double"
or
q
!=
0
or
m
!=
"random"
or
(
t
!=
"eigenvectors"
and
t
!=
"cholesky"
)
or
lang
!=
"Fortran"
or
lay
!=
"square"
)):
...
@@ -154,10 +155,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
...
@@ -154,10 +155,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print
(
"if ENABLE_C_TESTS"
)
print
(
"if ENABLE_C_TESTS"
)
endifs
+=
1
endifs
+=
1
if
(
g
==
1
):
if
(
g
==
"NVIDIA_GPU_ON"
):
print
(
"if WITH_NVIDIA_GPU_VERSION"
)
print
(
"if WITH_NVIDIA_GPU_VERSION"
)
endifs
+=
1
endifs
+=
1
if
(
g
==
"INTEL_GPU_ON"
):
print
(
"if WITH_INTEL_GPU_VERSION"
)
endifs
+=
1
if
(
lay
==
"all_layouts"
):
if
(
lay
==
"all_layouts"
):
print
(
"if WITH_MPI"
)
print
(
"if WITH_MPI"
)
endifs
+=
1
endifs
+=
1
...
@@ -190,7 +195,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
...
@@ -190,7 +195,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix
=
language_flag
[
lang
],
langsuffix
=
language_flag
[
lang
],
d
=
d
,
p
=
p
,
t
=
t
,
s
=
s
,
d
=
d
,
p
=
p
,
t
=
t
,
s
=
s
,
kernelsuffix
=
""
if
kernel
==
"nokernel"
else
"_"
+
kernel
,
kernelsuffix
=
""
if
kernel
==
"nokernel"
else
"_"
+
kernel
,
gpusuffix
=
"gpu_"
if
g
else
""
,
gpusuffix
=
"gpu_"
if
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
)
else
""
,
gpuidsuffix
=
"set_gpu_id_"
if
gid
else
""
,
gpuidsuffix
=
"set_gpu_id_"
if
gid
else
""
,
qrsuffix
=
"qr_"
if
q
else
""
,
qrsuffix
=
"qr_"
if
q
else
""
,
m
=
m
,
m
=
m
,
...
...
src/GPU/check_for_gpu.F90
View file @
9b6dcccf
...
@@ -110,6 +110,7 @@ module mod_check_for_gpu
...
@@ -110,6 +110,7 @@ module mod_check_for_gpu
endif
endif
endif
endif
success
=
.true.
#ifdef WITH_NVIDIA_GPU_VERSION
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cuda_setdevice
(
use_gpu_id
)
success
=
cuda_setdevice
(
use_gpu_id
)
#endif
#endif
...
@@ -128,7 +129,8 @@ module mod_check_for_gpu
...
@@ -128,7 +129,8 @@ module mod_check_for_gpu
if
(
wantDebugMessage
)
then
if
(
wantDebugMessage
)
then
print
'(3(a,i0))'
,
'MPI rank '
,
myid
,
' uses GPU #'
,
deviceNumber
print
'(3(a,i0))'
,
'MPI rank '
,
myid
,
' uses GPU #'
,
deviceNumber
endif
endif
success
=
.true.
#ifdef WITH_NVIDIA_GPU_VERSION
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cublas_create
(
cublasHandle
)
success
=
cublas_create
(
cublasHandle
)
#endif
#endif
...
@@ -159,6 +161,7 @@ module mod_check_for_gpu
...
@@ -159,6 +161,7 @@ module mod_check_for_gpu
endif
endif
endif
endif
success
=
.true.
#ifdef WITH_NVIDIA_GPU_VERSION
#ifdef WITH_NVIDIA_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success
=
cuda_getdevicecount
(
numberOfDevices
)
success
=
cuda_getdevicecount
(
numberOfDevices
)
...
...
src/GPU/mod_vendor_agnostic_layer.F90
View file @
9b6dcccf
...
@@ -2,7 +2,9 @@
...
@@ -2,7 +2,9 @@
module
elpa_gpu
module
elpa_gpu
use
precision
use
precision
use
iso_c_binding
use
iso_c_binding
#ifdef WITH_INTEL_GPU_VERSION
use
mkl_offload
#endif
integer
(
kind
=
c_int
),
parameter
::
nvidia_gpu
=
1
integer
(
kind
=
c_int
),
parameter
::
nvidia_gpu
=
1
integer
(
kind
=
c_int
),
parameter
::
amd_gpu
=
2
integer
(
kind
=
c_int
),
parameter
::
amd_gpu
=
2
integer
(
kind
=
c_int
),
parameter
::
intel_gpu
=
3
integer
(
kind
=
c_int
),
parameter
::
intel_gpu
=
3
...
...
src/elpa1/elpa1_template.F90
View file @
9b6dcccf
...
@@ -308,6 +308,12 @@ function elpa_solve_evp_&
...
@@ -308,6 +308,12 @@ function elpa_solve_evp_&
print
*
,
"Problem getting option for AMD GPU. Aborting..."
print
*
,
"Problem getting option for AMD GPU. Aborting..."
stop
stop
endif
endif
else
if
(
gpu_vendor
()
==
INTEL_GPU
)
then
call
obj
%
get
(
"intel-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
else
gpu
=
0
gpu
=
0
endif
endif
...
@@ -318,6 +324,7 @@ function elpa_solve_evp_&
...
@@ -318,6 +324,7 @@ function elpa_solve_evp_&
useGPU
=
.false.
useGPU
=
.false.
endif
endif
print
*
,
"after activating gpu..."
call
obj
%
get
(
"is_skewsymmetric"
,
skewsymmetric
,
error
)
call
obj
%
get
(
"is_skewsymmetric"
,
skewsymmetric
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for skewsymmetric. Aborting..."
print
*
,
"Problem getting option for skewsymmetric. Aborting..."
...
@@ -351,6 +358,7 @@ function elpa_solve_evp_&
...
@@ -351,6 +358,7 @@ function elpa_solve_evp_&
do_useGPU
=
.false.
do_useGPU
=
.false.
print
*
,
"before check gpu..."
if
(
useGPU
)
then
if
(
useGPU
)
then
call
obj
%
timer
%
start
(
"check_for_gpu"
)
call
obj
%
timer
%
start
(
"check_for_gpu"
)
...
@@ -379,6 +387,7 @@ function elpa_solve_evp_&
...
@@ -379,6 +387,7 @@ function elpa_solve_evp_&
endif
endif
print
*
,
"after check gpu..."
do_useGPU_tridiag
=
do_useGPU
do_useGPU_tridiag
=
do_useGPU
do_useGPU_solve_tridi
=
do_useGPU
do_useGPU_solve_tridi
=
do_useGPU
do_useGPU_trans_ev
=
do_useGPU
do_useGPU_trans_ev
=
do_useGPU
...
@@ -447,7 +456,7 @@ function elpa_solve_evp_&
...
@@ -447,7 +456,7 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
#ifdef WITH_NVTX
call
nvtxRangePush
(
"tridi"
)
call
nvtxRangePush
(
"tridi"
)
#endif
#endif
print
*
,
"before tridiag..."
call
tridiag_
&
call
tridiag_
&
&
MATH_DATATYPE
&
&
MATH_DATATYPE
&
&
_
&
&
_
&
...
...
src/elpa1/elpa1_tridiag_template.F90
View file @
9b6dcccf
...
@@ -501,6 +501,7 @@ subroutine tridiag_&
...
@@ -501,6 +501,7 @@ subroutine tridiag_&
aux
(
1
:
2
*
n_stored_vecs
)
=
conjg
(
uv_stored_cols
(
l_cols
+1
,
1
:
2
*
n_stored_vecs
))
aux
(
1
:
2
*
n_stored_vecs
)
=
conjg
(
uv_stored_cols
(
l_cols
+1
,
1
:
2
*
n_stored_vecs
))
#endif
#endif
if
(
useIntelGPU
)
then
if
(
useIntelGPU
)
then
print
*
,
"intel phase aaaaaaaaaaaaaaaaaaaaaaaaaa"
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"mkl_offload"
)
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"mkl_offload"
)
#if REALCASE == 1
#if REALCASE == 1
aux
(
1
:
2
*
n_stored_vecs
)
=
uv_stored_cols
(
l_cols
+1
,
1
:
2
*
n_stored_vecs
)
aux
(
1
:
2
*
n_stored_vecs
)
=
uv_stored_cols
(
l_cols
+1
,
1
:
2
*
n_stored_vecs
)
...
@@ -675,7 +676,8 @@ subroutine tridiag_&
...
@@ -675,7 +676,8 @@ subroutine tridiag_&
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successGPU, u_row, u_row_dev, &
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successGPU, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU)
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU, &
!$omp& matrixCols)
my_thread
=
omp_get_thread_num
()
my_thread
=
omp_get_thread_num
()
n_threads
=
omp_get_num_threads
()
n_threads
=
omp_get_num_threads
()
...
...
src/elpa2/compute_hh_trafo.F90
View file @
9b6dcccf
...
@@ -187,10 +187,10 @@ last_stripe_width, kernel)
...
@@ -187,10 +187,10 @@ last_stripe_width, kernel)
if
(
wantDebug
)
then
if
(
wantDebug
)
then
if
(
useGPU
.and.
&
if
(
useGPU
.and.
&
#if REALCASE == 1
#if REALCASE == 1
(
kernel
.ne.
ELPA_2STAGE_REAL_GPU
))
then
(
kernel
.ne.
ELPA_2STAGE_REAL_
NVIDIA_
GPU
))
then
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
(
kernel
.ne.
ELPA_2STAGE_COMPLEX_GPU
))
then
(
kernel
.ne.
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
))
then
#endif
#endif
print
*
,
"ERROR: useGPU is set in compute_hh_trafo but not GPU kernel!"
print
*
,
"ERROR: useGPU is set in compute_hh_trafo but not GPU kernel!"
stop
stop
...
@@ -198,10 +198,10 @@ last_stripe_width, kernel)
...
@@ -198,10 +198,10 @@ last_stripe_width, kernel)
endif
endif
#if REALCASE == 1
#if REALCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_REAL_
NVIDIA_
GPU
)
then
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
)
then
#endif
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if
(
ncols
<
1
)
then
if
(
ncols
<
1
)
then
...
@@ -263,11 +263,11 @@ last_stripe_width, kernel)
...
@@ -263,11 +263,11 @@ last_stripe_width, kernel)
#if REALCASE == 1
#if REALCASE == 1
! GPU kernel real
! GPU kernel real
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_REAL_
NVIDIA_
GPU
)
then
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
! GPU kernel complex
! GPU kernel complex
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
)
then
#endif
#endif
if
(
wantDebug
)
then
if
(
wantDebug
)
then
call
obj
%
timer
%
start
(
"compute_hh_trafo: GPU"
)
call
obj
%
timer
%
start
(
"compute_hh_trafo: GPU"
)
...
...
src/elpa2/elpa2_template.F90
View file @
9b6dcccf
...
@@ -209,7 +209,7 @@
...
@@ -209,7 +209,7 @@
#undef GPU_KERNEL
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GPU_KERNEL ELPA_2STAGE_REAL_
NVIDIA_
GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define KERNEL_STRING "real_kernel"
#define KERNEL_STRING "real_kernel"
#endif
#endif
...
@@ -217,7 +217,7 @@
...
@@ -217,7 +217,7 @@
#undef GPU_KERNEL
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
#define KERNEL_STRING "complex_kernel"
#define KERNEL_STRING "complex_kernel"
#endif
#endif
...
@@ -379,6 +379,12 @@
...
@@ -379,6 +379,12 @@
print
*
,
"Problem getting option for AMD GPU. Aborting..."
print
*
,
"Problem getting option for AMD GPU. Aborting..."
stop
stop
endif
endif
else
if
(
gpu_vendor
()
==
INTEL_GPU
)
then
call
obj
%
get
(
"intel-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
else
gpu
=
0
gpu
=
0
endif
endif
...
...
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
View file @
9b6dcccf
...
@@ -370,7 +370,7 @@ subroutine trans_ev_band_to_full_&
...
@@ -370,7 +370,7 @@ subroutine trans_ev_band_to_full_&
if
(
i
>
1
)
then
if
(
i
>
1
)
then
if
(
useIntelGPU
)
then
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
#if 0
call
PRECISION_GEMM
(
BLAS_TRANS_OR_CONJ
,
'N'
,
&
call
PRECISION_GEMM
(
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
int
(
l_rows
,
kind
=
BLAS_KIND
),
ONE
,
hvm
,
&
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
int
(
l_rows
,
kind
=
BLAS_KIND
),
ONE
,
hvm
,
&
...
@@ -383,7 +383,7 @@ subroutine trans_ev_band_to_full_&
...
@@ -383,7 +383,7 @@ subroutine trans_ev_band_to_full_&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
hvm
(:,(
i
-1
)
*
nbw
+1
:),
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
hvm
(:,(
i
-1
)
*
nbw
+1
:),
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
ZERO
,
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
ZERO
,
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
#endif
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
else
call
obj
%
timer
%
start
(
"blas"
)
call
obj
%
timer
%
start
(
"blas"
)
...
@@ -402,7 +402,7 @@ subroutine trans_ev_band_to_full_&
...
@@ -402,7 +402,7 @@ subroutine trans_ev_band_to_full_&
call
obj
%
timer
%
stop
(
"mpi_communication"
)
call
obj
%
timer
%
stop
(
"mpi_communication"
)
if
(
useIntelGPU
)
then
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
#if 0
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
...
@@ -418,7 +418,7 @@ subroutine trans_ev_band_to_full_&
...
@@ -418,7 +418,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
#endif
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
else
call
obj
%
timer
%
start
(
"blas"
)
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
...
@@ -432,7 +432,7 @@ subroutine trans_ev_band_to_full_&
...
@@ -432,7 +432,7 @@ subroutine trans_ev_band_to_full_&
#else /* WITH_MPI */
#else /* WITH_MPI */
if
(
useIntelGPU
)
then
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
#if 0
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
...
@@ -448,7 +448,7 @@ subroutine trans_ev_band_to_full_&
...
@@ -448,7 +448,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
#endif
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")