Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
9b6dcccf
Commit
9b6dcccf
authored
Mar 03, 2021
by
Andreas Marek
Browse files
Some changes to make compile again with oneAPI compiler
parent
db0c1416
Changes
21
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
configure.ac
View file @
9b6dcccf
...
...
@@ -878,12 +878,14 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
])
m4_define(elpa_m4_gpu_kernels, [
real_gpu
complex_gpu
m4_define(elpa_m4_nvidia_gpu_kernels, [
real_nvidia_gpu
complex_nvidia_gpu
real_amd_gpu
complex_amd_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq
nvidia_
gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
...
...
@@ -927,7 +929,7 @@ ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([
nvidia_
gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
...
...
@@ -976,8 +978,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_gpu=yes
use_complex_gpu=yes
use_real_
nvida_
gpu=yes
use_complex_
nvidia_
gpu=yes
fi
...
...
@@ -1054,7 +1056,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_
nvidia_
gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...
...
@@ -1477,8 +1479,8 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_gpu=yes
use_complex_gpu=yes
use_
nvidia_
real_gpu=yes
use_
nvidia_
complex_gpu=yes
fi
AC_MSG_CHECKING(whether INTEL GPU version should be used)
...
...
@@ -1514,8 +1516,8 @@ if test x"${use_amd_gpu}" = x"yes" ; then
########################################
# must be changed
#######################################
use_real_gpu=no
use_complex_gpu=no
use_real_
amd_
gpu=no
use_complex_
amd_
gpu=no
fi
...
...
@@ -1607,8 +1609,8 @@ AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$u
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED=1
ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED=1
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
...
...
@@ -1633,8 +1635,8 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_LANG_POP([C])
fi
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED=0
ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED=0
fi
...
...
@@ -1671,10 +1673,8 @@ else
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
...
...
@@ -2021,7 +2021,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=
no
need_manual_cpp=
yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
...
...
elpa/elpa_constants.h.in
View file @
9b6dcccf
...
...
@@ -50,7 +50,7 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_
NVIDIA_
GPU, 18, @ELPA_2STAGE_REAL_
NVIDIA_
GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
...
...
@@ -104,7 +104,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU, 22, @ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
...
...
generate_automake_test_programs.py
View file @
9b6dcccf
...
...
@@ -21,8 +21,9 @@ solver_flag = {
"scalapack_part"
:
"-DTEST_SCALAPACK_PART"
,
}
gpu_flag
=
{
0
:
"-DTEST_GPU=0"
,
1
:
"-DTEST_GPU=1"
,
"GPU_OFF"
:
"-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0"
,
"NVIDIA_GPU_ON"
:
"-DTEST_NVIDIA_GPU=1"
,
"INTEL_GPU_ON"
:
"-DTEST_INTEL_GPU=1"
,
}
gpu_id_flag
=
{
0
:
"-DTEST_GPU_SET_ID=0"
,
...
...
@@ -86,14 +87,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
if
(
m
==
"analytic"
and
(
g
==
1
or
t
!=
"eigenvectors"
)):
if
(
m
==
"analytic"
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
t
!=
"eigenvectors"
)):
continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if
(
m
==
"frank"
and
((
t
!=
"eigenvectors"
or
t
!=
"eigenvalues"
)
and
(
d
!=
"real"
or
p
!=
"double"
))):
continue
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
1
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
if
(
s
in
[
"scalapack_all"
,
"scalapack_part"
]
and
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
or
t
!=
"eigenvectors"
or
m
!=
"analytic"
)):
continue
# do not test single-precision scalapack
...
...
@@ -127,7 +128,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue
# qr only for 2stage real
if
(
q
==
1
and
(
s
!=
"2stage"
or
d
!=
"real"
or
t
!=
"eigenvectors"
or
g
==
1
or
m
!=
"random"
)):
if
(
q
==
1
and
(
s
!=
"2stage"
or
d
!=
"real"
or
t
!=
"eigenvectors"
or
g
==
"NVIDIA_GPU_ON"
or
"INTEL_GPU_ON"
or
m
!=
"random"
)):
continue
if
(
spl
==
"myself"
and
(
d
!=
"real"
or
p
!=
"double"
or
q
!=
0
or
m
!=
"random"
or
(
t
!=
"eigenvectors"
and
t
!=
"cholesky"
)
or
lang
!=
"Fortran"
or
lay
!=
"square"
)):
...
...
@@ -154,10 +155,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print
(
"if ENABLE_C_TESTS"
)
endifs
+=
1
if
(
g
==
1
):
if
(
g
==
"NVIDIA_GPU_ON"
):
print
(
"if WITH_NVIDIA_GPU_VERSION"
)
endifs
+=
1
if
(
g
==
"INTEL_GPU_ON"
):
print
(
"if WITH_INTEL_GPU_VERSION"
)
endifs
+=
1
if
(
lay
==
"all_layouts"
):
print
(
"if WITH_MPI"
)
endifs
+=
1
...
...
@@ -190,7 +195,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix
=
language_flag
[
lang
],
d
=
d
,
p
=
p
,
t
=
t
,
s
=
s
,
kernelsuffix
=
""
if
kernel
==
"nokernel"
else
"_"
+
kernel
,
gpusuffix
=
"gpu_"
if
g
else
""
,
gpusuffix
=
"gpu_"
if
(
g
==
"NVIDIA_GPU_ON"
or
g
==
"INTEL_GPU_ON"
)
else
""
,
gpuidsuffix
=
"set_gpu_id_"
if
gid
else
""
,
qrsuffix
=
"qr_"
if
q
else
""
,
m
=
m
,
...
...
src/GPU/check_for_gpu.F90
View file @
9b6dcccf
...
...
@@ -110,6 +110,7 @@ module mod_check_for_gpu
endif
endif
success
=
.true.
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cuda_setdevice
(
use_gpu_id
)
#endif
...
...
@@ -128,7 +129,8 @@ module mod_check_for_gpu
if
(
wantDebugMessage
)
then
print
'(3(a,i0))'
,
'MPI rank '
,
myid
,
' uses GPU #'
,
deviceNumber
endif
success
=
.true.
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cublas_create
(
cublasHandle
)
#endif
...
...
@@ -159,6 +161,7 @@ module mod_check_for_gpu
endif
endif
success
=
.true.
#ifdef WITH_NVIDIA_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success
=
cuda_getdevicecount
(
numberOfDevices
)
...
...
src/GPU/mod_vendor_agnostic_layer.F90
View file @
9b6dcccf
...
...
@@ -2,7 +2,9 @@
module
elpa_gpu
use
precision
use
iso_c_binding
#ifdef WITH_INTEL_GPU_VERSION
use
mkl_offload
#endif
integer
(
kind
=
c_int
),
parameter
::
nvidia_gpu
=
1
integer
(
kind
=
c_int
),
parameter
::
amd_gpu
=
2
integer
(
kind
=
c_int
),
parameter
::
intel_gpu
=
3
...
...
src/elpa1/elpa1_template.F90
View file @
9b6dcccf
...
...
@@ -308,6 +308,12 @@ function elpa_solve_evp_&
print
*
,
"Problem getting option for AMD GPU. Aborting..."
stop
endif
else
if
(
gpu_vendor
()
==
INTEL_GPU
)
then
call
obj
%
get
(
"intel-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu
=
0
endif
...
...
@@ -318,6 +324,7 @@ function elpa_solve_evp_&
useGPU
=
.false.
endif
print
*
,
"after activating gpu..."
call
obj
%
get
(
"is_skewsymmetric"
,
skewsymmetric
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for skewsymmetric. Aborting..."
...
...
@@ -351,6 +358,7 @@ function elpa_solve_evp_&
do_useGPU
=
.false.
print
*
,
"before check gpu..."
if
(
useGPU
)
then
call
obj
%
timer
%
start
(
"check_for_gpu"
)
...
...
@@ -379,6 +387,7 @@ function elpa_solve_evp_&
endif
print
*
,
"after check gpu..."
do_useGPU_tridiag
=
do_useGPU
do_useGPU_solve_tridi
=
do_useGPU
do_useGPU_trans_ev
=
do_useGPU
...
...
@@ -447,7 +456,7 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
call
nvtxRangePush
(
"tridi"
)
#endif
print
*
,
"before tridiag..."
call
tridiag_
&
&
MATH_DATATYPE
&
&
_
&
...
...
src/elpa1/elpa1_tridiag_template.F90
View file @
9b6dcccf
...
...
@@ -501,6 +501,7 @@ subroutine tridiag_&
aux
(
1
:
2
*
n_stored_vecs
)
=
conjg
(
uv_stored_cols
(
l_cols
+1
,
1
:
2
*
n_stored_vecs
))
#endif
if
(
useIntelGPU
)
then
print
*
,
"intel phase aaaaaaaaaaaaaaaaaaaaaaaaaa"
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"mkl_offload"
)
#if REALCASE == 1
aux
(
1
:
2
*
n_stored_vecs
)
=
uv_stored_cols
(
l_cols
+1
,
1
:
2
*
n_stored_vecs
)
...
...
@@ -675,7 +676,8 @@ subroutine tridiag_&
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successGPU, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU)
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU, &
!$omp& matrixCols)
my_thread
=
omp_get_thread_num
()
n_threads
=
omp_get_num_threads
()
...
...
src/elpa2/compute_hh_trafo.F90
View file @
9b6dcccf
...
...
@@ -187,10 +187,10 @@ last_stripe_width, kernel)
if
(
wantDebug
)
then
if
(
useGPU
.and.
&
#if REALCASE == 1
(
kernel
.ne.
ELPA_2STAGE_REAL_GPU
))
then
(
kernel
.ne.
ELPA_2STAGE_REAL_
NVIDIA_
GPU
))
then
#endif
#if COMPLEXCASE == 1
(
kernel
.ne.
ELPA_2STAGE_COMPLEX_GPU
))
then
(
kernel
.ne.
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
))
then
#endif
print
*
,
"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
...
...
@@ -198,10 +198,10 @@ last_stripe_width, kernel)
endif
#if REALCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_REAL_
NVIDIA_
GPU
)
then
#endif
#if COMPLEXCASE == 1
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
)
then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if
(
ncols
<
1
)
then
...
...
@@ -263,11 +263,11 @@ last_stripe_width, kernel)
#if REALCASE == 1
! GPU kernel real
if
(
kernel
.eq.
ELPA_2STAGE_REAL_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_REAL_
NVIDIA_
GPU
)
then
#endif
#if COMPLEXCASE == 1
! GPU kernel complex
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_GPU
)
then
if
(
kernel
.eq.
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
)
then
#endif
if
(
wantDebug
)
then
call
obj
%
timer
%
start
(
"compute_hh_trafo: GPU"
)
...
...
src/elpa2/elpa2_template.F90
View file @
9b6dcccf
...
...
@@ -209,7 +209,7 @@
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GPU_KERNEL ELPA_2STAGE_REAL_
NVIDIA_
GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define KERNEL_STRING "real_kernel"
#endif
...
...
@@ -217,7 +217,7 @@
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
#define KERNEL_STRING "complex_kernel"
#endif
...
...
@@ -379,6 +379,12 @@
print
*
,
"Problem getting option for AMD GPU. Aborting..."
stop
endif
else
if
(
gpu_vendor
()
==
INTEL_GPU
)
then
call
obj
%
get
(
"intel-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu
=
0
endif
...
...
src/elpa2/elpa2_trans_ev_band_to_full_template.F90
View file @
9b6dcccf
...
...
@@ -370,7 +370,7 @@ subroutine trans_ev_band_to_full_&
if
(
i
>
1
)
then
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
call
PRECISION_GEMM
(
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
int
(
l_rows
,
kind
=
BLAS_KIND
),
ONE
,
hvm
,
&
...
...
@@ -383,7 +383,7 @@ subroutine trans_ev_band_to_full_&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
hvm
(:,(
i
-1
)
*
nbw
+1
:),
&
int
(
max_local_rows
,
kind
=
BLAS_KIND
),
ZERO
,
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
call
obj
%
timer
%
start
(
"blas"
)
...
...
@@ -402,7 +402,7 @@ subroutine trans_ev_band_to_full_&
call
obj
%
timer
%
stop
(
"mpi_communication"
)
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
...
...
@@ -418,7 +418,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp2
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
call
obj
%
timer
%
start
(
"blas"
)
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
...
...
@@ -432,7 +432,7 @@ subroutine trans_ev_band_to_full_&
#else /* WITH_MPI */
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
call
PRECISION_TRMM
(
'L'
,
'U'
,
'N'
,
'N'
,
int
(
t_rows
,
kind
=
BLAS_KIND
),
int
(
t_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
...
...
@@ -448,7 +448,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete
(
t_rows
+1
,
t_rows
+1
),
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
t_tmp
,
int
(
cwy_blocking
,
kind
=
BLAS_KIND
))
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
call
obj
%
timer
%
start
(
"blas"
)
...
...
@@ -471,7 +471,7 @@ subroutine trans_ev_band_to_full_&
if
(
l_rows
>
0
)
then
if
(
useGPU
)
then
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
#if 0
call
PRECISION_GEMM
(
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
n_cols
,
kind
=
BLAS_KIND
),
int
(
l_cols
,
kind
=
BLAS_KIND
),
int
(
l_rows
,
kind
=
BLAS_KIND
),
ONE
,
&
...
...
@@ -484,7 +484,7 @@ subroutine trans_ev_band_to_full_&
hvm
,
int
(
ubound
(
hvm
,
dim
=
1
),
kind
=
BLAS_KIND
),
q_mat
,
int
(
ldq
,
kind
=
BLAS_KIND
),
ZERO
,
tmp1
,
&
int
(
n_cols
,
kind
=
BLAS_KIND
))
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
successGPU
=
gpu_memcpy
(
hvm_dev
,
int
(
loc
(
hvm
),
kind
=
c_intptr_t
),
&
...
...
@@ -525,7 +525,7 @@ subroutine trans_ev_band_to_full_&
if
(
l_rows
>
0
)
then
if
(
useGPU
)
then
if
(
useIntelGPU
)
then
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
call
PRECISION_TRMM
(
'L'
,
'U'
,
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
n_cols
,
kind
=
BLAS_KIND
),
int
(
l_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
tmp2
,
int
(
n_cols
,
kind
=
BLAS_KIND
))
...
...
@@ -533,7 +533,7 @@ subroutine trans_ev_band_to_full_&
int
(
n_cols
,
kind
=
BLAS_KIND
),
-
ONE
,
hvm
,
&
int
(
ubound
(
hvm
,
dim
=
1
),
kind
=
BLAS_KIND
),
tmp2
,
int
(
n_cols
,
kind
=
BLAS_KIND
),
ONE
,
&
q_mat
,
int
(
ldq
,
kind
=
BLAS_KIND
))
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
successGPU
=
gpu_memcpy
(
tmp_dev
,
int
(
loc
(
tmp2
),
kind
=
c_intptr_t
),
&
...
...
@@ -569,7 +569,7 @@ subroutine trans_ev_band_to_full_&
if
(
useGPU
)
then
if
(
useIntelGPU
)
then
#if 0
call
obj
%
timer
%
start
(
"mkl_offload"
)
!
call obj%timer%start("mkl_offload")
call
PRECISION_TRMM
(
'L'
,
'U'
,
BLAS_TRANS_OR_CONJ
,
'N'
,
&
int
(
n_cols
,
kind
=
BLAS_KIND
),
int
(
l_cols
,
kind
=
BLAS_KIND
),
ONE
,
tmat_complete
,
&
int
(
cwy_blocking
,
kind
=
BLAS_KIND
),
&
...
...
@@ -587,7 +587,7 @@ subroutine trans_ev_band_to_full_&
-
ONE
,
hvm
,
int
(
ubound
(
hvm
,
dim
=
1
),
kind
=
BLAS_KIND
),
tmp1
,
int
(
n_cols
,
kind
=
BLAS_KIND
),
ONE
,
q_mat
,
&
int
(
ldq
,
kind
=
BLAS_KIND
))
#endif
call
obj
%
timer
%
stop
(
"mkl_offload"
)
!
call obj%timer%stop("mkl_offload")
else
successGPU
=
gpu_memcpy
(
tmat_dev
,
int
(
loc
(
tmat_complete
),
kind
=
c_intptr_t
),
&
cwy_blocking
*
cwy_blocking
*
size_of_datatype
,
gpuMemcpyHostToDevice
)
...
...
src/elpa_index.c
View file @
9b6dcccf
...
...
@@ -767,8 +767,14 @@ static const char *real_kernel_name(int kernel) {
}
}
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
#define REAL_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_NVIDIA_GPU ? gpu_is_active : 1
#define REAL_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_AMD_GPU ? gpu_is_active : 1
#define REAL_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_INTEL_GPU ? gpu_is_active : 1
static
int
real_kernel_is_valid
(
elpa_index_t
index
,
int
n
,
int
new_value
)
{
int
solver
=
elpa_index_get_int_value
(
index
,
"solver"
,
NULL
);
...
...
@@ -777,7 +783,9 @@ static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
}
int
gpu_is_active
=
(
elpa_index_get_int_value
(
index
,
"nvidia-gpu"
,
NULL
)
||
elpa_index_get_int_value
(
index
,
"amd-gpu"
,
NULL
)
||
elpa_index_get_int_value
(
index
,
"intel-gpu"
,
NULL
));
switch
(
new_value
)
{
ELPA_FOR_ALL_2STAGE_REAL_KERNELS
(
VALID_CASE_3
,
REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE
)
ELPA_FOR_ALL_2STAGE_REAL_KERNELS
(
VALID_CASE_3
,
REAL_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE
)
//ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
//ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
default:
return
0
;
}
...
...
@@ -806,8 +814,14 @@ static const char *complex_kernel_name(int kernel) {
}
}
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
#define COMPLEX_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_NVIDIA_GPU ? gpu_is_active : 1
#define COMPLEX_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_AMD_GPU ? gpu_is_active : 1
#define COMPLEX_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_INTEL_GPU ? gpu_is_active : 1
static
int
complex_kernel_is_valid
(
elpa_index_t
index
,
int
n
,
int
new_value
)
{
int
solver
=
elpa_index_get_int_value
(
index
,
"solver"
,
NULL
);
...
...
@@ -816,7 +830,9 @@ static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
}
int
gpu_is_active
=
(
elpa_index_get_int_value
(
index
,
"nvidia-gpu"
,
NULL
)
||
elpa_index_get_int_value
(
index
,
"amd-gpu"
,
NULL
)
||
elpa_index_get_int_value
(
index
,
"intel-gpu"
,
NULL
));
switch
(
new_value
)
{
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS
(
VALID_CASE_3
,
COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE
)
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS
(
VALID_CASE_3
,
COMPLEX_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE
)
//ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
// ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
default:
return
0
;
}
...
...
src/general/precision_macros.h
View file @
9b6dcccf
...
...
@@ -103,6 +103,9 @@
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef cublas_PRECISION_SYMV
#undef mkl_offload_PRECISION_GEMM
#undef mkl_offload_PRECISION_GEMV
#undef mkl_offload_PRECISION_TRMM
#undef scal_PRECISION_GEMM
#undef scal_PRECISION_NRM2
#undef scal_PRECISION_LASET
...
...
@@ -177,6 +180,9 @@
#define cublas_PRECISION_TRMM cublas_DTRMM
#define cublas_PRECISION_GEMV cublas_DGEMV
#define cublas_PRECISION_SYMV cublas_DSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_DGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_DGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_DTRMM
#define scal_PRECISION_GEMM PDGEMM
#define scal_PRECISION_NRM2 PDNRM2
#define scal_PRECISION_LASET PDLASET
...
...
@@ -248,6 +254,9 @@
#define cublas_PRECISION_TRMM cublas_STRMM
#define cublas_PRECISION_GEMV cublas_SGEMV
#define cublas_PRECISION_SYMV cublas_SSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_SGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_SGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_STRMM
#define scal_PRECISION_GEMM PSGEMM
#define scal_PRECISION_NRM2 PSNRM2
#define scal_PRECISION_LASET PSLASET
...
...
@@ -332,6 +341,9 @@
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef cublas_PRECISION_SYMV
#undef mkl_offload_PRECISION_GEMM
#undef mkl_offload_PRECISION_GEMV
#undef mkl_offload_PRECISION_TRMM
#undef scal_PRECISION_GEMM
#undef scal_PRECISION_DOTC
#undef scal_PRECISION_LASET
...
...
@@ -416,6 +428,9 @@
#define cublas_PRECISION_TRMM cublas_ZTRMM
#define cublas_PRECISION_GEMV cublas_ZGEMV
#define cublas_PRECISION_SYMV cublas_ZSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_ZGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_ZGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_ZTRMM
#define scal_PRECISION_GEMM PZGEMM
#define scal_PRECISION_DOTC PZDOTC
#define scal_PRECISION_LASET PZLASET
...
...
@@ -491,6 +506,9 @@
#define cublas_PRECISION_TRMM cublas_CTRMM
#define cublas_PRECISION_GEMV cublas_CGEMV
#define cublas_PRECISION_SYMV cublas_CSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_CGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_CGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_CTRMM
#define scal_PRECISION_GEMM PCGEMM
#define scal_PRECISION_DOTC PCDOTC
#define scal_PRECISION_LASET PCLASET
...
...
src/helpers/mod_simd_kernel.F90
View file @
9b6dcccf
...
...
@@ -90,7 +90,7 @@ module simd_kernel
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_SVE512_BLOCK2
)
=
SVE512_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_SVE512_BLOCK4
)
=
SVE512_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_SVE512_BLOCK6
)
=
SVE512_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_
GPU
)
=
NVIDIA_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_
NVIDIA_GPU
)
=
NVIDIA_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_SPARC64_BLOCK2
)
=
SPARC_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_SPARC64_BLOCK4
)
=
SPARC_INSTR
realKernels_to_simdTable
(
ELPA_2STAGE_REAL_SPARC64_BLOCK6
)
=
SPARC_INSTR
...
...
@@ -123,7 +123,7 @@ module simd_kernel
simdTable_to_realKernels
(
AVX_INSTR
)
=
ELPA_2STAGE_REAL_AVX_BLOCK2
simdTable_to_realKernels
(
AVX2_INSTR
)
=
ELPA_2STAGE_REAL_AVX2_BLOCK2
simdTable_to_realKernels
(
AVX512_INSTR
)
=
ELPA_2STAGE_REAL_AVX512_BLOCK2
simdTable_to_realKernels
(
NVIDIA_INSTR
)
=
ELPA_2STAGE_REAL_GPU
simdTable_to_realKernels
(
NVIDIA_INSTR
)
=
ELPA_2STAGE_REAL_
NVIDIA_
GPU
simdTable_to_realKernels
(
SPARC_INSTR
)
=
ELPA_2STAGE_REAL_SPARC64_BLOCK2
simdTable_to_realKernels
(
ARCH64_INSTR
)
=
ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
simdTable_to_realKernels
(
VSX_INSTR
)
=
ELPA_2STAGE_REAL_VSX_BLOCK2
...
...
@@ -163,7 +163,7 @@ module simd_kernel
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_SVE512_BLOCK2
)
=
SVE512_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
)
=
ARCH64_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2
)
=
ARCH64_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_
GPU
)
=
NVIDIA_INSTR
complexKernels_to_simdTable
(
ELPA_2STAGE_COMPLEX_
NVIDIA_GPU
)
=
NVIDIA_INSTR
simd_set_index
=
complexKernels_to_simdTable
(
kernel
)
...
...
@@ -187,7 +187,7 @@ module simd_kernel
simdTable_to_complexKernels
(
SVE256_INSTR
)
=
ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
simdTable_to_complexKernels
(
SVE512_INSTR
)
=
ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
simdTable_to_complexKernels
(
ARCH64_INSTR
)
=
ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
simdTable_to_complexKernels
(
NVIDIA_INSTR
)
=
ELPA_2STAGE_COMPLEX_GPU
simdTable_to_complexKernels
(
NVIDIA_INSTR
)
=
ELPA_2STAGE_COMPLEX_
NVIDIA_
GPU
kernel
=
simdTable_to_complexKernels
(
simd_set_index
)
...
...
test/C/test.c
View file @
9b6dcccf
...
...
@@ -273,8 +273,21 @@ int main(int argc, char** argv) {