Commit 9b6dcccf authored by Andreas Marek
Browse files

Some changes to make the code compile again with the oneAPI compiler

parent db0c1416
...@@ -878,12 +878,14 @@ m4_define(elpa_m4_bgq_kernels, [ ...@@ -878,12 +878,14 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq complex_bgq
]) ])
m4_define(elpa_m4_gpu_kernels, [ m4_define(elpa_m4_nvidia_gpu_kernels, [
real_gpu real_nvidia_gpu
complex_gpu complex_nvidia_gpu
real_amd_gpu
complex_amd_gpu
]) ])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu]) m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu])
m4_define(elpa_m4_all_kernels, m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type], m4_foreach_w([elpa_m4_type],
...@@ -927,7 +929,7 @@ ELPA_SELECT_KERNELS([avx512],[enable]) ...@@ -927,7 +929,7 @@ ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable]) ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable]) ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable]) ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable]) ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable]) ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable]) ELPA_SELECT_KERNELS([bgq],[disable])
...@@ -976,8 +978,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then ...@@ -976,8 +978,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[ m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no use_[]elpa_m4_kernel[]=no
]) ])
use_real_gpu=yes use_real_nvidia_gpu=yes
use_complex_gpu=yes use_complex_nvidia_gpu=yes
fi fi
...@@ -1054,7 +1056,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -1054,7 +1056,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel], m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel], m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels, elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ), [m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[ [
if test -z "$default_[]elpa_m4_kind[]_kernel"; then if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...@@ -1477,8 +1479,8 @@ AC_ARG_ENABLE([Nvidia-gpu], ...@@ -1477,8 +1479,8 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}]) AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes need_nvidia_gpu=yes
use_real_gpu=yes use_real_nvidia_gpu=yes
use_complex_gpu=yes use_complex_nvidia_gpu=yes
fi fi
AC_MSG_CHECKING(whether INTEL GPU version should be used) AC_MSG_CHECKING(whether INTEL GPU version should be used)
...@@ -1514,8 +1516,8 @@ if test x"${use_amd_gpu}" = x"yes" ; then ...@@ -1514,8 +1516,8 @@ if test x"${use_amd_gpu}" = x"yes" ; then
######################################## ########################################
# must be changed # must be changed
####################################### #######################################
use_real_gpu=no use_real_amd_gpu=no
use_complex_gpu=no use_complex_amd_gpu=no
fi fi
...@@ -1607,8 +1609,8 @@ AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$u ...@@ -1607,8 +1609,8 @@ AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$u
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support]) AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build]) AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1 ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1 ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=1
AC_MSG_CHECKING(whether --enable-nvtx is specified) AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx], AC_ARG_ENABLE([nvtx],
...@@ -1633,8 +1635,8 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then ...@@ -1633,8 +1635,8 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_LANG_POP([C]) AC_LANG_POP([C])
fi fi
else else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0 ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0 ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=0
fi fi
...@@ -1671,10 +1673,8 @@ else ...@@ -1671,10 +1673,8 @@ else
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0 ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0 ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
fi fi
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"]) AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
...@@ -2021,7 +2021,7 @@ AC_CONFIG_FILES([ ...@@ -2021,7 +2021,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4]) m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed]) AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=no need_manual_cpp=yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[]) AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[]) AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC" ACTUAL_FC="$FC"
......
...@@ -50,7 +50,7 @@ enum ELPA_SOLVERS { ...@@ -50,7 +50,7 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
...@@ -104,7 +104,7 @@ enum ELPA_REAL_KERNELS { ...@@ -104,7 +104,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__) X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
...@@ -21,8 +21,9 @@ solver_flag = { ...@@ -21,8 +21,9 @@ solver_flag = {
"scalapack_part": "-DTEST_SCALAPACK_PART", "scalapack_part": "-DTEST_SCALAPACK_PART",
} }
gpu_flag = { gpu_flag = {
0: "-DTEST_GPU=0", "GPU_OFF": "-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0",
1: "-DTEST_GPU=1", "NVIDIA_GPU_ON": "-DTEST_NVIDIA_GPU=1",
"INTEL_GPU_ON": "-DTEST_INTEL_GPU=1",
} }
gpu_id_flag = { gpu_id_flag = {
0: "-DTEST_GPU_SET_ID=0", 0: "-DTEST_GPU_SET_ID=0",
...@@ -86,14 +87,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key ...@@ -86,14 +87,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations # exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU # analytic tests only for "eigenvectors" and not on GPU
if(m == "analytic" and (g == 1 or t != "eigenvectors")): if(m == "analytic" and ( g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or t != "eigenvectors")):
continue continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case # Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if(m == "frank" and ((t != "eigenvectors" or t != "eigenvalues") and (d != "real" or p != "double"))): if(m == "frank" and ((t != "eigenvectors" or t != "eigenvalues") and (d != "real" or p != "double"))):
continue continue
if(s in ["scalapack_all", "scalapack_part"] and (g == 1 or t != "eigenvectors" or m != "analytic")): if(s in ["scalapack_all", "scalapack_part"] and (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or t != "eigenvectors" or m != "analytic")):
continue continue
# do not test single-precision scalapack # do not test single-precision scalapack
...@@ -127,7 +128,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key ...@@ -127,7 +128,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue continue
# qr only for 2stage real # qr only for 2stage real
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == 1 or m != "random")): if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or m != "random")):
continue continue
if(spl == "myself" and (d != "real" or p != "double" or q != 0 or m != "random" or (t != "eigenvectors" and t != "cholesky") or lang != "Fortran" or lay != "square")): if(spl == "myself" and (d != "real" or p != "double" or q != 0 or m != "random" or (t != "eigenvectors" and t != "cholesky") or lang != "Fortran" or lay != "square")):
...@@ -154,10 +155,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key ...@@ -154,10 +155,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print("if ENABLE_C_TESTS") print("if ENABLE_C_TESTS")
endifs += 1 endifs += 1
if (g == 1): if (g == "NVIDIA_GPU_ON"):
print("if WITH_NVIDIA_GPU_VERSION") print("if WITH_NVIDIA_GPU_VERSION")
endifs += 1 endifs += 1
if (g == "INTEL_GPU_ON"):
print("if WITH_INTEL_GPU_VERSION")
endifs += 1
if (lay == "all_layouts"): if (lay == "all_layouts"):
print("if WITH_MPI") print("if WITH_MPI")
endifs += 1 endifs += 1
...@@ -190,7 +195,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key ...@@ -190,7 +195,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix=language_flag[lang], langsuffix=language_flag[lang],
d=d, p=p, t=t, s=s, d=d, p=p, t=t, s=s,
kernelsuffix="" if kernel == "nokernel" else "_" + kernel, kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
gpusuffix="gpu_" if g else "", gpusuffix="gpu_" if (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON") else "",
gpuidsuffix="set_gpu_id_" if gid else "", gpuidsuffix="set_gpu_id_" if gid else "",
qrsuffix="qr_" if q else "", qrsuffix="qr_" if q else "",
m=m, m=m,
......
...@@ -110,6 +110,7 @@ module mod_check_for_gpu ...@@ -110,6 +110,7 @@ module mod_check_for_gpu
endif endif
endif endif
success = .true.
#ifdef WITH_NVIDIA_GPU_VERSION #ifdef WITH_NVIDIA_GPU_VERSION
success = cuda_setdevice(use_gpu_id) success = cuda_setdevice(use_gpu_id)
#endif #endif
...@@ -128,7 +129,8 @@ module mod_check_for_gpu ...@@ -128,7 +129,8 @@ module mod_check_for_gpu
if (wantDebugMessage) then if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif endif
success = .true.
#ifdef WITH_NVIDIA_GPU_VERSION #ifdef WITH_NVIDIA_GPU_VERSION
success = cublas_create(cublasHandle) success = cublas_create(cublasHandle)
#endif #endif
...@@ -159,6 +161,7 @@ module mod_check_for_gpu ...@@ -159,6 +161,7 @@ module mod_check_for_gpu
endif endif
endif endif
success = .true.
#ifdef WITH_NVIDIA_GPU_VERSION #ifdef WITH_NVIDIA_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname) ! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices) success = cuda_getdevicecount(numberOfDevices)
......
...@@ -2,7 +2,9 @@ ...@@ -2,7 +2,9 @@
module elpa_gpu module elpa_gpu
use precision use precision
use iso_c_binding use iso_c_binding
#ifdef WITH_INTEL_GPU_VERSION
use mkl_offload
#endif
integer(kind=c_int), parameter :: nvidia_gpu = 1 integer(kind=c_int), parameter :: nvidia_gpu = 1
integer(kind=c_int), parameter :: amd_gpu = 2 integer(kind=c_int), parameter :: amd_gpu = 2
integer(kind=c_int), parameter :: intel_gpu = 3 integer(kind=c_int), parameter :: intel_gpu = 3
......
...@@ -308,6 +308,12 @@ function elpa_solve_evp_& ...@@ -308,6 +308,12 @@ function elpa_solve_evp_&
print *,"Problem getting option for AMD GPU. Aborting..." print *,"Problem getting option for AMD GPU. Aborting..."
stop stop
endif endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else else
gpu = 0 gpu = 0
endif endif
...@@ -318,6 +324,7 @@ function elpa_solve_evp_& ...@@ -318,6 +324,7 @@ function elpa_solve_evp_&
useGPU = .false. useGPU = .false.
endif endif
print *,"after activating gpu..."
call obj%get("is_skewsymmetric",skewsymmetric,error) call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then if (error .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric. Aborting..." print *,"Problem getting option for skewsymmetric. Aborting..."
...@@ -351,6 +358,7 @@ function elpa_solve_evp_& ...@@ -351,6 +358,7 @@ function elpa_solve_evp_&
do_useGPU = .false. do_useGPU = .false.
print *,"before check gpu..."
if (useGPU) then if (useGPU) then
call obj%timer%start("check_for_gpu") call obj%timer%start("check_for_gpu")
...@@ -379,6 +387,7 @@ function elpa_solve_evp_& ...@@ -379,6 +387,7 @@ function elpa_solve_evp_&
endif endif
print *,"after check gpu..."
do_useGPU_tridiag = do_useGPU do_useGPU_tridiag = do_useGPU
do_useGPU_solve_tridi = do_useGPU do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev = do_useGPU do_useGPU_trans_ev = do_useGPU
...@@ -447,7 +456,7 @@ function elpa_solve_evp_& ...@@ -447,7 +456,7 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX #ifdef WITH_NVTX
call nvtxRangePush("tridi") call nvtxRangePush("tridi")
#endif #endif
print *,"before tridiag..."
call tridiag_& call tridiag_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
......
...@@ -501,6 +501,7 @@ subroutine tridiag_& ...@@ -501,6 +501,7 @@ subroutine tridiag_&
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs)) aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif #endif
if (useIntelGPU) then if (useIntelGPU) then
print *,"intel phase aaaaaaaaaaaaaaaaaaaaaaaaaa"
if (wantDebug) call obj%timer%start("mkl_offload") if (wantDebug) call obj%timer%start("mkl_offload")
#if REALCASE == 1 #if REALCASE == 1
aux(1:2*n_stored_vecs) = uv_stored_cols(l_cols+1,1:2*n_stored_vecs) aux(1:2*n_stored_vecs) = uv_stored_cols(l_cols+1,1:2*n_stored_vecs)
...@@ -675,7 +676,8 @@ subroutine tridiag_& ...@@ -675,7 +676,8 @@ subroutine tridiag_&
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successGPU, u_row, u_row_dev, & !$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successGPU, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, & !$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, & !$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU) !$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU, &
!$omp& matrixCols)
my_thread = omp_get_thread_num() my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads() n_threads = omp_get_num_threads()
......
...@@ -187,10 +187,10 @@ last_stripe_width, kernel) ...@@ -187,10 +187,10 @@ last_stripe_width, kernel)
if (wantDebug) then if (wantDebug) then
if (useGPU .and. & if (useGPU .and. &
#if REALCASE == 1 #if REALCASE == 1
( kernel .ne. ELPA_2STAGE_REAL_GPU)) then ( kernel .ne. ELPA_2STAGE_REAL_NVIDIA_GPU)) then
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then ( kernel .ne. ELPA_2STAGE_COMPLEX_NVIDIA_GPU)) then
#endif #endif
print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!" print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop stop
...@@ -198,10 +198,10 @@ last_stripe_width, kernel) ...@@ -198,10 +198,10 @@ last_stripe_width, kernel)
endif endif
#if REALCASE == 1 #if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) then
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) then
#endif #endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) then if (ncols < 1) then
...@@ -263,11 +263,11 @@ last_stripe_width, kernel) ...@@ -263,11 +263,11 @@ last_stripe_width, kernel)
#if REALCASE == 1 #if REALCASE == 1
! GPU kernel real ! GPU kernel real
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) then
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! GPU kernel complex ! GPU kernel complex
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) then
#endif #endif
if (wantDebug) then if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU") call obj%timer%start("compute_hh_trafo: GPU")
......
...@@ -209,7 +209,7 @@ ...@@ -209,7 +209,7 @@
#undef GPU_KERNEL #undef GPU_KERNEL
#undef GENERIC_KERNEL #undef GENERIC_KERNEL
#undef KERNEL_STRING #undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU #define GPU_KERNEL ELPA_2STAGE_REAL_NVIDIA_GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC #define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define KERNEL_STRING "real_kernel" #define KERNEL_STRING "real_kernel"
#endif #endif
...@@ -217,7 +217,7 @@ ...@@ -217,7 +217,7 @@
#undef GPU_KERNEL #undef GPU_KERNEL
#undef GENERIC_KERNEL #undef GENERIC_KERNEL
#undef KERNEL_STRING #undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU #define GPU_KERNEL ELPA_2STAGE_COMPLEX_NVIDIA_GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC #define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
#define KERNEL_STRING "complex_kernel" #define KERNEL_STRING "complex_kernel"
#endif #endif
...@@ -379,6 +379,12 @@ ...@@ -379,6 +379,12 @@
print *,"Problem getting option for AMD GPU. Aborting..." print *,"Problem getting option for AMD GPU. Aborting..."
stop stop
endif endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else else
gpu = 0 gpu = 0
endif endif
......
...@@ -370,7 +370,7 @@ subroutine trans_ev_band_to_full_& ...@@ -370,7 +370,7 @@ subroutine trans_ev_band_to_full_&
if (i > 1) then if (i > 1) then
if (useIntelGPU) then if (useIntelGPU) then
call obj%timer%start("mkl_offload") !call obj%timer%start("mkl_offload")
#if 0 #if 0
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', & call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, hvm, & int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, hvm, &
...@@ -383,7 +383,7 @@ subroutine trans_ev_band_to_full_& ...@@ -383,7 +383,7 @@ subroutine trans_ev_band_to_full_&
int(max_local_rows,kind=BLAS_KIND), hvm(:,(i-1)*nbw+1:), & int(max_local_rows,kind=BLAS_KIND), hvm(:,(i-1)*nbw+1:), &
int(max_local_rows,kind=BLAS_KIND), ZERO, t_tmp, int(cwy_blocking, kind=BLAS_KIND)) int(max_local_rows,kind=BLAS_KIND), ZERO, t_tmp, int(cwy_blocking, kind=BLAS_KIND))
#endif #endif
call obj%timer%stop("mkl_offload") !call obj%timer%stop("mkl_offload")
else else
call obj%timer%start("blas") call obj%timer%start("blas")
...@@ -402,7 +402,7 @@ subroutine trans_ev_band_to_full_& ...@@ -402,7 +402,7 @@ subroutine trans_ev_band_to_full_&
call obj%timer%stop("mpi_communication") call obj%timer%stop("mpi_communication")
if (useIntelGPU) then if (useIntelGPU) then
call obj%timer%start("mkl_offload") !call obj%timer%start("mkl_offload")
#if 0 #if 0
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, & call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND)) int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
...@@ -418,7 +418,7 @@ subroutine trans_ev_band_to_full_& ...@@ -418,7 +418,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete(t_rows+1,t_rows+1), & tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND)) int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
#endif #endif
call obj%timer%stop("mkl_offload") !call obj%timer%stop("mkl_offload")
else else
call obj%timer%start("blas") call obj%timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, & call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
...@@ -432,7 +432,7 @@ subroutine trans_ev_band_to_full_& ...@@ -432,7 +432,7 @@ subroutine trans_ev_band_to_full_&
#else /* WITH_MPI */ #else /* WITH_MPI */
if (useIntelGPU) then if (useIntelGPU) then
call obj%timer%start("mkl_offload") !call obj%timer%start("mkl_offload")
#if 0 #if 0
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, & call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND)) int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
...@@ -448,7 +448,7 @@ subroutine trans_ev_band_to_full_& ...@@ -448,7 +448,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete(t_rows+1,t_rows+1), & tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND)) int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
#endif #endif
call obj%timer%stop("mkl_offload") !call obj%timer%stop("mkl_offload")