Commit 9b6dcccf authored by Andreas Marek's avatar Andreas Marek
Browse files

Some changes to make compile again with oneAPI compiler

parent db0c1416
......@@ -878,12 +878,14 @@ m4_define(elpa_m4_bgq_kernels, [
complex_bgq
])
m4_define(elpa_m4_gpu_kernels, [
real_gpu
complex_gpu
m4_define(elpa_m4_nvidia_gpu_kernels, [
real_nvidia_gpu
complex_nvidia_gpu
real_amd_gpu
complex_amd_gpu
])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq gpu])
m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly sve128 avx avx2 sve256 avx512 sve512 bgp bgq nvidia_gpu])
m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type],
......@@ -927,7 +929,7 @@ ELPA_SELECT_KERNELS([avx512],[enable])
ELPA_SELECT_KERNELS([sve128],[disable])
ELPA_SELECT_KERNELS([sve256],[disable])
ELPA_SELECT_KERNELS([sve512],[disable])
ELPA_SELECT_KERNELS([gpu],[disable])
ELPA_SELECT_KERNELS([nvidia_gpu],[disable])
ELPA_SELECT_KERNELS([bgp],[disable])
ELPA_SELECT_KERNELS([bgq],[disable])
......@@ -976,8 +978,8 @@ if test x"$with_gpu_support_only" = x"yes" ; then
m4_foreach_w([elpa_m4_kernel],elpa_m4_all_kernels,[
use_[]elpa_m4_kernel[]=no
])
use_real_gpu=yes
use_complex_gpu=yes
use_real_nvida_gpu=yes
use_complex_nvidia_gpu=yes
fi
......@@ -1054,7 +1056,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_gpu_kernels,
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sve128_kernels elpa_m4_sve256_kernels elpa_m4_sve512_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels elpa_m4_nvidia_gpu_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[
if test -z "$default_[]elpa_m4_kind[]_kernel"; then
......@@ -1477,8 +1479,8 @@ AC_ARG_ENABLE([Nvidia-gpu],
AC_MSG_RESULT([${use_nvidia_gpu}])
if test x"${use_nvidia_gpu}" = x"yes" ; then
need_nvidia_gpu=yes
use_real_gpu=yes
use_complex_gpu=yes
use_nvidia_real_gpu=yes
use_nvidia_complex_gpu=yes
fi
AC_MSG_CHECKING(whether INTEL GPU version should be used)
......@@ -1514,8 +1516,8 @@ if test x"${use_amd_gpu}" = x"yes" ; then
########################################
# must be changed
#######################################
use_real_gpu=no
use_complex_gpu=no
use_real_amd_gpu=no
use_complex_amd_gpu=no
fi
......@@ -1607,8 +1609,8 @@ AM_CONDITIONAL([WITH_NVIDIA_GPU_VERSION],[test x"$use_real_gpu" = x"yes" -o x"$u
if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_NVIDIA_GPU_VERSION],[1],[enable Nvidia GPU support])
AC_DEFINE([WITH_NVIDIA_GPU_KERNEL],[1],[Nvidia GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=1
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=1
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
......@@ -1633,8 +1635,8 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_LANG_POP([C])
fi
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED=0
ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED=0
fi
......@@ -1671,10 +1673,8 @@ else
ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED=0
ELPA_2STAGE_REAL_AMD_GPU_COMPILED=0
fi
AC_SUBST([ELPA_2STAGE_COMPLEX_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_COMPLEX_AMD_GPU_COMPILED])
AC_SUBST([ELPA_2STAGE_REAL_AMD_GPU_COMPILED])
AM_CONDITIONAL([WITH_INTEL_GPU_VERSION],[test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes"])
if test x"$use_real_intel_gpu" = x"yes" -o x"$use_complex_intel_gpu" = x"yes" ; then
......@@ -2021,7 +2021,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=no
need_manual_cpp=yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
......
......@@ -50,7 +50,7 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_AVX512_BLOCK2, 15, @ELPA_2STAGE_REAL_AVX512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK4, 16, @ELPA_2STAGE_REAL_AVX512_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_AVX512_BLOCK6, 17, @ELPA_2STAGE_REAL_AVX512_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GPU, 18, @ELPA_2STAGE_REAL_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_NVIDIA_GPU, 18, @ELPA_2STAGE_REAL_NVIDIA_GPU_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
......@@ -104,7 +104,7 @@ enum ELPA_REAL_KERNELS {
X(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2, 19, @ELPA_2STAGE_COMPLEX_SVE512_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1, 20, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2, 21, @ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_COMPLEX_GPU, 22, @ELPA_2STAGE_COMPLEX_GPU_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_COMPLEX_NVIDIA_GPU, 22, @ELPA_2STAGE_COMPLEX_NVIDIA_GPU_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(X) \
......
......@@ -21,8 +21,9 @@ solver_flag = {
"scalapack_part": "-DTEST_SCALAPACK_PART",
}
gpu_flag = {
0: "-DTEST_GPU=0",
1: "-DTEST_GPU=1",
"GPU_OFF": "-DTEST_NVIDIA_GPU=0 -DTEST_INTEL_GPU=0",
"NVIDIA_GPU_ON": "-DTEST_NVIDIA_GPU=1",
"INTEL_GPU_ON": "-DTEST_INTEL_GPU=1",
}
gpu_id_flag = {
0: "-DTEST_GPU_SET_ID=0",
......@@ -86,14 +87,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
# exclude some test combinations
# analytic tests only for "eigenvectors" and not on GPU
if(m == "analytic" and (g == 1 or t != "eigenvectors")):
if(m == "analytic" and ( g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or t != "eigenvectors")):
continue
# Frank tests only for "eigenvectors" and eigenvalues and real double precision case
if(m == "frank" and ((t != "eigenvectors" or t != "eigenvalues") and (d != "real" or p != "double"))):
continue
if(s in ["scalapack_all", "scalapack_part"] and (g == 1 or t != "eigenvectors" or m != "analytic")):
if(s in ["scalapack_all", "scalapack_part"] and (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON" or t != "eigenvectors" or m != "analytic")):
continue
# do not test single-precision scalapack
......@@ -127,7 +128,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
continue
# qr only for 2stage real
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == 1 or m != "random")):
if (q == 1 and (s != "2stage" or d != "real" or t != "eigenvectors" or g == "NVIDIA_GPU_ON" or "INTEL_GPU_ON" or m != "random")):
continue
if(spl == "myself" and (d != "real" or p != "double" or q != 0 or m != "random" or (t != "eigenvectors" and t != "cholesky") or lang != "Fortran" or lay != "square")):
......@@ -154,10 +155,14 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
print("if ENABLE_C_TESTS")
endifs += 1
if (g == 1):
if (g == "NVIDIA_GPU_ON"):
print("if WITH_NVIDIA_GPU_VERSION")
endifs += 1
if (g == "INTEL_GPU_ON"):
print("if WITH_INTEL_GPU_VERSION")
endifs += 1
if (lay == "all_layouts"):
print("if WITH_MPI")
endifs += 1
......@@ -190,7 +195,7 @@ for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.key
langsuffix=language_flag[lang],
d=d, p=p, t=t, s=s,
kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
gpusuffix="gpu_" if g else "",
gpusuffix="gpu_" if (g == "NVIDIA_GPU_ON" or g == "INTEL_GPU_ON") else "",
gpuidsuffix="set_gpu_id_" if gid else "",
qrsuffix="qr_" if q else "",
m=m,
......
......@@ -110,6 +110,7 @@ module mod_check_for_gpu
endif
endif
success = .true.
#ifdef WITH_NVIDIA_GPU_VERSION
success = cuda_setdevice(use_gpu_id)
#endif
......@@ -128,7 +129,8 @@ module mod_check_for_gpu
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif
success = .true.
#ifdef WITH_NVIDIA_GPU_VERSION
success = cublas_create(cublasHandle)
#endif
......@@ -159,6 +161,7 @@ module mod_check_for_gpu
endif
endif
success = .true.
#ifdef WITH_NVIDIA_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices)
......
......@@ -2,7 +2,9 @@
module elpa_gpu
use precision
use iso_c_binding
#ifdef WITH_INTEL_GPU_VERSION
use mkl_offload
#endif
integer(kind=c_int), parameter :: nvidia_gpu = 1
integer(kind=c_int), parameter :: amd_gpu = 2
integer(kind=c_int), parameter :: intel_gpu = 3
......
......@@ -308,6 +308,12 @@ function elpa_solve_evp_&
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu = 0
endif
......@@ -318,6 +324,7 @@ function elpa_solve_evp_&
useGPU = .false.
endif
print *,"after activating gpu..."
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for skewsymmetric. Aborting..."
......@@ -351,6 +358,7 @@ function elpa_solve_evp_&
do_useGPU = .false.
print *,"before check gpu..."
if (useGPU) then
call obj%timer%start("check_for_gpu")
......@@ -379,6 +387,7 @@ function elpa_solve_evp_&
endif
print *,"after check gpu..."
do_useGPU_tridiag = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev = do_useGPU
......@@ -447,7 +456,7 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
call nvtxRangePush("tridi")
#endif
print *,"before tridiag..."
call tridiag_&
&MATH_DATATYPE&
&_&
......
......@@ -501,6 +501,7 @@ subroutine tridiag_&
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
if (useIntelGPU) then
print *,"intel phase aaaaaaaaaaaaaaaaaaaaaaaaaa"
if (wantDebug) call obj%timer%start("mkl_offload")
#if REALCASE == 1
aux(1:2*n_stored_vecs) = uv_stored_cols(l_cols+1,1:2*n_stored_vecs)
......@@ -675,7 +676,8 @@ subroutine tridiag_&
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successGPU, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU)
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat, useIntelGPU, &
!$omp& matrixCols)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
......
......@@ -187,10 +187,10 @@ last_stripe_width, kernel)
if (wantDebug) then
if (useGPU .and. &
#if REALCASE == 1
( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
( kernel .ne. ELPA_2STAGE_REAL_NVIDIA_GPU)) then
#endif
#if COMPLEXCASE == 1
( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
( kernel .ne. ELPA_2STAGE_COMPLEX_NVIDIA_GPU)) then
#endif
print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
......@@ -198,10 +198,10 @@ last_stripe_width, kernel)
endif
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) then
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) then
......@@ -263,11 +263,11 @@ last_stripe_width, kernel)
#if REALCASE == 1
! GPU kernel real
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) then
#endif
#if COMPLEXCASE == 1
! GPU kernel complex
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) then
#endif
if (wantDebug) then
call obj%timer%start("compute_hh_trafo: GPU")
......
......@@ -209,7 +209,7 @@
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GPU_KERNEL ELPA_2STAGE_REAL_NVIDIA_GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define KERNEL_STRING "real_kernel"
#endif
......@@ -217,7 +217,7 @@
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_NVIDIA_GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
#define KERNEL_STRING "complex_kernel"
#endif
......@@ -379,6 +379,12 @@
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu = 0
endif
......
......@@ -370,7 +370,7 @@ subroutine trans_ev_band_to_full_&
if (i > 1) then
if (useIntelGPU) then
call obj%timer%start("mkl_offload")
!call obj%timer%start("mkl_offload")
#if 0
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, hvm, &
......@@ -383,7 +383,7 @@ subroutine trans_ev_band_to_full_&
int(max_local_rows,kind=BLAS_KIND), hvm(:,(i-1)*nbw+1:), &
int(max_local_rows,kind=BLAS_KIND), ZERO, t_tmp, int(cwy_blocking, kind=BLAS_KIND))
#endif
call obj%timer%stop("mkl_offload")
!call obj%timer%stop("mkl_offload")
else
call obj%timer%start("blas")
......@@ -402,7 +402,7 @@ subroutine trans_ev_band_to_full_&
call obj%timer%stop("mpi_communication")
if (useIntelGPU) then
call obj%timer%start("mkl_offload")
!call obj%timer%start("mkl_offload")
#if 0
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
......@@ -418,7 +418,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp2, int(cwy_blocking,kind=BLAS_KIND))
#endif
call obj%timer%stop("mkl_offload")
!call obj%timer%stop("mkl_offload")
else
call obj%timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
......@@ -432,7 +432,7 @@ subroutine trans_ev_band_to_full_&
#else /* WITH_MPI */
if (useIntelGPU) then
call obj%timer%start("mkl_offload")
!call obj%timer%start("mkl_offload")
#if 0
call PRECISION_TRMM('L', 'U', 'N', 'N', int(t_rows,kind=BLAS_KIND), int(t_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
......@@ -448,7 +448,7 @@ subroutine trans_ev_band_to_full_&
tmat_complete(t_rows+1,t_rows+1), &
int(cwy_blocking,kind=BLAS_KIND), t_tmp, int(cwy_blocking,kind=BLAS_KIND))
#endif
call obj%timer%stop("mkl_offload")
!call obj%timer%stop("mkl_offload")
else
call obj%timer%start("blas")
......@@ -471,7 +471,7 @@ subroutine trans_ev_band_to_full_&
if (l_rows>0) then
if (useGPU) then
if (useIntelGPU) then
call obj%timer%start("mkl_offload")
!call obj%timer%start("mkl_offload")
#if 0
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), int(l_rows,kind=BLAS_KIND), ONE, &
......@@ -484,7 +484,7 @@ subroutine trans_ev_band_to_full_&
hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), q_mat, int(ldq,kind=BLAS_KIND), ZERO, tmp1, &
int(n_cols,kind=BLAS_KIND))
#endif
call obj%timer%stop("mkl_offload")
!call obj%timer%stop("mkl_offload")
else
successGPU = gpu_memcpy(hvm_dev, int(loc(hvm),kind=c_intptr_t), &
......@@ -525,7 +525,7 @@ subroutine trans_ev_band_to_full_&
if (l_rows>0) then
if (useGPU) then
if (useIntelGPU) then
call obj%timer%start("mkl_offload")
!call obj%timer%start("mkl_offload")
call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', &
int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), tmp2, int(n_cols,kind=BLAS_KIND))
......@@ -533,7 +533,7 @@ subroutine trans_ev_band_to_full_&
int(n_cols,kind=BLAS_KIND), -ONE, hvm, &
int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp2, int(n_cols,kind=BLAS_KIND), ONE, &
q_mat, int(ldq,kind=BLAS_KIND))
call obj%timer%stop("mkl_offload")
!call obj%timer%stop("mkl_offload")
else
successGPU = gpu_memcpy(tmp_dev, int(loc(tmp2),kind=c_intptr_t), &
......@@ -569,7 +569,7 @@ subroutine trans_ev_band_to_full_&
if (useGPU) then
if (useIntelGPU) then
#if 0
call obj%timer%start("mkl_offload")
!call obj%timer%start("mkl_offload")
call PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', &
int(n_cols,kind=BLAS_KIND), int(l_cols,kind=BLAS_KIND), ONE, tmat_complete, &
int(cwy_blocking,kind=BLAS_KIND), &
......@@ -587,7 +587,7 @@ subroutine trans_ev_band_to_full_&
-ONE, hvm, int(ubound(hvm,dim=1),kind=BLAS_KIND), tmp1, int(n_cols,kind=BLAS_KIND), ONE, q_mat, &
int(ldq,kind=BLAS_KIND))
#endif
call obj%timer%stop("mkl_offload")
!call obj%timer%stop("mkl_offload")
else
successGPU = gpu_memcpy(tmat_dev, int(loc(tmat_complete),kind=c_intptr_t), &
cwy_blocking*cwy_blocking*size_of_datatype, gpuMemcpyHostToDevice)
......
......@@ -767,8 +767,14 @@ static const char *real_kernel_name(int kernel) {
}
}
#define REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_GPU ? gpu_is_active : 1
#define REAL_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_NVIDIA_GPU ? gpu_is_active : 1
#define REAL_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_AMD_GPU ? gpu_is_active : 1
#define REAL_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_REAL_INTEL_GPU ? gpu_is_active : 1
static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL);
......@@ -777,7 +783,9 @@ static int real_kernel_is_valid(elpa_index_t index, int n, int new_value) {
}
int gpu_is_active = (elpa_index_get_int_value(index, "nvidia-gpu", NULL) || elpa_index_get_int_value(index, "amd-gpu", NULL) || elpa_index_get_int_value(index, "intel-gpu", NULL));
switch(new_value) {
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
//ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
//ELPA_FOR_ALL_2STAGE_REAL_KERNELS(VALID_CASE_3, REAL_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
default:
return 0;
}
......@@ -806,8 +814,14 @@ static const char *complex_kernel_name(int kernel) {
}
}
#define COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_GPU ? gpu_is_active : 1
#define COMPLEX_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_NVIDIA_GPU ? gpu_is_active : 1
#define COMPLEX_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_AMD_GPU ? gpu_is_active : 1
#define COMPLEX_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE(kernel_number) \
kernel_number == ELPA_2STAGE_COMPLEX_INTEL_GPU ? gpu_is_active : 1
static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
int solver = elpa_index_get_int_value(index, "solver", NULL);
......@@ -816,7 +830,9 @@ static int complex_kernel_is_valid(elpa_index_t index, int n, int new_value) {
}
int gpu_is_active = (elpa_index_get_int_value(index, "nvidia-gpu", NULL) || elpa_index_get_int_value(index, "amd-gpu", NULL) || elpa_index_get_int_value(index, "intel-gpu", NULL));
switch(new_value) {
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_NVIDIA_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
//ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_AMD_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
// ELPA_FOR_ALL_2STAGE_COMPLEX_KERNELS(VALID_CASE_3, COMPLEX_INTEL_GPU_KERNEL_ONLY_WHEN_GPU_IS_ACTIVE)
default:
return 0;
}
......
......@@ -103,6 +103,9 @@
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef cublas_PRECISION_SYMV
#undef mkl_offload_PRECISION_GEMM
#undef mkl_offload_PRECISION_GEMV
#undef mkl_offload_PRECISION_TRMM
#undef scal_PRECISION_GEMM
#undef scal_PRECISION_NRM2
#undef scal_PRECISION_LASET
......@@ -177,6 +180,9 @@
#define cublas_PRECISION_TRMM cublas_DTRMM
#define cublas_PRECISION_GEMV cublas_DGEMV
#define cublas_PRECISION_SYMV cublas_DSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_DGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_DGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_DTRMM
#define scal_PRECISION_GEMM PDGEMM
#define scal_PRECISION_NRM2 PDNRM2
#define scal_PRECISION_LASET PDLASET
......@@ -248,6 +254,9 @@
#define cublas_PRECISION_TRMM cublas_STRMM
#define cublas_PRECISION_GEMV cublas_SGEMV
#define cublas_PRECISION_SYMV cublas_SSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_SGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_SGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_STRMM
#define scal_PRECISION_GEMM PSGEMM
#define scal_PRECISION_NRM2 PSNRM2
#define scal_PRECISION_LASET PSLASET
......@@ -332,6 +341,9 @@
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef cublas_PRECISION_SYMV
#undef mkl_offload_PRECISION_GEMM
#undef mkl_offload_PRECISION_GEMV
#undef mkl_offload_PRECISION_TRMM
#undef scal_PRECISION_GEMM
#undef scal_PRECISION_DOTC
#undef scal_PRECISION_LASET
......@@ -416,6 +428,9 @@
#define cublas_PRECISION_TRMM cublas_ZTRMM
#define cublas_PRECISION_GEMV cublas_ZGEMV
#define cublas_PRECISION_SYMV cublas_ZSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_ZGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_ZGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_ZTRMM
#define scal_PRECISION_GEMM PZGEMM
#define scal_PRECISION_DOTC PZDOTC
#define scal_PRECISION_LASET PZLASET
......@@ -491,6 +506,9 @@
#define cublas_PRECISION_TRMM cublas_CTRMM
#define cublas_PRECISION_GEMV cublas_CGEMV
#define cublas_PRECISION_SYMV cublas_CSYMV
#define mkl_offload_PRECISION_GEMM mkl_offload_CGEMM
#define mkl_offload_PRECISION_GEMV mkl_offload_CGEMV
#define mkl_offload_PRECISION_TRMM mkl_offload_CTRMM
#define scal_PRECISION_GEMM PCGEMM
#define scal_PRECISION_DOTC PCDOTC
#define scal_PRECISION_LASET PCLASET
......
......@@ -90,7 +90,7 @@ module simd_kernel
realKernels_to_simdTable(ELPA_2STAGE_REAL_SVE512_BLOCK2) = SVE512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SVE512_BLOCK4) = SVE512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SVE512_BLOCK6) = SVE512_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_GPU) = NVIDIA_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_NVIDIA_GPU) = NVIDIA_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK2) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK4) = SPARC_INSTR
realKernels_to_simdTable(ELPA_2STAGE_REAL_SPARC64_BLOCK6) = SPARC_INSTR
......@@ -123,7 +123,7 @@ module simd_kernel
simdTable_to_realKernels(AVX_INSTR) = ELPA_2STAGE_REAL_AVX_BLOCK2
simdTable_to_realKernels(AVX2_INSTR) = ELPA_2STAGE_REAL_AVX2_BLOCK2
simdTable_to_realKernels(AVX512_INSTR) = ELPA_2STAGE_REAL_AVX512_BLOCK2
simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_GPU
simdTable_to_realKernels(NVIDIA_INSTR) = ELPA_2STAGE_REAL_NVIDIA_GPU
simdTable_to_realKernels(SPARC_INSTR) = ELPA_2STAGE_REAL_SPARC64_BLOCK2
simdTable_to_realKernels(ARCH64_INSTR) = ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2
simdTable_to_realKernels(VSX_INSTR) = ELPA_2STAGE_REAL_VSX_BLOCK2
......@@ -163,7 +163,7 @@ module simd_kernel
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_SVE512_BLOCK2) = SVE512_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1) = ARCH64_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK2) = ARCH64_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_GPU) = NVIDIA_INSTR
complexKernels_to_simdTable(ELPA_2STAGE_COMPLEX_NVIDIA_GPU) = NVIDIA_INSTR
simd_set_index = complexKernels_to_simdTable(kernel)
......@@ -187,7 +187,7 @@ module simd_kernel
simdTable_to_complexKernels(SVE256_INSTR) = ELPA_2STAGE_COMPLEX_SVE256_BLOCK1
simdTable_to_complexKernels(SVE512_INSTR) = ELPA_2STAGE_COMPLEX_SVE512_BLOCK1
simdTable_to_complexKernels(ARCH64_INSTR) = ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1
simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_GPU
simdTable_to_complexKernels(NVIDIA_INSTR) = ELPA_2STAGE_COMPLEX_NVIDIA_GPU
kernel = simdTable_to_complexKernels(simd_set_index)
......
......@@ -273,8 +273,21 @@ int main(int argc, char** argv) {
#endif
assert_elpa_ok(error_elpa);