Commit 979c5985 authored by Andreas Marek
Browse files

Forgot to set rocBlas handle

With this commit, ELPA runs correctly on AMD GPUs.
Functionality and correctness tests have been carried out
on an AMD MI100.
parent 4a4d6051
......@@ -1532,22 +1532,22 @@ if test x"${use_nvidia_gpu}" = x"yes" ; then
use_nvidia_complex_gpu=yes
fi
AC_MSG_CHECKING(whether INTEL GPU version should be used)
AC_ARG_ENABLE([intel-gpu],
AS_HELP_STRING([--enable-intel-gpu],
[do use INTEL GPU version]),
[if test x"$enableval" = x"yes"; then
use_intel_gpu=yes
else
use_intel_gpu=no
fi],
[use_intel_gpu=no])
AC_MSG_RESULT([${use_intel_gpu}])
if test x"${use_intel_gpu}" = x"yes" ; then
need_intel_gpu=yes
use_real_intel_gpu=yes
use_complex_intel_gpu=yes
fi
#AC_MSG_CHECKING(whether INTEL GPU version should be used)
#AC_ARG_ENABLE([intel-gpu],
# AS_HELP_STRING([--enable-intel-gpu],
# [do use INTEL GPU version]),
# [if test x"$enableval" = x"yes"; then
# use_intel_gpu=yes
# else
# use_intel_gpu=no
# fi],
# [use_intel_gpu=no])
#AC_MSG_RESULT([${use_intel_gpu}])
#if test x"${use_intel_gpu}" = x"yes" ; then
# need_intel_gpu=yes
# use_real_intel_gpu=yes
# use_complex_intel_gpu=yes
#fi
AC_MSG_CHECKING(whether AMD-GPU version should be used)
AC_ARG_ENABLE([AMD-gpu],
......@@ -2094,7 +2094,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=yes
need_manual_cpp=no
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
......
......@@ -146,9 +146,10 @@ module mod_check_for_gpu
#endif
stop 1
endif
else
#ifdef WITH_NVIDIA_GPU_VERSION
#if defined(WITH_NVIDIA_GPU_VERSION) || !defined(WITH_AMD_GPU_VERSION)
if (cublasHandle .ne. -1) then
#endif
#ifdef WITH_AMD_GPU_VERSION
......
......@@ -453,11 +453,11 @@ subroutine trans_ev_&
call obj%timer%stop("mkl_offload")
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, &
q_dev, ldq, ZERO, tmp_dev, nstor)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else ! useGPU
......@@ -559,7 +559,7 @@ subroutine trans_ev_&
call obj%timer%stop("mkl_offload")
#endif /* WITH_MPI */
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_TRMM('L', 'L', 'N', 'N', &
nstor, l_cols, ONE, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
......@@ -567,7 +567,7 @@ subroutine trans_ev_&
call gpublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
-ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, &
ONE, q_dev, ldq)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else !useGPU
#ifdef WITH_MPI
......
......@@ -782,7 +782,7 @@ subroutine tridiag_&
! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication
! this requires altering of the algorithm when later explicitly updating the matrix
! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal
if (wantDebug) call obj%timer%start("cublas")
if (wantDebug) call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
ONE, a_dev, matrixRows, &
v_row_dev , 1, &
......@@ -797,7 +797,7 @@ subroutine tridiag_&
! ONE, u_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1)
! endif
if (wantDebug) call obj%timer%stop("cublas")
if (wantDebug) call obj%timer%stop("gpublas")
endif
else ! mat_vec_as_one_block
!perform multiplication by stripes - it is faster than by blocks, since we call cublas with
......@@ -1096,7 +1096,7 @@ subroutine tridiag_&
else
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we then use the same calls as for CPU version
if (wantDebug) call obj%timer%start("cublas")
if (wantDebug) call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev + (l_row_beg - 1) * &
......@@ -1105,7 +1105,7 @@ subroutine tridiag_&
size_of_datatype, &
max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * matrixRows) * &
size_of_datatype , matrixRows)
if (wantDebug) call obj%timer%stop("cublas")
if (wantDebug) call obj%timer%stop("gpublas")
endif
endif
else !useGPU
......@@ -1147,12 +1147,12 @@ subroutine tridiag_&
else
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
if (wantDebug) call obj%timer%start("cublas")
if (wantDebug) call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev, max_local_rows, &
uv_stored_cols_dev, max_local_cols, &
ONE, a_dev, matrixRows)
if (wantDebug) call obj%timer%stop("cublas")
if (wantDebug) call obj%timer%stop("gpublas")
endif
endif
endif
......
......@@ -360,11 +360,11 @@
aux_off = (lrs-1)*size_of_datatype
b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nstor, lce-lcs+1, &
lre-lrs+1, ONE, aux_dev+aux_off, l_rows, b_dev+b_off, ldb, ZERO, &
tmp1_dev, nstor)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
num = nstor*(lce-lcs+1)*size_of_datatype
successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
......
......@@ -330,8 +330,8 @@ module hip_c_kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: row_group_dev
#ifdef WITH_AMD_GPU_VERSION
call launch_my_pack_c_hip_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev, a_dev, &
row_group_dev)
call launch_my_pack_c_hip_kernel_real_double(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev, &
a_dev, row_group_dev)
#endif
end subroutine
......@@ -344,8 +344,8 @@ module hip_c_kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: row_group_dev
#ifdef WITH_AMD_GPU_VERSION
call launch_my_pack_c_hip_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev, a_dev, &
row_group_dev)
call launch_my_pack_c_hip_kernel_real_single(row_count, n_offset, max_idx,stripe_width, a_dim2, stripe_count, l_nev, &
a_dev, row_group_dev)
#endif
end subroutine
#endif
......@@ -426,8 +426,8 @@ module hip_c_kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: row_group_dev
#ifdef WITH_AMD_GPU_VERSION
call launch_my_pack_c_hip_kernel_complex_single(row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, a_dev, &
row_group_dev)
call launch_my_pack_c_hip_kernel_complex_single(row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
a_dev, row_group_dev)
#endif
end subroutine
#endif
......
......@@ -1187,7 +1187,7 @@ max_threads)
call obj%timer%stop("mkl_offload")
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
lce-lcs+1, n_cols, lre, &
ONE, (a_dev + ((lcs-1)*lda* &
......@@ -1197,10 +1197,10 @@ max_threads)
size_of_datatype), &
cur_l_cols)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
if(i==0) cycle
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
lre = min(l_rows,i*l_rows_tile)
if (isSkewsymmetric) then
......@@ -1222,7 +1222,7 @@ max_threads)
size_of_datatype), &
cur_l_rows)
endif
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else ! useGPU
......@@ -1419,13 +1419,13 @@ max_threads)
nbw*nbw*size_of_datatype,gpuMemcpyHostToDevice)
check_memcpy_gpu("bandred: tmat -> tmat_dev ", successGPU)
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
n_cols, n_cols, l_cols, ONE, umc_dev, cur_l_cols, &
(umc_dev+(cur_l_cols * n_cols )*size_of_datatype),cur_l_cols, &
......@@ -1433,7 +1433,7 @@ max_threads)
call gpublas_PRECISION_TRMM('Right', 'Upper', BLAS_TRANS_OR_CONJ, 'Nonunit', &
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
successGPU = gpu_memcpy(int(loc(vav),kind=c_intptr_t), &
vav_dev, nbw*nbw*size_of_datatype, gpuMemcpyDeviceToHost)
......@@ -1538,7 +1538,7 @@ max_threads)
endif
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
if (isSkewsymmetric) then
call gpublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
#if REALCASE == 1
......@@ -1564,7 +1564,7 @@ max_threads)
cur_l_cols, vav_dev,nbw, &
ONE, umc_dev, cur_l_cols)
endif
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
successGPU = gpu_memcpy(int(loc(umcGPU(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype, gpuMemcpyDeviceToHost)
......@@ -1689,7 +1689,7 @@ max_threads)
print *,"this should never happen"
stop
endif
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
lce-lcs+1, 2*n_cols, -ONE, &
......@@ -1697,7 +1697,7 @@ max_threads)
size_of_datatype), &
cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
size_of_datatype), lda)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else
call obj%timer%start("blas")
......@@ -1731,7 +1731,7 @@ max_threads)
call obj%timer%stop("mkl_offload")
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
lre, lce-lcs+1, 2*n_cols, -ONE, &
......@@ -1739,7 +1739,7 @@ max_threads)
size_of_datatype), &
cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
size_of_datatype), lda)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else ! useGPU
......
......@@ -491,11 +491,11 @@ subroutine trans_ev_band_to_full_&
max_local_rows*cwy_blocking*size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_gpu("trans_ev_band_to_full: hvm -> hvm_dev", successGPU)
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
n_cols, l_cols, l_rows, ONE, hvm_dev, max_local_rows, &
q_dev, ldq , ZERO, tmp_dev, n_cols)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
#ifdef WITH_MPI
! copy data from device to host for a later MPI_ALLREDUCE
......@@ -544,12 +544,12 @@ subroutine trans_ev_band_to_full_&
cwy_blocking*cwy_blocking*size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_gpu("trans_ev_band_to_full: tmat_complete -> tmat_dev", successGPU)
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', &
n_cols, l_cols, ONE, tmat_dev, cwy_blocking, tmp_dev, n_cols)
call gpublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -ONE, hvm_dev, max_local_rows, tmp_dev, &
n_cols, ONE, q_dev, ldq)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else
call obj%timer%start("blas")
......@@ -593,13 +593,13 @@ subroutine trans_ev_band_to_full_&
cwy_blocking*cwy_blocking*size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_gpu("trans_ev_band_to_full: tmat_complete -> tmat_dev", successGPU)
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_TRMM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', &
n_cols, l_cols, ONE, tmat_dev, cwy_blocking, &
tmp_dev, n_cols)
call gpublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, &
-ONE, hvm_dev, max_local_rows, tmp_dev, n_cols, ONE, q_dev, ldq)
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else
call obj%timer%start("blas")
......
......@@ -836,12 +836,12 @@
#endif
call obj%timer%stop("mkl_offload")
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, &
1.0_rk, qtmp1_dev, ubound(qtmp1,dim=1), &
ev_dev, ubound(ev,dim=1), &
1.0_rk, qtmp2_dev, ubound(qtmp2,dim=1))
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else
call obj%timer%start("blas")
......@@ -898,12 +898,12 @@
call obj%timer%stop("mkl_offload")
else
call obj%timer%start("cublas")
call obj%timer%start("gpublas")
call gpublas_PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, &
1.0_rk, qtmp1_dev + l_rnm * size_of_datatype, ubound(qtmp1,dim=1), &
ev_dev, ubound(ev,dim=1), &
1.0_rk, qtmp2_dev + l_rnm * size_of_datatype, ubound(qtmp2,dim=1))
call obj%timer%stop("cublas")
call obj%timer%stop("gpublas")
endif
else
call obj%timer%start("blas")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment