Commit 7ea680e7 authored by Andreas Marek's avatar Andreas Marek
Browse files

GPU layer for elpa1_trans_ev_tridi

parent a9e5226e
......@@ -61,15 +61,15 @@ module cuda_functions
! TODO global variable, has to be changed
integer(kind=C_intptr_T) :: cublasHandle = -1
integer(kind=c_intptr_t), parameter :: size_of_double_real = 8_rk8
#ifdef WANT_SINGLE_PRECISION_REAL
integer(kind=c_intptr_t), parameter :: size_of_single_real = 4_rk4
#endif
integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8
#ifdef WANT_SINGLE_PRECISION_COMPLEX
integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
#endif
! integer(kind=c_intptr_t), parameter :: size_of_double_real = 8_rk8
!#ifdef WANT_SINGLE_PRECISION_REAL
! integer(kind=c_intptr_t), parameter :: size_of_single_real = 4_rk4
!#endif
!
! integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8
!#ifdef WANT_SINGLE_PRECISION_COMPLEX
! integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
!#endif
! functions to set and query the CUDA devices
interface
......
......@@ -61,15 +61,15 @@ module hip_functions
! TODO global variable, has to be changed
integer(kind=C_intptr_T) :: rocblasHandle = -1
integer(kind=c_intptr_t), parameter :: size_of_double_real = 8_rk8
#ifdef WANT_SINGLE_PRECISION_REAL
integer(kind=c_intptr_t), parameter :: size_of_single_real = 4_rk4
#endif
integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8
#ifdef WANT_SINGLE_PRECISION_COMPLEX
integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
#endif
! integer(kind=c_intptr_t), parameter :: size_of_double_real = 8_rk8
!#ifdef WANT_SINGLE_PRECISION_REAL
! integer(kind=c_intptr_t), parameter :: size_of_single_real = 4_rk4
!#endif
!
! integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8
!#ifdef WANT_SINGLE_PRECISION_COMPLEX
! integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
!#endif
! functions to set and query the HIP devices
interface
......
#include "config-f90.h"
module elpa_gpu
use precision
use iso_c_binding
integer(kind=c_int), parameter :: nvidia_gpu = 1
......@@ -13,6 +14,15 @@ module elpa_gpu
integer(kind=c_int) :: gpuHostRegisterMapped
integer(kind=c_int) :: gpuHostRegisterPortable
integer(kind=c_intptr_t), parameter :: size_of_double_real = 8_rk8
#ifdef WANT_SINGLE_PRECISION_REAL
integer(kind=c_intptr_t), parameter :: size_of_single_real = 4_rk4
#endif
integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8
#ifdef WANT_SINGLE_PRECISION_COMPLEX
integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
#endif
contains
function gpu_vendor() result(vendor)
use precision
......@@ -405,4 +415,98 @@ module elpa_gpu
end subroutine
!> Vendor-agnostic wrapper for double-precision real TRMM on the GPU.
!! Dispatches to cuBLAS on NVIDIA hardware or rocBLAS on AMD hardware,
!! based on the module-level use_gpu_vendor flag; for any other vendor
!! value the call is a silent no-op (matching the other gpublas wrappers).
subroutine gpublas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  ! Standard BLAS TRMM control characters: side ('L'/'R'), uplo ('U'/'L'),
  ! trans ('N'/'T'/'C'), diag ('U'/'N').
  character(1, c_char), value     :: side, uplo, trans, diag
  integer(kind=c_int)             :: m, n
  integer(kind=c_int), intent(in) :: lda, ldb
  real(kind=c_double)             :: alpha
  ! a, b carry device-memory addresses as integers (presumably returned by
  ! gpu_malloc), not host arrays — TODO confirm against allocation sites.
  integer(kind=c_intptr_t)        :: a, b

  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  end if
end subroutine gpublas_dtrmm
!> Vendor-agnostic wrapper for single-precision real TRMM on the GPU.
!! Dispatches to cuBLAS on NVIDIA hardware or rocBLAS on AMD hardware,
!! based on the module-level use_gpu_vendor flag; for any other vendor
!! value the call is a silent no-op (matching the other gpublas wrappers).
subroutine gpublas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  ! Standard BLAS TRMM control characters: side ('L'/'R'), uplo ('U'/'L'),
  ! trans ('N'/'T'/'C'), diag ('U'/'N').
  character(1, c_char), value     :: side, uplo, trans, diag
  integer(kind=c_int)             :: m, n
  integer(kind=c_int), intent(in) :: lda, ldb
  real(kind=c_float)              :: alpha
  ! a, b carry device-memory addresses as integers (presumably returned by
  ! gpu_malloc), not host arrays — TODO confirm against allocation sites.
  integer(kind=c_intptr_t)        :: a, b

  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  end if
end subroutine gpublas_strmm
!> Vendor-agnostic wrapper for double-precision complex TRMM on the GPU.
!! Dispatches to cuBLAS on NVIDIA hardware or rocBLAS on AMD hardware,
!! based on the module-level use_gpu_vendor flag; for any other vendor
!! value the call is a silent no-op (matching the other gpublas wrappers).
subroutine gpublas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  ! Standard BLAS TRMM control characters: side ('L'/'R'), uplo ('U'/'L'),
  ! trans ('N'/'T'/'C'), diag ('U'/'N').
  character(1, c_char), value     :: side, uplo, trans, diag
  integer(kind=c_int)             :: m, n
  integer(kind=c_int), intent(in) :: lda, ldb
  complex(kind=c_double_complex)  :: alpha
  ! a, b carry device-memory addresses as integers (presumably returned by
  ! gpu_malloc), not host arrays — TODO confirm against allocation sites.
  integer(kind=c_intptr_t)        :: a, b

  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  end if
end subroutine gpublas_ztrmm
!> Vendor-agnostic wrapper for single-precision complex TRMM on the GPU.
!! Dispatches to cuBLAS on NVIDIA hardware or rocBLAS on AMD hardware,
!! based on the module-level use_gpu_vendor flag; for any other vendor
!! value the call is a silent no-op (matching the other gpublas wrappers).
subroutine gpublas_ctrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  ! Standard BLAS TRMM control characters: side ('L'/'R'), uplo ('U'/'L'),
  ! trans ('N'/'T'/'C'), diag ('U'/'N').
  character(1, c_char), value     :: side, uplo, trans, diag
  integer(kind=c_int)             :: m, n
  integer(kind=c_int), intent(in) :: lda, ldb
  complex(kind=c_float_complex)   :: alpha
  ! a, b carry device-memory addresses as integers (presumably returned by
  ! gpu_malloc), not host arrays — TODO confirm against allocation sites.
  integer(kind=c_intptr_t)        :: a, b

  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_ctrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_ctrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
  end if
end subroutine gpublas_ctrmm
end module
......@@ -94,10 +94,12 @@ subroutine trans_ev_&
&PRECISION &
(obj, na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use hip_functions
use, intrinsic :: iso_c_binding
use precision
use elpa_abstract_impl
use elpa_blas_interfaces
use elpa_gpu
implicit none
#include "../general/precision_kinds.F90"
......@@ -240,44 +242,44 @@ subroutine trans_ev_&
!&MATH_DATATYPE&
!&", "hvm1", istat, errorMessage)
num = (max_local_rows*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(hvm1_host,num)
successCUDA = gpu_malloc_host(hvm1_host,num)
check_alloc_cuda("trans_ev: hvm1_host", successCUDA)
call c_f_pointer(hvm1_host,hvm1,(/(max_local_rows*max_stored_rows)/))
num = (max_stored_rows*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(tmat_host,num)
successCUDA = gpu_malloc_host(tmat_host,num)
check_alloc_cuda("trans_ev: tmat_host", successCUDA)
call c_f_pointer(tmat_host,tmat,(/max_stored_rows,max_stored_rows/))
num = (max_local_cols*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(tmp1_host,num)
successCUDA = gpu_malloc_host(tmp1_host,num)
check_alloc_cuda("trans_ev: tmp1_host", successCUDA)
call c_f_pointer(tmp1_host,tmp1,(/(max_local_cols*max_stored_rows)/))
num = (max_local_cols*max_stored_rows) * size_of_datatype
successCUDA = cuda_malloc_host(tmp2_host,num)
successCUDA = gpu_malloc_host(tmp2_host,num)
check_alloc_cuda("trans_ev: tmp2_host", successCUDA)
call c_f_pointer(tmp2_host,tmp2,(/(max_local_cols*max_stored_rows)/))
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
successCUDA = gpu_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
successCUDA = gpu_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
successCUDA = gpu_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successCUDA)
num = ldq * matrixCols * size_of_datatype
successCUDA = cuda_malloc(q_dev, num)
successCUDA = gpu_malloc(q_dev, num)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_host_register(int(loc(q_mat),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
successCUDA = gpu_host_register(int(loc(q_mat),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("trans_ev: q_mat", successCUDA)
successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
num, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
num, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -385,14 +387,14 @@ subroutine trans_ev_&
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
hvm_ubnd * nstor * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
max_stored_rows * max_stored_rows * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -401,7 +403,7 @@ subroutine trans_ev_&
if (l_rows>0) then
if (useGPU) then
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, &
q_dev, ldq, ZERO, tmp_dev, nstor)
call obj%timer%stop("cublas")
......@@ -419,7 +421,7 @@ subroutine trans_ev_&
else !l_rows>0
if (useGPU) then
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
successCUDA = gpu_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
check_memcpy_cuda("trans_ev", successCUDA)
else
tmp1(1:l_cols*nstor) = 0
......@@ -430,8 +432,8 @@ subroutine trans_ev_&
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't be a part sufficient?
if (useGPU) then
successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = gpu_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
call obj%timer%start("mpi_communication")
......@@ -440,8 +442,8 @@ subroutine trans_ev_&
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
max_local_cols * max_stored_rows * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -453,11 +455,11 @@ subroutine trans_ev_&
if (l_rows>0) then
if (useGPU) then
call obj%timer%start("cublas")
call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', &
call gpublas_PRECISION_TRMM('L', 'L', 'N', 'N', &
nstor, l_cols, ONE, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
call gpublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
-ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, &
ONE, q_dev, ldq)
call obj%timer%stop("cublas")
......@@ -496,26 +498,26 @@ subroutine trans_ev_&
if (useGPU) then
!q_mat = q_dev
successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = gpu_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
q_dev, ldq * matrixCols * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
successCUDA = cuda_host_unregister(int(loc(q_mat),kind=c_intptr_t))
successCUDA = gpu_host_unregister(int(loc(q_mat),kind=c_intptr_t))
check_host_unregister_cuda("trans_ev: q_mat", successCUDA)
successCUDA = cuda_free_host(hvm1_host)
successCUDA = gpu_free_host(hvm1_host)
check_host_dealloc_cuda("trans_ev: hvm1_host", successCUDA)
nullify(hvm1)
successCUDA = cuda_free_host(tmat_host)
successCUDA = gpu_free_host(tmat_host)
check_host_dealloc_cuda("trans_ev: tmat_host", successCUDA)
nullify(tmat)
successCUDA = cuda_free_host(tmp1_host)
successCUDA = gpu_free_host(tmp1_host)
check_host_dealloc_cuda("trans_ev: tmp1_host", successCUDA)
nullify(tmp1)
successCUDA = cuda_free_host(tmp2_host)
successCUDA = gpu_free_host(tmp2_host)
check_host_dealloc_cuda("trans_ev: tmp2_host", successCUDA)
nullify(tmp2)
......@@ -528,16 +530,16 @@ subroutine trans_ev_&
!endif
!deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
successCUDA = cuda_free(q_dev)
successCUDA = gpu_free(q_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(tmp_dev)
successCUDA = gpu_free(tmp_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(hvm_dev)
successCUDA = gpu_free(hvm_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(tmat_dev)
successCUDA = gpu_free(tmat_dev)
check_dealloc_cuda("trans_ev", successCUDA)
else
deallocate(tmat, tmp1, tmp2, stat=istat, errmsg=errorMessage)
......
......@@ -60,6 +60,8 @@
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
use elpa_gpu
use mod_check_for_gpu
use elpa_blas_interfaces
implicit none
......
......@@ -107,6 +107,8 @@ last_stripe_width, kernel)
use cuda_c_kernel
use cuda_functions
use hip_functions
use elpa_gpu
use elpa_generated_fortran_interfaces
......
......@@ -101,6 +101,8 @@ max_threads)
!-------------------------------------------------------------------------------
use cuda_functions
use hip_functions
use elpa_gpu
use, intrinsic :: iso_c_binding
use elpa1_compute
#ifdef WITH_OPENMP_TRADITIONAL
......
......@@ -80,6 +80,8 @@
use elpa2_compute
use elpa_mpi
use cuda_functions
use hip_functions
use elpa_gpu
use mod_check_for_gpu
use elpa_omp
#ifdef HAVE_HETEROGENOUS_CLUSTER_SUPPORT
......
......@@ -99,6 +99,8 @@ subroutine trans_ev_band_to_full_&
!-------------------------------------------------------------------------------
use precision
use cuda_functions
use hip_functions
use elpa_gpu
use, intrinsic :: iso_c_binding
use elpa_abstract_impl
use elpa_blas_interfaces
......
......@@ -92,6 +92,8 @@ subroutine trans_ev_tridi_to_band_&
use pack_unpack_gpu
use compute_hh_trafo
use cuda_functions
use hip_functions
use elpa_gpu
use precision
use, intrinsic :: iso_c_binding
#ifdef WITH_OPENMP_TRADITIONAL
......
......@@ -52,6 +52,8 @@ subroutine pack_row_group_&
rows, n_offset, row_count)
use cuda_c_kernel
use cuda_functions
use hip_functions
use elpa_gpu
use precision
use, intrinsic :: iso_c_binding
implicit none
......@@ -112,6 +114,8 @@ end subroutine
use precision
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
use elpa_gpu
implicit none
integer(kind=c_intptr_t) :: row_group_dev, a_dev
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
......
......@@ -59,6 +59,8 @@ subroutine redist_band_&
use precision
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
use elpa_gpu
use elpa_utilities, only : local_index, check_allocate_f, check_deallocate_f
use elpa_mpi
implicit none
......
......@@ -69,6 +69,8 @@ module matrix_plot
subroutine prmat(na, useGpu, a_mat, a_dev, lda, matrixCols, nblk, my_prow, my_pcol, np_rows, np_cols, name, iteration)
use cuda_functions
use hip_functions
use elpa_gpu
use, intrinsic :: iso_c_binding
use precision
implicit none
......
......@@ -60,6 +60,8 @@
(obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads)
use cuda_functions
use hip_functions
use elpa_gpu
use, intrinsic :: iso_c_binding
use precision
use elpa_abstract_impl
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment