Commit 3e3b9706 authored by Andreas Marek's avatar Andreas Marek
Browse files

Start to implement vendor agnostic layer in elpa1_tridiag

parent ebc9ed9b
......@@ -45,6 +45,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa2/elpa2_compute.F90 \
src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
src/GPU/check_for_gpu.F90 \
src/GPU/mod_vendor_agnostic_layer.F90 \
src/GPU/CUDA/mod_cuda.F90 \
src/GPU/ROCm/mod_hip.F90 \
src/elpa2/GPU/CUDA/interface_c_kernel.F90 \
......
......@@ -268,7 +268,7 @@ module cuda_functions
implicit none
integer(kind=C_intptr_T) :: a
integer(kind=c_intptr_t), intent(in), value :: width_height
integer(kind=c_intptr_t), intent(in), value :: width_height
integer(kind=C_INT) :: istat
end function cuda_malloc_c
......@@ -635,8 +635,8 @@ module cuda_functions
use, intrinsic :: iso_c_binding
implicit none
integer(kind=C_intptr_t) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
integer(kind=c_intptr_t) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
logical :: success
#ifdef WITH_NVIDIA_GPU_VERSION
success = cuda_malloc_c(a, width_height) /= 0
......@@ -664,8 +664,8 @@ module cuda_functions
use, intrinsic :: iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
logical :: success
#ifdef WITH_NVIDIA_GPU_VERSION
success = cuda_malloc_host_c(a, width_height) /= 0
......
......@@ -820,7 +820,7 @@ module hip_functions
#endif
end function hip_memcpy2d
function cuda_hip_register(a, size, flag) result(success)
function hip_host_register(a, size, flag) result(success)
use, intrinsic :: iso_c_binding
......@@ -837,7 +837,7 @@ module hip_functions
#endif
end function
function cuda_hip_unregister(a) result(success)
function hip_host_unregister(a) result(success)
use, intrinsic :: iso_c_binding
......
......@@ -50,6 +50,7 @@ module mod_check_for_gpu
function check_for_gpu(obj, myid, numberOfDevices, wantDebug) result(gpuAvailable)
use cuda_functions
use hip_functions
use precision
use elpa_mpi
use elpa_abstract_impl
......@@ -109,19 +110,38 @@ module mod_check_for_gpu
endif
endif
#ifdef WITH_NVIDIA_GPU_VERSION
success = cuda_setdevice(use_gpu_id)
#endif
#ifdef WITH_AMD_GPU_VERSION
success = hip_setdevice(use_gpu_id)
#endif
if (.not.(success)) then
#ifdef WITH_NVIDIA_GPU_VERSION
print *,"Cannot set CudaDevice"
#endif
#ifdef WITH_AMD_GPU_VERSION
print *,"Cannot set HIPDevice"
#endif
stop 1
endif
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif
#ifdef WITH_NVIDIA_GPU_VERSION
success = cublas_create(cublasHandle)
#endif
#ifdef WITH_AMD_GPU_VERSION
success = rocblas_create(cublasHandle)
#endif
if (.not.(success)) then
#ifdef WITH_NVIDIA_GPU_VERSION
print *,"Cannot create cublas handle"
#endif
#ifdef WITH_AMD_GPU_VERSION
print *,"Cannot create rocblas handle"
#endif
stop 1
endif
else
......@@ -139,11 +159,21 @@ module mod_check_for_gpu
endif
endif
#ifdef WITH_NVIDIA_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices)
#endif
#ifdef WITH_AMD_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = hip_getdevicecount(numberOfDevices)
#endif
if (.not.(success)) then
#ifdef WITH_NVIDIA_GPU_VERSION
print *,"error in cuda_getdevicecount"
#endif
#ifdef WITH_AMD_GPU_VERSION
print *,"error in hip_getdevicecount"
#endif
stop 1
endif
......@@ -171,19 +201,39 @@ module mod_check_for_gpu
endif
deviceNumber = mod(myid, numberOfDevices)
#ifdef WITH_NVIDIA_GPU_VERSION
success = cuda_setdevice(deviceNumber)
#endif
#ifdef WITH_AMD_GPU_VERSION
success = hip_setdevice(deviceNumber)
#endif
if (.not.(success)) then
#ifdef WITH_NVIDIA_GPU_VERSION
print *,"Cannot set CudaDevice"
#endif
#ifdef WITH_AMD_GPU_VERSION
print *,"Cannot set hipDevice"
#endif
stop 1
endif
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif
#ifdef WITH_NVIDIA_GPU_VERSION
success = cublas_create(cublasHandle)
#endif
#ifdef WITH_AMD_GPU_VERSION
success = rocblas_create(cublasHandle)
#endif
if (.not.(success)) then
#ifdef WITH_NVIDIA_GPU_VERSION
print *,"Cannot create cublas handle"
#endif
#ifdef WITH_AMD_GPU_VERSION
print *,"Cannot create rocblas handle"
#endif
stop 1
endif
......
#include "config-f90.h"

!> elpa_gpu: vendor-agnostic GPU abstraction layer.
!>
!> Wraps the NVIDIA CUDA bindings (module cuda_functions) and the AMD ROCm/HIP
!> bindings (module hip_functions) behind a single set of gpu_* entry points.
!> Dispatch happens at runtime on the module variable use_gpu_vendor, which is
!> set by gpu_vendor() according to the compile-time configuration macros.
module elpa_gpu
  use iso_c_binding
  implicit none

  !> Vendor identifiers returned by gpu_vendor()
  integer(kind=c_int), parameter :: nvidia_gpu = 1
  integer(kind=c_int), parameter :: amd_gpu    = 2
  integer(kind=c_int), parameter :: no_gpu     = -1

  !> Selected vendor; written by gpu_vendor(), read by every gpu_* wrapper
  integer(kind=c_int) :: use_gpu_vendor

  !> Vendor-agnostic runtime constants; filled in by set_gpu_parameters()
  !> from the corresponding CUDA or HIP values
  integer(kind=c_int) :: gpuHostRegisterDefault
  integer(kind=c_int) :: gpuMemcpyHostToDevice
  integer(kind=c_int) :: gpuMemcpyDeviceToHost
  integer(kind=c_int) :: gpuMemcpyDeviceToDevice
  integer(kind=c_int) :: gpuHostRegisterMapped
  integer(kind=c_int) :: gpuHostRegisterPortable

  contains

    !> Determine which GPU vendor this build supports and record it in
    !> use_gpu_vendor.
    !> @return nvidia_gpu, amd_gpu, or no_gpu (no GPU support configured).
    !> Note: if both WITH_NVIDIA_GPU_VERSION and WITH_AMD_GPU_VERSION were
    !> defined, AMD would win (same precedence as the original code).
    function gpu_vendor() result(vendor)
      use precision
      implicit none
      integer(kind=c_int) :: vendor

      ! default: no GPU support compiled in
      vendor = no_gpu
#ifdef WITH_NVIDIA_GPU_VERSION
      vendor = nvidia_gpu
#endif
#ifdef WITH_AMD_GPU_VERSION
      vendor = amd_gpu
#endif
      use_gpu_vendor = vendor
    end function gpu_vendor

    !> Populate the vendor-agnostic gpu* runtime constants (memcpy directions,
    !> host-register flags) from the selected vendor's runtime.
    !> Must be called after gpu_vendor(); if use_gpu_vendor is no_gpu the
    !> gpu* constants are left unset.
    subroutine set_gpu_parameters
      use cuda_functions
      use hip_functions
      implicit none

      if (use_gpu_vendor == nvidia_gpu) then
        cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
        gpuMemcpyHostToDevice    = cudaMemcpyHostToDevice
        cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
        gpuMemcpyDeviceToHost    = cudaMemcpyDeviceToHost
        cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
        gpuMemcpyDeviceToDevice  = cudaMemcpyDeviceToDevice
        cudaHostRegisterPortable = cuda_hostRegisterPortable()
        gpuHostRegisterPortable  = cudaHostRegisterPortable
        cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        gpuHostRegisterMapped    = cudaHostRegisterMapped
        cudaHostRegisterDefault  = cuda_hostRegisterDefault()
        gpuHostRegisterDefault   = cudaHostRegisterDefault
      endif

      if (use_gpu_vendor == amd_gpu) then
        hipMemcpyHostToDevice   = hip_memcpyHostToDevice()
        gpuMemcpyHostToDevice   = hipMemcpyHostToDevice
        hipMemcpyDeviceToHost   = hip_memcpyDeviceToHost()
        gpuMemcpyDeviceToHost   = hipMemcpyDeviceToHost
        hipMemcpyDeviceToDevice = hip_memcpyDeviceToDevice()
        gpuMemcpyDeviceToDevice = hipMemcpyDeviceToDevice
        hipHostRegisterPortable = hip_hostRegisterPortable()
        gpuHostRegisterPortable = hipHostRegisterPortable
        hipHostRegisterMapped   = hip_hostRegisterMapped()
        gpuHostRegisterMapped   = hipHostRegisterMapped
        hipHostRegisterDefault  = hip_hostRegisterDefault()
        gpuHostRegisterDefault  = hipHostRegisterDefault
      endif
    end subroutine set_gpu_parameters

    !> Allocate page-locked (pinned) host memory.
    !> @param array    receives the host pointer
    !> @param elements allocation size in bytes
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_malloc_host(array, elements) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      type(c_ptr)                          :: array
      integer(kind=c_intptr_t), intent(in) :: elements
      logical                              :: success

      ! defined result even when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_malloc_host(array, elements)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_host_malloc(array, elements)
      endif
    end function gpu_malloc_host

    !> Allocate device memory.
    !> @param array    receives the device address
    !> @param elements allocation size in bytes
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_malloc(array, elements) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t)             :: array
      integer(kind=c_intptr_t), intent(in) :: elements
      logical                              :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_malloc(array, elements)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_malloc(array, elements)
      endif
    end function gpu_malloc

    !> Page-lock (register) an existing host allocation for fast transfers.
    !> @param array    host address of the buffer
    !> @param elements buffer size in bytes
    !> @param flag     vendor register flag (e.g. gpuHostRegisterDefault)
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_host_register(array, elements, flag) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t)             :: array
      integer(kind=c_intptr_t), intent(in) :: elements
      integer(kind=c_int), intent(in)      :: flag
      logical                              :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_host_register(array, elements, flag)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_host_register(array, elements, flag)
      endif
    end function gpu_host_register

    !> Copy memory between host and device.
    !> @param dst  destination address
    !> @param src  source address
    !> @param size number of bytes
    !> @param dir  direction constant (gpuMemcpyHostToDevice, ...)
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_memcpy(dst, src, size, dir) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t)             :: dst
      integer(kind=c_intptr_t)             :: src
      integer(kind=c_intptr_t), intent(in) :: size
      integer(kind=c_int), intent(in)      :: dir
      logical                              :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_memcpy(dst, src, size, dir)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_memcpy(dst, src, size, dir)
      endif
    end function gpu_memcpy

    !> Fill device memory with a byte value.
    !> @param a    device address
    !> @param val  byte value to store
    !> @param size number of bytes
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_memset(a, val, size) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t)             :: a
      integer(kind=ik), intent(in)         :: val
      integer(kind=c_intptr_t), intent(in) :: size
      logical                              :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_memset(a, val, size)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_memset(a, val, size)
      endif
    end function gpu_memset

    !> Free device memory allocated with gpu_malloc.
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_free(a) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t) :: a
      logical                  :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_free(a)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_free(a)
      endif
    end function gpu_free

    !> Free pinned host memory allocated with gpu_malloc_host.
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_free_host(a) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      type(c_ptr), value :: a
      logical            :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_free_host(a)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_host_free(a)
      endif
    end function gpu_free_host

    !> Un-register a host buffer previously pinned with gpu_host_register.
    !> @return .true. on success; .false. on failure or if no vendor selected
    function gpu_host_unregister(a) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t) :: a
      logical                  :: success

      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_host_unregister(a)
      else if (use_gpu_vendor == amd_gpu) then
        success = hip_host_unregister(a)
      endif
    end function gpu_host_unregister

end module elpa_gpu
......@@ -73,6 +73,8 @@ function elpa_solve_evp_&
#endif
use precision
use cuda_functions
use hip_functions
use elpa_gpu
use mod_check_for_gpu
use, intrinsic :: iso_c_binding
use elpa_abstract_impl
......@@ -294,11 +296,22 @@ function elpa_solve_evp_&
obj%eigenvalues_only = .true.
endif
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu. Aborting..."
stop
if (gpu_vendor() == NVIDIA_GPU) then
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else if (gpu_vendor() == AMD_GPU) then
call obj%get("amd-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else
gpu = 0
endif
if (gpu .eq. 1) then
useGPU =.true.
else
......@@ -344,11 +357,7 @@ function elpa_solve_evp_&
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
call set_gpu_parameters()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
......
......@@ -104,6 +104,7 @@ subroutine tridiag_&
use matrix_plot
use elpa_omp
use elpa_blas_interfaces
use elpa_gpu
implicit none
#include "../general/precision_kinds.F90"
......@@ -289,33 +290,33 @@ subroutine tridiag_&
if (useGPU) then
num = (max_local_rows+1) * size_of_datatype
successCUDA = cuda_malloc_host(v_row_host,num)
successCUDA = gpu_malloc_host(v_row_host, num)
check_host_alloc_cuda("tridiag: v_row_host", successCUDA)
call c_f_pointer(v_row_host,v_row,(/(max_local_rows+1)/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(v_col_host,num)
successCUDA = gpu_malloc_host(v_col_host,num)
check_host_alloc_cuda("tridiag: v_col_host", successCUDA)
call c_f_pointer(v_col_host,v_col,(/(max_local_cols)/))
num = (max_local_cols) * size_of_datatype
successCUDA = cuda_malloc_host(u_col_host,num)
successCUDA = gpu_malloc_host(u_col_host,num)
check_host_alloc_cuda("tridiag: u_col_host", successCUDA)
call c_f_pointer(u_col_host,u_col,(/(max_local_cols)/))
num = (max_local_rows) * size_of_datatype
successCUDA = cuda_malloc_host(u_row_host,num)
successCUDA = gpu_malloc_host(u_row_host,num)
check_host_alloc_cuda("tridiag: u_row_host", successCUDA)
call c_f_pointer(u_row_host,u_row,(/(max_local_rows)/))
num = (max_local_rows * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
successCUDA = gpu_host_register(int(loc(vu_stored_rows),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: vu_stored_rows", successCUDA)
num = (max_local_cols * 2*max_stored_uv) * size_of_datatype
successCUDA = cuda_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
successCUDA = gpu_host_register(int(loc(uv_stored_cols),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: uv_stored_cols", successCUDA)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
......@@ -323,8 +324,8 @@ subroutine tridiag_&
#else
num = na * 4
#endif
successCUDA = cuda_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
successCUDA = gpu_host_register(int(loc(e_vec),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: e_vec", successCUDA)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
......@@ -332,8 +333,8 @@ subroutine tridiag_&
#else
num = na * 4
#endif
successCUDA = cuda_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
successCUDA = gpu_host_register(int(loc(d_vec),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: d_vec", successCUDA)
else
......@@ -372,23 +373,23 @@ subroutine tridiag_&
u_col = 0
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
successCUDA = gpu_malloc(v_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: v_row_dev", successCUDA)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
successCUDA = gpu_malloc(u_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
successCUDA = gpu_malloc(v_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
successCUDA = gpu_malloc(u_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
successCUDA = gpu_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
successCUDA = gpu_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
endif !useGPU
......@@ -415,15 +416,15 @@ subroutine tridiag_&
num = matrixRows * matrixCols * size_of_datatype
successCUDA = cuda_malloc(a_dev, num)
successCUDA = gpu_malloc(a_dev, num)
check_alloc_cuda("tridiag: a_dev", successCUDA)
successCUDA = cuda_host_register(int(loc(a_mat),kind=c_intptr_t),num,&
cudaHostRegisterDefault)
successCUDA = gpu_host_register(int(loc(a_mat),kind=c_intptr_t),num,&
gpuHostRegisterDefault)
check_host_register_cuda("tridiag: a_mat", successCUDA)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
num, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
num, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev", successCUDA)
endif
......@@ -450,8 +451,8 @@ subroutine tridiag_&
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(int(loc(v_row),kind=c_intptr_t), &
a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = gpu_memcpy(int(loc(v_row),kind=c_intptr_t), &
a_dev + a_offset, (l_rows)* size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
......@@ -563,19 +564,19 @@ subroutine tridiag_&
u_row(1:l_rows) = 0
if (l_rows > 0 .and. l_cols> 0 ) then
if (useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
successCUDA = gpu_memset(u_col_dev, 0, l_cols * size_of_datatype)
check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
successCUDA = gpu_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
l_cols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = gpu_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
l_cols * size_of_datatype, gpuMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successCUDA)