Commit 5466e576 authored by Andreas Marek's avatar Andreas Marek
Browse files

Some GPU functions with type(c_ptr) interface

parent 685e9338
......@@ -218,7 +218,7 @@ extern "C" {
}
int cudaMemcpy2dFromC(intptr_t *dest, size_t dpitch, intptr_t *src, size_t spitch, size_t width, size_t height, int dir) {
cudaError_t cuerr = cudaMemcpy2D( dest, dpitch, src, spitch, width, height, (cudaMemcpyKind)dir);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemcpy2d: %s\n",cudaGetErrorString(cuerr));
......@@ -435,6 +435,32 @@ extern "C" {
// todo: it provides out-of-place (and apparently more efficient) implementation
// todo: by passing B twice (in place of C as well), we should fall back to in-place algorithm
void cublasDcopy_elpa_wrapper (intptr_t handle, int n, double *x, int incx, double *y, int incy){
cublasDcopy(*((cublasHandle_t*)handle), n, x, incx, y, incy);
}
void cublasScopy_elpa_wrapper (intptr_t handle, int n, float *x, int incx, float *y, int incy){
cublasScopy(*((cublasHandle_t*)handle), n, x, incx, y, incy);
}
void cublasZcopy_elpa_wrapper (intptr_t handle, int n, double _Complex *x, int incx, double _Complex *y, int incy){
const cuDoubleComplex* X_casted = (const cuDoubleComplex*) x;
cuDoubleComplex* Y_casted = ( cuDoubleComplex*) y;
cublasZcopy(*((cublasHandle_t*)handle), n, X_casted, incx, Y_casted, incy);
}
void cublasCcopy_elpa_wrapper (intptr_t handle, int n, float _Complex *x, int incx, float _Complex *y, int incy){
const cuFloatComplex* X_casted = (const cuFloatComplex*) x;
cuFloatComplex* Y_casted = ( cuFloatComplex*) y;
cublasCcopy(*((cublasHandle_t*)handle), n, X_casted, incx, Y_casted, incy);
}
void cublasDtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
int m, int n, double alpha, const double *A,
int lda, double *B, int ldb){
......
This diff is collapsed.
This diff is collapsed.
......@@ -436,6 +436,32 @@ extern "C" {
// todo: it provides out-of-place (and apparently more efficient) implementation
// todo: by passing B twice (in place of C as well), we should fall back to in-place algorithm
void rocblasDcopy_elpa_wrapper (intptr_t handle, int n, double *x, int incx, double *y, int incy){
rocblas_dcopy(*((rocblas_handle*)handle), n, x, incx, y, incy);
}
void rocblasScopy_elpa_wrapper (intptr_t handle, int n, float *x, int incx, float *y, int incy){
rocblas_scopy(*((rocblas_handle*)handle), n, x, incx, y, incy);
}
void rocblasZcopy_elpa_wrapper (intptr_t handle, int n, double _Complex *x, int incx, double _Complex *y, int incy){
const rocblas_double_complex* X_casted = (const rocblas_double_complex*) x;
rocblas_double_complex* Y_casted = (rocblas_double_complex*) y;
rocblas_zcopy(*((rocblas_handle*)handle), n, X_casted, incx, Y_casted, incy);
}
void rocblasCcopy_elpa_wrapper (intptr_t handle, int n, float _Complex *x, int incx, float _Complex *y, int incy){
const rocblas_float_complex* X_casted = (const rocblas_float_complex*) x;
rocblas_float_complex* Y_casted = ( rocblas_float_complex*) y;
rocblas_ccopy(*((rocblas_handle*)handle), n, X_casted, incx, Y_casted, incy);
}
void rocblas_dtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
int m, int n, double alpha, const double *A,
int lda, double *B, int ldb){
......
......@@ -75,6 +75,58 @@ module elpa_gpu
#ifdef WANT_SINGLE_PRECISION_COMPLEX
integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
#endif
interface gpu_memcpy
module procedure gpu_memcpy_intptr
module procedure gpu_memcpy_cptr
module procedure gpu_memcpy_mixed
end interface
interface gpu_memcpy2d
module procedure gpu_memcpy2d_intptr
module procedure gpu_memcpy2d_cptr
end interface
interface gpublas_dcopy
module procedure gpublas_dcopy_intptr
module procedure gpublas_dcopy_cptr
end interface
interface gpublas_scopy
module procedure gpublas_scopy_intptr
module procedure gpublas_scopy_cptr
end interface
interface gpublas_zcopy
module procedure gpublas_zcopy_intptr
module procedure gpublas_zcopy_cptr
end interface
interface gpublas_ccopy
module procedure gpublas_ccopy_intptr
module procedure gpublas_ccopy_cptr
end interface
interface gpublas_dtrmm
module procedure gpublas_dtrmm_intptr
module procedure gpublas_dtrmm_cptr
end interface
interface gpublas_strmm
module procedure gpublas_strmm_intptr
module procedure gpublas_strmm_cptr
end interface
interface gpublas_ztrmm
module procedure gpublas_ztrmm_intptr
module procedure gpublas_ztrmm_cptr
end interface
interface gpublas_ctrmm
module procedure gpublas_ctrmm_intptr
module procedure gpublas_ctrmm_cptr
end interface
contains
function gpu_vendor() result(vendor)
use precision
......@@ -132,6 +184,23 @@ module elpa_gpu
end subroutine
function gpu_devicesynchronize() result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
logical :: success
if (use_gpu_vendor == nvidia_gpu) then
success = cuda_devicesynchronize()
endif
if (use_gpu_vendor == amd_gpu) then
success = hip_devicesynchronize()
endif
end function
function gpu_malloc_host(array, elements) result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
......@@ -190,7 +259,7 @@ module elpa_gpu
end function
function gpu_memcpy(dst, src, size, dir) result(success)
function gpu_memcpy_intptr(dst, src, size, dir) result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
......@@ -202,11 +271,53 @@ module elpa_gpu
logical :: success
if (use_gpu_vendor == nvidia_gpu) then
success = cuda_memcpy(dst, src, size, dir)
success = cuda_memcpy_intptr(dst, src, size, dir)
endif
if (use_gpu_vendor == amd_gpu) then
!success = hip_memcpy_intptr(dst, src, size, dir)
endif
end function
function gpu_memcpy_cptr(dst, src, size, dir) result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
type(c_ptr) :: dst
type(c_ptr) :: src
integer(kind=c_intptr_t), intent(in) :: size
integer(kind=C_INT), intent(in) :: dir
logical :: success
if (use_gpu_vendor == nvidia_gpu) then
success = cuda_memcpy_cptr(dst, src, size, dir)
endif
if (use_gpu_vendor == amd_gpu) then
!success = hip_memcpy_cptr(dst, src, size, dir)
endif
end function
function gpu_memcpy_mixed(dst, src, size, dir) result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
type(c_ptr) :: dst
integer(kind=C_intptr_t) :: src
integer(kind=c_intptr_t), intent(in) :: size
integer(kind=C_INT), intent(in) :: dir
logical :: success
if (use_gpu_vendor == nvidia_gpu) then
success = cuda_memcpy_mixed(dst, src, size, dir)
endif
if (use_gpu_vendor == amd_gpu) then
success = hip_memcpy(dst, src, size, dir)
!success = hip_memcpy_cptr(dst, src, size, dir)
endif
end function
......@@ -292,7 +403,7 @@ module elpa_gpu
end function
function gpu_memcpy2d(dst, dpitch, src, spitch, width, height , dir) result(success)
function gpu_memcpy2d_intptr(dst, dpitch, src, spitch, width, height , dir) result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
......@@ -310,11 +421,37 @@ module elpa_gpu
logical :: success
if (use_gpu_vendor == nvidia_gpu) then
success = cuda_memcpy2d(dst, dpitch, src, spitch, width, height , dir)
success = cuda_memcpy2d_intptr(dst, dpitch, src, spitch, width, height , dir)
endif
if (use_gpu_vendor == amd_gpu) then
success = hip_memcpy2d(dst, dpitch, src, spitch, width, height , dir)
!success = hip_memcpy2d(dst, dpitch, src, spitch, width, height , dir)
endif
end function
function gpu_memcpy2d_cptr(dst, dpitch, src, spitch, width, height , dir) result(success)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
type(c_ptr) :: dst
integer(kind=c_intptr_t), intent(in) :: dpitch
type(c_ptr) :: src
integer(kind=c_intptr_t), intent(in) :: spitch
integer(kind=c_intptr_t), intent(in) :: width
integer(kind=c_intptr_t), intent(in) :: height
integer(kind=C_INT), intent(in) :: dir
logical :: success
if (use_gpu_vendor == nvidia_gpu) then
success = cuda_memcpy2d_cptr(dst, dpitch, src, spitch, width, height , dir)
endif
if (use_gpu_vendor == amd_gpu) then
!success = hip_memcpy2d(dst, dpitch, src, spitch, width, height , dir)
endif
end function
......@@ -496,8 +633,177 @@ module elpa_gpu
endif
end subroutine
subroutine gpublas_dcopy_intptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
integer(kind=C_intptr_T) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_dcopy_intptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_dcopy_intptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_dcopy_cptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
type(c_ptr) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_dcopy_cptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_dcopy_cptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_scopy_intptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
integer(kind=C_intptr_T) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_scopy_intptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_dcopy_intptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_scopy_cptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
type(c_ptr) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_scopy_cptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_scopy_cptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_zcopy_intptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
integer(kind=C_intptr_T) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_zcopy_intptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_zcopy_intptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_zcopy_cptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
type(c_ptr) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_zcopy_cptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_dcopy_cptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_ccopy_intptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
integer(kind=C_intptr_T) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_ccopy_intptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_ccopy_intptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_ccopy_cptr(n, x, incx, y, incy)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx, incy
type(c_ptr) :: x, y
if (use_gpu_vendor == nvidia_gpu) then
call cublas_ccopy_cptr(n, x, incx, y, incy)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_ccopy_cptr(n, x, incx, y, incy)
endif
end subroutine
subroutine gpublas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
subroutine gpublas_dtrmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
use, intrinsic :: iso_c_binding
use cuda_functions
......@@ -511,17 +817,41 @@ module elpa_gpu
integer(kind=C_intptr_T) :: a, b
if (use_gpu_vendor == nvidia_gpu) then
call cublas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
call cublas_dtrmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_dtrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
call rocblas_dtrmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
end subroutine
subroutine gpublas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
subroutine gpublas_dtrmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
character(1,C_CHAR),value :: side, uplo, trans, diag
integer(kind=C_INT) :: m,n
integer(kind=C_INT), intent(in) :: lda,ldb
real(kind=C_DOUBLE) :: alpha
type(c_ptr) :: a, b
if (use_gpu_vendor == nvidia_gpu) then
call cublas_dtrmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_dtrmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
end subroutine
subroutine gpublas_strmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
use, intrinsic :: iso_c_binding
use cuda_functions
......@@ -535,17 +865,41 @@ module elpa_gpu
integer(kind=C_intptr_T) :: a, b
if (use_gpu_vendor == nvidia_gpu) then
call cublas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
call cublas_strmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_strmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
call rocblas_strmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
end subroutine
subroutine gpublas_strmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
use, intrinsic :: iso_c_binding
use cuda_functions
use hip_functions
implicit none
character(1,C_CHAR),value :: side, uplo, trans, diag
integer(kind=C_INT) :: m,n
integer(kind=C_INT), intent(in) :: lda,ldb
real(kind=C_FLOAT) :: alpha
type(c_ptr) :: a, b
if (use_gpu_vendor == nvidia_gpu) then
call cublas_strmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_strmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
subroutine gpublas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
end subroutine
subroutine gpublas_ztrmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
use, intrinsic :: iso_c_binding
use cuda_functions
......@@ -559,16 +913,17 @@ module elpa_gpu
integer(kind=C_intptr_T) :: a, b
if (use_gpu_vendor == nvidia_gpu) then
call cublas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
call cublas_ztrmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_ztrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
call rocblas_ztrmm_intptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
end subroutine
subroutine gpublas_ctrmm(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
subroutine gpublas_ztrmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
use, intrinsic :: iso_c_binding
use cuda_functions
......@@ -578,15 +933,62 @@ module elpa_gpu
character(1,C_CHAR),value :: side, uplo, trans, diag
integer(kind=C_INT) :: m,n
integer(kind=C_INT), intent(in) :: lda,ldb
complex(kind=C_FLOAT_COMPLEX) :: alpha
complex(kind=C_DOUBLE_COMPLEX) :: alpha
type(c_ptr) :: a, b
if (use_gpu_vendor == nvidia_gpu) then
call cublas_ztrmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
if (use_gpu_vendor == amd_gpu) then
call rocblas_ztrmm_cptr(side, uplo, trans, diag, m, n, alpha, a, lda, b, ldb)
endif
end subroutine
subroutine gpublas_ctrmm_intptr(side<