Commit 6e5c03a6 authored by Wenzhe Yu's avatar Wenzhe Yu 😎
Browse files

Extend CUDA wrapper

* cudaMallocHost
* cudaFreeHost
* cudaHostRegister
* cudaHostUnregister
parent 6cd5a4f1
......@@ -71,7 +71,7 @@
#ifdef WITH_GPU_VERSION
extern "C" {
int cublasCreateFromC(intptr_t *cublas_handle) {
// printf("in c: %p\n", *cublas_handle);
*cublas_handle = (intptr_t) malloc(sizeof(cublasHandle_t));
......@@ -83,8 +83,8 @@ extern "C" {
}
else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
errormessage("Error in cublasCreate: %s\n", "the CUDA Runtime initialization failed");
return 0;
}
return 0;
}
else if (status == CUBLAS_STATUS_ALLOC_FAILED) {
errormessage("Error in cublasCreate: %s\n", "the resources could not be allocated");
return 0;
......@@ -92,7 +92,7 @@ extern "C" {
else{
errormessage("Error in cublasCreate: %s\n", "unknown error");
return 0;
}
}
}
int cublasDestroyFromC(intptr_t *cublas_handle) {
......@@ -104,12 +104,12 @@ extern "C" {
}
else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
errormessage("Error in cublasDestroy: %s\n", "the library has not been initialized");
return 0;
}
return 0;
}
else{
errormessage("Error in cublasCreate: %s\n", "unknown error");
return 0;
}
}
}
int cudaSetDeviceFromC(int n) {
......@@ -168,6 +168,31 @@ extern "C" {
return 1;
}
int cudaMallocHostFromC(intptr_t *a, size_t width_height) {
cudaError_t cuerr = cudaMallocHost((void **) a, width_height);
#ifdef DEBUG_CUDA
printf("MallocHost pointer address: %p \n", *a);
#endif
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMallocHost: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaFreeHostFromC(intptr_t *a) {
#ifdef DEBUG_CUDA
printf("FreeHost pointer address: %p \n", a);
#endif
cudaError_t cuerr = cudaFreeHost(a);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaFreeHost: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemsetFromC(intptr_t *a, int value, size_t count) {
cudaError_t cuerr = cudaMemset( a, value, count);
......@@ -197,6 +222,27 @@ extern "C" {
}
return 1;
}
int cudaHostRegisterFromC(intptr_t *a, int value, int flag) {
cudaError_t cuerr = cudaHostRegister( a, value, flag);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaHostRegister: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaHostUnregisterFromC(intptr_t *a) {
cudaError_t cuerr = cudaHostUnregister( a);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaHostUnregister: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpyDeviceToDeviceFromC(void) {
int val = cudaMemcpyDeviceToDevice;
return val;
......@@ -209,6 +255,10 @@ extern "C" {
int val = cudaMemcpyDeviceToHost;
return val;
}
int cudaHostRegisterDefaultFromC(void) {
int val = cudaHostRegisterDefault;
return val;
}
int cudaHostRegisterPortableFromC(void) {
int val = cudaHostRegisterPortable;
return val;
......@@ -217,7 +267,7 @@ extern "C" {
int val = cudaHostRegisterMapped;
return val;
}
cublasOperation_t operation_new_api(char trans) {
if (trans == 'N' || trans == 'n') {
return CUBLAS_OP_N;
......@@ -249,11 +299,11 @@ extern "C" {
return CUBLAS_FILL_MODE_LOWER;
}
}
cublasSideMode_t side_mode_new_api(char side) {
if (side == 'L' || side == 'l') {
return CUBLAS_SIDE_LEFT;
}
}
else if (side == 'R' || side == 'r') {
return CUBLAS_SIDE_RIGHT;
}
......@@ -263,7 +313,7 @@ extern "C" {
return CUBLAS_SIDE_LEFT;
}
}
cublasDiagType_t diag_type_new_api(char diag) {
if (diag == 'N' || diag == 'n') {
return CUBLAS_DIAG_NON_UNIT;
......@@ -277,62 +327,62 @@ extern "C" {
return CUBLAS_DIAG_NON_UNIT;
}
}
void cublasDgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy) {
cublasDgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
double beta, double *y, int incy) {
cublasDgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
}
void cublasSgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy) {
cublasSgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
float beta, float *y, int incy) {
cublasSgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha, A, lda, x, incx, &beta, y, incy);
}
void cublasZgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double _Complex alpha,
const double _Complex *A, int lda, const double _Complex *x, int incx,
double _Complex beta, double _Complex *y, int incy) {
double _Complex beta, double _Complex *y, int incy) {
cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta));
const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
const cuDoubleComplex* x_casted = (const cuDoubleComplex*) x;
cuDoubleComplex* y_casted = (cuDoubleComplex*) y;
cublasZgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
cublasZgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
}
void cublasCgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float _Complex alpha,
const float _Complex *A, int lda, const float _Complex *x, int incx,
float _Complex beta, float _Complex *y, int incy) {
float _Complex beta, float _Complex *y, int incy) {
cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta));
const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
const cuFloatComplex* x_casted = (const cuFloatComplex*) x;
cuFloatComplex* y_casted = (cuFloatComplex*) y;
cublasCgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
cublasCgemv(*((cublasHandle_t*)handle), operation_new_api(trans),
m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);
}
void cublasDgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc) {
cublasDgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasDgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
......@@ -340,8 +390,8 @@ extern "C" {
float alpha, const float *A, int lda,
const float *B, int ldb, float beta,
float *C, int ldc) {
cublasSgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasSgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
}
......@@ -349,15 +399,15 @@ extern "C" {
double _Complex alpha, const double _Complex *A, int lda,
const double _Complex *B, int ldb, double _Complex beta,
double _Complex *C, int ldc) {
cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta));
const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
const cuDoubleComplex* B_casted = (const cuDoubleComplex*) B;
cuDoubleComplex* C_casted = (cuDoubleComplex*) C;
cublasZgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasZgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
}
......@@ -365,28 +415,28 @@ extern "C" {
float _Complex alpha, const float _Complex *A, int lda,
const float _Complex *B, int ldb, float _Complex beta,
float _Complex *C, int ldc) {
cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta));
const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
const cuFloatComplex* B_casted = (const cuFloatComplex*) B;
cuFloatComplex* C_casted = (cuFloatComplex*) C;
cublasCgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
cublasCgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb),
m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
}
// todo: new CUBLAS API diverged from standard BLAS api for these functions
// todo: it provides out-of-place (and apparently more efficient) implementation
// todo: by passing B twice (in place of C as well), we should fall back to in-place algorithm
void cublasDtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
int m, int n, double alpha, const double *A,
int lda, double *B, int ldb){
cublasDtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cublasDtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb);
}
......@@ -394,7 +444,7 @@ extern "C" {
int m, int n, float alpha, const float *A,
int lda, float *B, int ldb){
cublasStrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cublasStrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb);
}
......@@ -403,11 +453,11 @@ extern "C" {
int lda, double _Complex *B, int ldb){
cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
cuDoubleComplex* B_casted = (cuDoubleComplex*) B;
cublasZtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cuDoubleComplex* B_casted = (cuDoubleComplex*) B;
cublasZtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb);
}
......@@ -416,14 +466,14 @@ extern "C" {
int lda, float _Complex *B, int ldb){
cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
cuFloatComplex* B_casted = (cuFloatComplex*) B;
cublasCtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
cuFloatComplex* B_casted = (cuFloatComplex*) B;
cublasCtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa),
diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb);
}
}
#endif /* WITH_GPU_VERSION */
......@@ -51,9 +51,10 @@ module cuda_functions
integer(kind=ik) :: cudaMemcpyHostToDevice
integer(kind=ik) :: cudaMemcpyDeviceToHost
integer(kind=ik) :: cudaMemcpyDeviceToDevice
integer(kind=ik) :: cudaHostRegisterDefault
integer(kind=ik) :: cudaHostRegisterPortable
integer(kind=ik) :: cudaHostRegisterMapped
integer(kind=ik) :: cudaMemcpyDeviceToDevice
! TODO global variable, has to be changed
integer(kind=C_intptr_T) :: cublasHandle = -1
......@@ -77,7 +78,7 @@ module cuda_functions
integer(kind=C_intptr_T) :: handle
integer(kind=C_INT) :: istat
end function cublas_create_c
end interface
end interface
interface
function cublas_destroy_c(handle) result(istat) &
......@@ -87,7 +88,7 @@ module cuda_functions
integer(kind=C_intptr_T) :: handle
integer(kind=C_INT) :: istat
end function cublas_destroy_c
end interface
end interface
interface
function cuda_setdevice_c(n) result(istat) &
......@@ -151,6 +152,15 @@ module cuda_functions
end function
end interface
interface
function cuda_hostRegisterDefault_c() result(flag) &
bind(C, name="cudaHostRegisterDefaultFromC")
use iso_c_binding
implicit none
integer(kind=c_int) :: flag
end function
end interface
interface
function cuda_hostRegisterPortable_c() result(flag) &
bind(C, name="cudaHostRegisterPortableFromC")
......@@ -205,6 +215,34 @@ module cuda_functions
end function cuda_memcpy2d_c
end interface
interface
function cuda_host_register_c(a, size, flag) result(istat) &
bind(C, name="cudaHostRegisterFromC")
use iso_c_binding
implicit none
integer(kind=C_intptr_t), value :: a
integer(kind=c_intptr_t), intent(in), value :: size
integer(kind=C_INT), intent(in), value :: flag
integer(kind=C_INT) :: istat
end function cuda_host_register_c
end interface
interface
function cuda_host_unregister_c(a) result(istat) &
bind(C, name="cudaHostUnregisterFromC")
use iso_c_binding
implicit none
integer(kind=C_intptr_t), value :: a
integer(kind=C_INT) :: istat
end function cuda_host_unregister_c
end interface
! functions to allocate and free CUDA memory
interface
......@@ -234,6 +272,33 @@ module cuda_functions
end function cuda_malloc_c
end interface
interface
function cuda_free_host_c(a) result(istat) &
bind(C, name="cudaFreeHostFromC")
use iso_c_binding
implicit none
type(c_ptr), value :: a
integer(kind=C_INT) :: istat
end function cuda_free_host_c
end interface
interface
function cuda_malloc_host_c(a, width_height) result(istat) &
bind(C, name="cudaMallocHostFromC")
use iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in), value :: width_height
integer(kind=C_INT) :: istat
end function cuda_malloc_host_c
end interface
interface
function cuda_memset_c(a, val, size) result(istat) &
bind(C, name="cudaMemsetFromC")
......@@ -281,7 +346,7 @@ module cuda_functions
real(kind=C_FLOAT),value :: alpha,beta
integer(kind=C_intptr_T), value :: a, b, c
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_sgemm_c
end interface
......@@ -298,7 +363,7 @@ module cuda_functions
real(kind=C_DOUBLE), value :: alpha
integer(kind=C_intptr_T), value :: a, b
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_dtrmm_c
end interface
......@@ -485,7 +550,7 @@ module cuda_functions
success = .true.
#endif
end function
function cuda_setdevice(n) result(success)
use iso_c_binding
......@@ -559,6 +624,35 @@ module cuda_functions
#endif
end function cuda_free
function cuda_malloc_host(a, width_height) result(success)
use iso_c_binding
implicit none
type(c_ptr) :: a
integer(kind=c_intptr_t), intent(in) :: width_height
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_malloc_host_c(a, width_height) /= 0
#else
success = .true.
#endif
end function
function cuda_free_host(a) result(success)
use iso_c_binding
implicit none
type(c_ptr), value :: a
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_free_host_c(a) /= 0
#else
success = .true.
#endif
end function cuda_free_host
function cuda_memset(a, val, size) result(success)
use iso_c_binding
......@@ -615,6 +709,18 @@ module cuda_functions
#endif
end function
function cuda_hostRegisterDefault() result(flag)
use iso_c_binding
use precision
implicit none
integer(kind=ik) :: flag
#ifdef WITH_GPU_VERSION
flag = int(cuda_hostRegisterDefault_c())
#else
flag = 0
#endif
end function
function cuda_hostRegisterPortable() result(flag)
use iso_c_binding
use precision
......@@ -678,6 +784,38 @@ module cuda_functions
#endif
end function cuda_memcpy2d
function cuda_host_register(a, size, flag) result(success)
use iso_c_binding
implicit none
integer(kind=C_intptr_t) :: a
integer(kind=c_intptr_t), intent(in) :: size
integer(kind=C_INT), intent(in) :: flag
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_host_register_c(a, size, flag) /= 0
#else
success = .true.
#endif
end function
function cuda_host_unregister(a) result(success)
use iso_c_binding
implicit none
integer(kind=C_intptr_t) :: a
logical :: success
#ifdef WITH_GPU_VERSION
success = cuda_host_unregister_c(a) /= 0
#else
success = .true.
#endif
end function
! cuBLAS
subroutine cublas_dgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
use iso_c_binding
......@@ -856,7 +994,7 @@ module cuda_functions
! subroutine cublas_dsymv(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! use iso_c_binding
!
!
! implicit none
! character(1,C_CHAR),value :: cta
! integer(kind=C_INT) :: n
......@@ -867,10 +1005,10 @@ module cuda_functions
! call cublas_dsymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! #endif
! end subroutine cublas_dsymv
!
!
! subroutine cublas_ssymv(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! use iso_c_binding
!
!
! implicit none
! character(1,C_CHAR),value :: cta
! integer(kind=C_INT) :: n
......@@ -881,10 +1019,10 @@ module cuda_functions
! call cublas_ssymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! #endif
! end subroutine cublas_ssymv
!
!
! subroutine cublas_zsymv(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! use iso_c_binding
!
!
! implicit none
! character(1,C_CHAR),value :: cta
! integer(kind=C_INT) :: n
......@@ -895,10 +1033,10 @@ module cuda_functions
! ! call cublas_zsymv_c(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! #endif
! end subroutine cublas_zsymv
!
!
! subroutine cublas_csymv(cta, n, alpha, a, lda, x, incx, beta, y, incy)
! use iso_c_binding
!
!
! implicit none
! character(1,C_CHAR),value :: cta