Commit 35ceb5b5 authored by Andreas Marek's avatar Andreas Marek
Browse files

Layer for GPUBLAS

parent 3e3b9706
......@@ -225,5 +225,184 @@ module elpa_gpu
endif
end function
!> Vendor-dispatching double-precision real GEMV: y := alpha*op(A)*x + beta*y.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta        : op selector for A ('N', 'T' or 'C')
!> m, n       : number of rows / columns of A
!> alpha, beta: scalar factors
!> a, x, y    : device addresses of A, x and y (opaque C pointers)
!> lda        : leading dimension of A
!> incx, incy : strides of x and y
subroutine gpublas_dgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value            :: cta
  ! intent(in) added for m, n, alpha, beta and the device handles for
  ! consistency with lda/incx/incy; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)      :: m, n
  integer(kind=C_INT), intent(in)      :: lda, incx, incy
  real(kind=C_DOUBLE), intent(in)      :: alpha, beta
  integer(kind=C_intptr_T), intent(in) :: a, x, y

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_dgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_dgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  end if
end subroutine gpublas_dgemv
!> Vendor-dispatching single-precision real GEMV: y := alpha*op(A)*x + beta*y.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta        : op selector for A ('N', 'T' or 'C')
!> m, n       : number of rows / columns of A
!> alpha, beta: scalar factors
!> a, x, y    : device addresses of A, x and y (opaque C pointers)
!> lda        : leading dimension of A
!> incx, incy : strides of x and y
subroutine gpublas_sgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value            :: cta
  ! intent(in) added for m, n, alpha, beta and the device handles for
  ! consistency with lda/incx/incy; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)      :: m, n
  integer(kind=C_INT), intent(in)      :: lda, incx, incy
  real(kind=C_FLOAT), intent(in)       :: alpha, beta
  integer(kind=C_intptr_T), intent(in) :: a, x, y

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_sgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_sgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  end if
end subroutine gpublas_sgemv
!> Vendor-dispatching double-precision complex GEMV: y := alpha*op(A)*x + beta*y.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta        : op selector for A ('N', 'T' or 'C')
!> m, n       : number of rows / columns of A
!> alpha, beta: scalar factors
!> a, x, y    : device addresses of A, x and y (opaque C pointers)
!> lda        : leading dimension of A
!> incx, incy : strides of x and y
subroutine gpublas_zgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value                  :: cta
  ! intent(in) added for m, n, alpha, beta and the device handles for
  ! consistency with lda/incx/incy; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)            :: m, n
  integer(kind=C_INT), intent(in)            :: lda, incx, incy
  complex(kind=C_DOUBLE_COMPLEX), intent(in) :: alpha, beta
  integer(kind=C_intptr_T), intent(in)       :: a, x, y

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_zgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_zgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  end if
end subroutine gpublas_zgemv
!> Vendor-dispatching single-precision complex GEMV: y := alpha*op(A)*x + beta*y.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta        : op selector for A ('N', 'T' or 'C')
!> m, n       : number of rows / columns of A
!> alpha, beta: scalar factors
!> a, x, y    : device addresses of A, x and y (opaque C pointers)
!> lda        : leading dimension of A
!> incx, incy : strides of x and y
subroutine gpublas_cgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value                 :: cta
  ! intent(in) added for m, n, alpha, beta and the device handles for
  ! consistency with lda/incx/incy; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)           :: m, n
  integer(kind=C_INT), intent(in)           :: lda, incx, incy
  complex(kind=C_FLOAT_COMPLEX), intent(in) :: alpha, beta
  integer(kind=C_intptr_T), intent(in)      :: a, x, y

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_cgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_cgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  end if
end subroutine gpublas_cgemv
!> Vendor-dispatching double-precision real GEMM:
!> C := alpha*op(A)*op(B) + beta*C.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta, ctb     : op selectors for A and B ('N', 'T' or 'C')
!> m, n, k      : dimensions of the product (C is m-by-n, op(A) is m-by-k)
!> alpha, beta  : scalar factors
!> a, b, c      : device addresses of A, B and C (opaque C pointers)
!> lda, ldb, ldc: leading dimensions of A, B and C
subroutine gpublas_dgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value            :: cta, ctb
  ! intent(in) added for m, n, k, alpha, beta and the device handles for
  ! consistency with lda/ldb/ldc; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)      :: m, n, k
  integer(kind=C_INT), intent(in)      :: lda, ldb, ldc
  real(kind=C_DOUBLE), intent(in)      :: alpha, beta
  integer(kind=C_intptr_T), intent(in) :: a, b, c

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_dgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_dgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  end if
end subroutine gpublas_dgemm
!> Vendor-dispatching single-precision real GEMM:
!> C := alpha*op(A)*op(B) + beta*C.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta, ctb     : op selectors for A and B ('N', 'T' or 'C')
!> m, n, k      : dimensions of the product (C is m-by-n, op(A) is m-by-k)
!> alpha, beta  : scalar factors
!> a, b, c      : device addresses of A, B and C (opaque C pointers)
!> lda, ldb, ldc: leading dimensions of A, B and C
subroutine gpublas_sgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value            :: cta, ctb
  ! intent(in) added for m, n, k, alpha, beta and the device handles for
  ! consistency with lda/ldb/ldc; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)      :: m, n, k
  integer(kind=C_INT), intent(in)      :: lda, ldb, ldc
  real(kind=C_FLOAT), intent(in)       :: alpha, beta
  integer(kind=C_intptr_T), intent(in) :: a, b, c

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_sgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_sgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  end if
end subroutine gpublas_sgemm
!> Vendor-dispatching double-precision complex GEMM:
!> C := alpha*op(A)*op(B) + beta*C.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta, ctb     : op selectors for A and B ('N', 'T' or 'C')
!> m, n, k      : dimensions of the product (C is m-by-n, op(A) is m-by-k)
!> alpha, beta  : scalar factors
!> a, b, c      : device addresses of A, B and C (opaque C pointers)
!> lda, ldb, ldc: leading dimensions of A, B and C
subroutine gpublas_zgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value                  :: cta, ctb
  ! intent(in) added for m, n, k, alpha, beta and the device handles for
  ! consistency with lda/ldb/ldc; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)            :: m, n, k
  integer(kind=C_INT), intent(in)            :: lda, ldb, ldc
  complex(kind=C_DOUBLE_COMPLEX), intent(in) :: alpha, beta
  integer(kind=C_intptr_T), intent(in)       :: a, b, c

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_zgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_zgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  end if
end subroutine gpublas_zgemm
!> Vendor-dispatching single-precision complex GEMM:
!> C := alpha*op(A)*op(B) + beta*C.
!> Forwards to cuBLAS on NVIDIA GPUs or rocBLAS on AMD GPUs, selected by the
!> module-level use_gpu_vendor flag; a no-op for any other vendor value.
!>
!> cta, ctb     : op selectors for A and B ('N', 'T' or 'C')
!> m, n, k      : dimensions of the product (C is m-by-n, op(A) is m-by-k)
!> alpha, beta  : scalar factors
!> a, b, c      : device addresses of A, B and C (opaque C pointers)
!> lda, ldb, ldc: leading dimensions of A, B and C
subroutine gpublas_cgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  use, intrinsic :: iso_c_binding
  use cuda_functions
  use hip_functions
  implicit none
  character(1,C_CHAR),value                 :: cta, ctb
  ! intent(in) added for m, n, k, alpha, beta and the device handles for
  ! consistency with lda/ldb/ldc; nothing here is modified on the host side.
  integer(kind=C_INT), intent(in)           :: m, n, k
  integer(kind=C_INT), intent(in)           :: lda, ldb, ldc
  complex(kind=C_FLOAT_COMPLEX), intent(in) :: alpha, beta
  integer(kind=C_intptr_T), intent(in)      :: a, b, c

  ! The vendor flags are mutually exclusive, so dispatch with else-if.
  if (use_gpu_vendor == nvidia_gpu) then
    call cublas_cgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  else if (use_gpu_vendor == amd_gpu) then
    call rocblas_cgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
  end if
end subroutine gpublas_cgemm
end module
......@@ -589,7 +589,7 @@ subroutine tridiag_&
!$omp num_threads(max_threads) &
!$omp default(none) &
!$omp private(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) &
!$omp shared(useGPU, isSkewsymmetric, cudaMemcpyDeviceToHost, successCuda, u_row, u_row_dev, &
!$omp shared(useGPU, isSkewsymmetric, gpuMemcpyDeviceToHost, successCuda, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat)
......@@ -680,14 +680,14 @@ subroutine tridiag_&
! this requires altering of the algorithm when later explicitly updating the matrix
! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal
if (wantDebug) call obj%timer%start("cublas")
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
call gpublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
ONE, a_dev, matrixRows, &
v_row_dev , 1, &
ONE, u_col_dev, 1)
! todo: try with non transposed!!!
! if(i/=j) then
! call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! call gpublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, matrixRows, &
! v_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1, &
......@@ -710,7 +710,7 @@ subroutine tridiag_&
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
size_of_datatype
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
call gpublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, matrixRows, &
v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
......@@ -728,12 +728,12 @@ subroutine tridiag_&
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * matrixRows) * &
size_of_datatype
if (isSkewsymmetric) then
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
call gpublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-ONE, a_dev + a_offset, matrixRows, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
else
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
call gpublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, matrixRows, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
......@@ -741,12 +741,12 @@ subroutine tridiag_&
enddo
end if !multiplication as one block / per stripes
successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = gpu_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
u_col_dev, l_cols * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = gpu_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
u_row_dev, l_rows * size_of_datatype, gpuMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
endif ! useGPU
......@@ -899,7 +899,7 @@ subroutine tridiag_&
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we than use the same calls as for CPU version
if (wantDebug) call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
call gpublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev + (l_row_beg - 1) * &
size_of_datatype, &
......@@ -928,7 +928,7 @@ subroutine tridiag_&
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
if (wantDebug) call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
call gpublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev, max_local_rows, &
uv_stored_cols_dev, max_local_cols, &
ONE, a_dev, matrixRows)
......
......@@ -95,6 +95,10 @@
#undef PRECISION_COPY
#undef PRECISION_AXPY
#undef PRECISION_GER
#undef gpublas_PRECISION_GEMM
#undef gpublas_PRECISION_TRMM
#undef gpublas_PRECISION_GEMV
#undef gpublas_PRECISION_SYMV
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
......@@ -165,6 +169,10 @@
#define PRECISION_SCAL DSCAL
#define PRECISION_COPY DCOPY
#define PRECISION_AXPY DAXPY
#define gpublas_PRECISION_GEMM gpublas_DGEMM
#define gpublas_PRECISION_TRMM gpublas_DTRMM
#define gpublas_PRECISION_GEMV gpublas_DGEMV
#define gpublas_PRECISION_SYMV gpublas_DSYMV
#define cublas_PRECISION_GEMM cublas_DGEMM
#define cublas_PRECISION_TRMM cublas_DTRMM
#define cublas_PRECISION_GEMV cublas_DGEMV
......@@ -232,6 +240,10 @@
#define PRECISION_SCAL SSCAL
#define PRECISION_COPY SCOPY
#define PRECISION_AXPY SAXPY
#define gpublas_PRECISION_GEMM gpublas_SGEMM
#define gpublas_PRECISION_TRMM gpublas_STRMM
#define gpublas_PRECISION_GEMV gpublas_SGEMV
#define gpublas_PRECISION_SYMV gpublas_SSYMV
#define cublas_PRECISION_GEMM cublas_SGEMM
#define cublas_PRECISION_TRMM cublas_STRMM
#define cublas_PRECISION_GEMV cublas_SGEMV
......@@ -312,6 +324,10 @@
#undef PRECISION_SCAL
#undef PRECISION_COPY
#undef PRECISION_AXPY
#undef gpublas_PRECISION_GEMM
#undef gpublas_PRECISION_TRMM
#undef gpublas_PRECISION_GEMV
#undef gpublas_PRECISION_SYMV
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
......@@ -392,6 +408,10 @@
#define PRECISION_SCAL ZSCAL
#define PRECISION_COPY ZCOPY
#define PRECISION_AXPY ZAXPY
#define gpublas_PRECISION_GEMM gpublas_ZGEMM
#define gpublas_PRECISION_TRMM gpublas_ZTRMM
#define gpublas_PRECISION_GEMV gpublas_ZGEMV
#define gpublas_PRECISION_SYMV gpublas_ZSYMV
#define cublas_PRECISION_GEMM cublas_ZGEMM
#define cublas_PRECISION_TRMM cublas_ZTRMM
#define cublas_PRECISION_GEMV cublas_ZGEMV
......@@ -463,6 +483,10 @@
#define PRECISION_COPY CCOPY
#define PRECISION_AXPY CAXPY
#define PRECISION_GER CGER
#define gpublas_PRECISION_GEMM gpublas_CGEMM
#define gpublas_PRECISION_TRMM gpublas_CTRMM
#define gpublas_PRECISION_GEMV gpublas_CGEMV
#define gpublas_PRECISION_SYMV gpublas_CSYMV
#define cublas_PRECISION_GEMM cublas_CGEMM
#define cublas_PRECISION_TRMM cublas_CTRMM
#define cublas_PRECISION_GEMV cublas_CGEMV
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment