Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
35ceb5b5
Commit
35ceb5b5
authored
Feb 26, 2021
by
Andreas Marek
Browse files
Layer for GPUBLAS
parent
3e3b9706
Changes
3
Hide whitespace changes
Inline
Side-by-side
src/GPU/mod_vendor_agnostic_layer.F90
View file @
35ceb5b5
...
...
@@ -225,5 +225,184 @@ module elpa_gpu
endif
end
function
subroutine
gpublas_dgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
integer
(
kind
=
C_INT
)
::
m
,
n
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
incx
,
incy
real
(
kind
=
C_DOUBLE
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
x
,
y
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_dgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_dgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
end
subroutine
subroutine
gpublas_sgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
integer
(
kind
=
C_INT
)
::
m
,
n
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
incx
,
incy
real
(
kind
=
C_FLOAT
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
x
,
y
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_sgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_sgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
end
subroutine
subroutine
gpublas_zgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
integer
(
kind
=
C_INT
)
::
m
,
n
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
incx
,
incy
complex
(
kind
=
C_DOUBLE_COMPLEX
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
x
,
y
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_zgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_zgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
end
subroutine
subroutine
gpublas_cgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
integer
(
kind
=
C_INT
)
::
m
,
n
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
incx
,
incy
complex
(
kind
=
C_FLOAT_COMPLEX
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
x
,
y
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_cgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_cgemv
(
cta
,
m
,
n
,
alpha
,
a
,
lda
,
x
,
incx
,
beta
,
y
,
incy
)
endif
end
subroutine
subroutine
gpublas_dgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
,
ctb
integer
(
kind
=
C_INT
)
::
m
,
n
,
k
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
ldb
,
ldc
real
(
kind
=
C_DOUBLE
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
b
,
c
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_dgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_dgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
end
subroutine
subroutine
gpublas_sgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
,
ctb
integer
(
kind
=
C_INT
)
::
m
,
n
,
k
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
ldb
,
ldc
real
(
kind
=
C_FLOAT
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
b
,
c
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_sgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_sgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
end
subroutine
subroutine
gpublas_zgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
,
ctb
integer
(
kind
=
C_INT
)
::
m
,
n
,
k
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
ldb
,
ldc
complex
(
kind
=
C_DOUBLE_COMPLEX
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
b
,
c
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_zgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_zgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
end
subroutine
subroutine
gpublas_cgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
use
,
intrinsic
::
iso_c_binding
use
cuda_functions
use
hip_functions
implicit
none
character
(
1
,
C_CHAR
),
value
::
cta
,
ctb
integer
(
kind
=
C_INT
)
::
m
,
n
,
k
integer
(
kind
=
C_INT
),
intent
(
in
)
::
lda
,
ldb
,
ldc
complex
(
kind
=
C_FLOAT_COMPLEX
)
::
alpha
,
beta
integer
(
kind
=
C_intptr_T
)
::
a
,
b
,
c
if
(
use_gpu_vendor
==
nvidia_gpu
)
then
call
cublas_cgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
if
(
use_gpu_vendor
==
amd_gpu
)
then
call
rocblas_cgemm
(
cta
,
ctb
,
m
,
n
,
k
,
alpha
,
a
,
lda
,
b
,
ldb
,
beta
,
c
,
ldc
)
endif
end
subroutine
end
module
src/elpa1/elpa1_tridiag_template.F90
View file @
35ceb5b5
...
...
@@ -589,7 +589,7 @@ subroutine tridiag_&
!$omp num_threads(max_threads) &
!$omp default(none) &
!$omp private(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end) &
!$omp shared(useGPU, isSkewsymmetric,
cuda
MemcpyDeviceToHost, successCuda, u_row, u_row_dev, &
!$omp shared(useGPU, isSkewsymmetric,
gpu
MemcpyDeviceToHost, successCuda, u_row, u_row_dev, &
!$omp & v_row, v_row_dev, v_col, v_col_dev, u_col, u_col_dev, a_dev, a_offset, &
!$omp& max_local_cols, max_local_rows, obj, wantDebug, l_rows_per_tile, l_cols_per_tile, &
!$omp& matrixRows, istep, tile_size, l_rows, l_cols, ur_p, uc_p, a_mat)
...
...
@@ -680,14 +680,14 @@ subroutine tridiag_&
! this requires altering of the algorithm when later explicitly updating the matrix
! after max_stored_uv is reached : we need to update all tiles, not only those above diagonal
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cublas"
)
call
c
ublas_PRECISION_GEMV
(
BLAS_TRANS_OR_CONJ
,
l_rows
,
l_cols
,
&
call
gp
ublas_PRECISION_GEMV
(
BLAS_TRANS_OR_CONJ
,
l_rows
,
l_cols
,
&
ONE
,
a_dev
,
matrixRows
,
&
v_row_dev
,
1
,
&
ONE
,
u_col_dev
,
1
)
! todo: try with non transposed!!!
! if(i/=j) then
! call
c
ublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! call
gp
ublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
! ONE, a_dev + a_offset, matrixRows, &
! v_col_dev + (l_col_beg - 1) * &
! size_of_datatype, 1, &
...
...
@@ -710,7 +710,7 @@ subroutine tridiag_&
a_offset
=
((
l_row_beg
-1
)
+
(
l_col_beg
-
1
)
*
matrixRows
)
*
&
size_of_datatype
call
c
ublas_PRECISION_GEMV
(
BLAS_TRANS_OR_CONJ
,
&
call
gp
ublas_PRECISION_GEMV
(
BLAS_TRANS_OR_CONJ
,
&
l_row_end
-
l_row_beg
+1
,
l_col_end
-
l_col_beg
+1
,
&
ONE
,
a_dev
+
a_offset
,
matrixRows
,
&
v_row_dev
+
(
l_row_beg
-
1
)
*
size_of_datatype
,
1
,
&
...
...
@@ -728,12 +728,12 @@ subroutine tridiag_&
a_offset
=
((
l_row_beg
-1
)
+
(
l_col_beg
-
1
)
*
matrixRows
)
*
&
size_of_datatype
if
(
isSkewsymmetric
)
then
call
c
ublas_PRECISION_GEMV
(
'N'
,
l_row_end
-
l_row_beg
+1
,
l_col_end
-
l_col_beg
+1
,
&
call
gp
ublas_PRECISION_GEMV
(
'N'
,
l_row_end
-
l_row_beg
+1
,
l_col_end
-
l_col_beg
+1
,
&
-
ONE
,
a_dev
+
a_offset
,
matrixRows
,
&
v_col_dev
+
(
l_col_beg
-
1
)
*
size_of_datatype
,
1
,
&
ONE
,
u_row_dev
+
(
l_row_beg
-
1
)
*
size_of_datatype
,
1
)
else
call
c
ublas_PRECISION_GEMV
(
'N'
,
l_row_end
-
l_row_beg
+1
,
l_col_end
-
l_col_beg
+1
,
&
call
gp
ublas_PRECISION_GEMV
(
'N'
,
l_row_end
-
l_row_beg
+1
,
l_col_end
-
l_col_beg
+1
,
&
ONE
,
a_dev
+
a_offset
,
matrixRows
,
&
v_col_dev
+
(
l_col_beg
-
1
)
*
size_of_datatype
,
1
,
&
ONE
,
u_row_dev
+
(
l_row_beg
-
1
)
*
size_of_datatype
,
1
)
...
...
@@ -741,12 +741,12 @@ subroutine tridiag_&
enddo
end
if
!multiplication as one block / per stripes
successCUDA
=
cuda
_memcpy
(
int
(
loc
(
u_col
(
1
)),
kind
=
c_intptr_t
),
&
u_col_dev
,
l_cols
*
size_of_datatype
,
cuda
MemcpyDeviceToHost
)
successCUDA
=
gpu
_memcpy
(
int
(
loc
(
u_col
(
1
)),
kind
=
c_intptr_t
),
&
u_col_dev
,
l_cols
*
size_of_datatype
,
gpu
MemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: u_col_dev 1"
,
successCUDA
)
successCUDA
=
cuda
_memcpy
(
int
(
loc
(
u_row
(
1
)),
kind
=
c_intptr_t
),
&
u_row_dev
,
l_rows
*
size_of_datatype
,
cuda
MemcpyDeviceToHost
)
successCUDA
=
gpu
_memcpy
(
int
(
loc
(
u_row
(
1
)),
kind
=
c_intptr_t
),
&
u_row_dev
,
l_rows
*
size_of_datatype
,
gpu
MemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag: u_row_dev 1"
,
successCUDA
)
endif
! useGPU
...
...
@@ -899,7 +899,7 @@ subroutine tridiag_&
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we than use the same calls as for CPU version
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cublas"
)
call
c
ublas_PRECISION_GEMM
(
'N'
,
BLAS_TRANS_OR_CONJ
,
&
call
gp
ublas_PRECISION_GEMM
(
'N'
,
BLAS_TRANS_OR_CONJ
,
&
l_row_end
-
l_row_beg
+1
,
l_col_end
-
l_col_beg
+1
,
2
*
n_stored_vecs
,
&
ONE
,
vu_stored_rows_dev
+
(
l_row_beg
-
1
)
*
&
size_of_datatype
,
&
...
...
@@ -928,7 +928,7 @@ subroutine tridiag_&
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cublas"
)
call
c
ublas_PRECISION_GEMM
(
'N'
,
BLAS_TRANS_OR_CONJ
,
l_rows
,
l_cols
,
2
*
n_stored_vecs
,
&
call
gp
ublas_PRECISION_GEMM
(
'N'
,
BLAS_TRANS_OR_CONJ
,
l_rows
,
l_cols
,
2
*
n_stored_vecs
,
&
ONE
,
vu_stored_rows_dev
,
max_local_rows
,
&
uv_stored_cols_dev
,
max_local_cols
,
&
ONE
,
a_dev
,
matrixRows
)
...
...
src/general/precision_macros.h
View file @
35ceb5b5
...
...
@@ -95,6 +95,10 @@
#undef PRECISION_COPY
#undef PRECISION_AXPY
#undef PRECISION_GER
#undef gpublas_PRECISION_GEMM
#undef gpublas_PRECISION_TRMM
#undef gpublas_PRECISION_GEMV
#undef gpublas_PRECISION_SYMV
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
...
...
@@ -165,6 +169,10 @@
#define PRECISION_SCAL DSCAL
#define PRECISION_COPY DCOPY
#define PRECISION_AXPY DAXPY
#define gpublas_PRECISION_GEMM gpublas_DGEMM
#define gpublas_PRECISION_TRMM gpublas_DTRMM
#define gpublas_PRECISION_GEMV gpublas_DGEMV
#define gpublas_PRECISION_SYMV gpublas_DSYMV
#define cublas_PRECISION_GEMM cublas_DGEMM
#define cublas_PRECISION_TRMM cublas_DTRMM
#define cublas_PRECISION_GEMV cublas_DGEMV
...
...
@@ -232,6 +240,10 @@
#define PRECISION_SCAL SSCAL
#define PRECISION_COPY SCOPY
#define PRECISION_AXPY SAXPY
#define gpublas_PRECISION_GEMM gpublas_SGEMM
#define gpublas_PRECISION_TRMM gpublas_STRMM
#define gpublas_PRECISION_GEMV gpublas_SGEMV
#define gpublas_PRECISION_SYMV gpublas_SSYMV
#define cublas_PRECISION_GEMM cublas_SGEMM
#define cublas_PRECISION_TRMM cublas_STRMM
#define cublas_PRECISION_GEMV cublas_SGEMV
...
...
@@ -312,6 +324,10 @@
#undef PRECISION_SCAL
#undef PRECISION_COPY
#undef PRECISION_AXPY
#undef gpublas_PRECISION_GEMM
#undef gpublas_PRECISION_TRMM
#undef gpublas_PRECISION_GEMV
#undef gpublas_PRECISION_SYMV
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
...
...
@@ -392,6 +408,10 @@
#define PRECISION_SCAL ZSCAL
#define PRECISION_COPY ZCOPY
#define PRECISION_AXPY ZAXPY
#define gpublas_PRECISION_GEMM gpublas_ZGEMM
#define gpublas_PRECISION_TRMM gpublas_ZTRMM
#define gpublas_PRECISION_GEMV gpublas_ZGEMV
#define gpublas_PRECISION_SYMV gpublas_ZSYMV
#define cublas_PRECISION_GEMM cublas_ZGEMM
#define cublas_PRECISION_TRMM cublas_ZTRMM
#define cublas_PRECISION_GEMV cublas_ZGEMV
...
...
@@ -463,6 +483,10 @@
#define PRECISION_COPY CCOPY
#define PRECISION_AXPY CAXPY
#define PRECISION_GER CGER
#define gpublas_PRECISION_GEMM gpublas_CGEMM
#define gpublas_PRECISION_TRMM gpublas_CTRMM
#define gpublas_PRECISION_GEMV gpublas_CGEMV
#define gpublas_PRECISION_SYMV gpublas_CSYMV
#define cublas_PRECISION_GEMM cublas_CGEMM
#define cublas_PRECISION_TRMM cublas_CTRMM
#define cublas_PRECISION_GEMV cublas_CGEMV
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment