Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
e37486d5
Commit
e37486d5
authored
Feb 27, 2021
by
Andreas Marek
Browse files
GPU layer merge_systems
parent
0a63551f
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/solve_tridi/merge_systems_template.F90
View file @
e37486d5
...
...
@@ -641,28 +641,28 @@
if
(
useGPU
)
then
num
=
(
gemm_dim_k
*
gemm_dim_l
)
*
size_of_datatype
successCUDA
=
cuda
_host_register
(
int
(
loc
(
qtmp1
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
qtmp1
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"merge_systems: qtmp1"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
qtmp1_dev
,
num
)
successCUDA
=
gpu
_malloc
(
qtmp1_dev
,
num
)
check_alloc_cuda
(
"merge_systems: qtmp1_dev"
,
successCUDA
)
num
=
(
gemm_dim_l
*
gemm_dim_m
)
*
size_of_datatype
successCUDA
=
cuda
_host_register
(
int
(
loc
(
ev
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
ev
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"merge_systems: ev"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
ev_dev
,
num
)
successCUDA
=
gpu
_malloc
(
ev_dev
,
num
)
check_alloc_cuda
(
"merge_systems: ev_dev"
,
successCUDA
)
num
=
(
gemm_dim_k
*
gemm_dim_m
)
*
size_of_datatype
successCUDA
=
cuda
_host_register
(
int
(
loc
(
qtmp2
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
qtmp2
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"merge_systems: qtmp2"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
qtmp2_dev
,
num
)
successCUDA
=
gpu
_malloc
(
qtmp2_dev
,
num
)
check_alloc_cuda
(
"merge_systems: qtmp2_dev"
,
successCUDA
)
endif
...
...
@@ -726,8 +726,8 @@
endif
if
(
useGPU
)
then
successCUDA
=
cuda
_memcpy
(
qtmp1_dev
,
int
(
loc
(
qtmp1
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_k
*
gemm_dim_l
*
size_of_datatype
,
cuda
MemcpyHostToDevice
)
successCUDA
=
gpu
_memcpy
(
qtmp1_dev
,
int
(
loc
(
qtmp1
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_k
*
gemm_dim_l
*
size_of_datatype
,
gpu
MemcpyHostToDevice
)
check_memcpy_cuda
(
"merge_systems: qtmp1_dev"
,
successCUDA
)
endif
...
...
@@ -791,14 +791,14 @@
if
(
useGPU
)
then
!TODO: it should be enough to copy l_rows x ncnt
successCUDA
=
cuda
_memcpy
(
qtmp2_dev
,
int
(
loc
(
qtmp2
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_k
*
gemm_dim_m
*
size_of_datatype
,
cuda
MemcpyHostToDevice
)
successCUDA
=
gpu
_memcpy
(
qtmp2_dev
,
int
(
loc
(
qtmp2
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_k
*
gemm_dim_m
*
size_of_datatype
,
gpu
MemcpyHostToDevice
)
check_memcpy_cuda
(
"merge_systems: qtmp2_dev"
,
successCUDA
)
!TODO the previous loop could be possible to do on device and thus
!copy less
successCUDA
=
cuda
_memcpy
(
ev_dev
,
int
(
loc
(
ev
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_l
*
gemm_dim_m
*
size_of_datatype
,
cuda
MemcpyHostToDevice
)
successCUDA
=
gpu
_memcpy
(
ev_dev
,
int
(
loc
(
ev
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_l
*
gemm_dim_m
*
size_of_datatype
,
gpu
MemcpyHostToDevice
)
check_memcpy_cuda
(
"merge_systems: ev_dev"
,
successCUDA
)
endif
...
...
@@ -807,7 +807,7 @@
if
(
l_rnm
>
0
.and.
ncnt
>
0
.and.
nnzu
>
0
)
then
if
(
useGPU
)
then
call
obj
%
timer
%
start
(
"cublas"
)
call
c
ublas_PRECISION_GEMM
(
'N'
,
'N'
,
l_rnm
,
ncnt
,
nnzu
,
&
call
gp
ublas_PRECISION_GEMM
(
'N'
,
'N'
,
l_rnm
,
ncnt
,
nnzu
,
&
1.0_rk
,
qtmp1_dev
,
ubound
(
qtmp1
,
dim
=
1
),
&
ev_dev
,
ubound
(
ev
,
dim
=
1
),
&
1.0_rk
,
qtmp2_dev
,
ubound
(
qtmp2
,
dim
=
1
))
...
...
@@ -842,8 +842,8 @@
if
(
useGPU
)
then
!TODO the previous loop could be possible to do on device and thus
!copy less
successCUDA
=
cuda
_memcpy
(
ev_dev
,
int
(
loc
(
ev
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_l
*
gemm_dim_m
*
size_of_datatype
,
cuda
MemcpyHostToDevice
)
successCUDA
=
gpu
_memcpy
(
ev_dev
,
int
(
loc
(
ev
(
1
,
1
)),
kind
=
c_intptr_t
),
&
gemm_dim_l
*
gemm_dim_m
*
size_of_datatype
,
gpu
MemcpyHostToDevice
)
check_memcpy_cuda
(
"merge_systems: ev_dev"
,
successCUDA
)
endif
...
...
@@ -852,7 +852,7 @@
if
(
l_rows
-
l_rnm
>
0
.and.
ncnt
>
0
.and.
nnzl
>
0
)
then
if
(
useGPU
)
then
call
obj
%
timer
%
start
(
"cublas"
)
call
c
ublas_PRECISION_GEMM
(
'N'
,
'N'
,
l_rows
-
l_rnm
,
ncnt
,
nnzl
,
&
call
gp
ublas_PRECISION_GEMM
(
'N'
,
'N'
,
l_rows
-
l_rnm
,
ncnt
,
nnzl
,
&
1.0_rk
,
qtmp1_dev
+
l_rnm
*
size_of_datatype
,
ubound
(
qtmp1
,
dim
=
1
),
&
ev_dev
,
ubound
(
ev
,
dim
=
1
),
&
1.0_rk
,
qtmp2_dev
+
l_rnm
*
size_of_datatype
,
ubound
(
qtmp2
,
dim
=
1
))
...
...
@@ -873,8 +873,8 @@
if
(
useGPU
)
then
!TODO either copy only half of the matrix here, and get rid of the
!previous copy or copy whole array here
successCUDA
=
cuda
_memcpy
(
int
(
loc
(
qtmp2
(
1
,
1
)),
kind
=
c_intptr_t
),
qtmp2_dev
,
&
gemm_dim_k
*
gemm_dim_m
*
size_of_datatype
,
cuda
MemcpyDeviceToHost
)
successCUDA
=
gpu
_memcpy
(
int
(
loc
(
qtmp2
(
1
,
1
)),
kind
=
c_intptr_t
),
qtmp2_dev
,
&
gemm_dim_k
*
gemm_dim_m
*
size_of_datatype
,
gpu
MemcpyDeviceToHost
)
check_memcpy_cuda
(
"merge_systems: qtmp2_dev"
,
successCUDA
)
endif
...
...
@@ -888,22 +888,22 @@
enddo
!do np = 1, npc_n
if
(
useGPU
)
then
successCUDA
=
cuda
_host_unregister
(
int
(
loc
(
qtmp1
),
kind
=
c_intptr_t
))
successCUDA
=
gpu
_host_unregister
(
int
(
loc
(
qtmp1
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"merge_systems: qtmp1"
,
successCUDA
)
successCUDA
=
cuda
_free
(
qtmp1_dev
)
successCUDA
=
gpu
_free
(
qtmp1_dev
)
check_dealloc_cuda
(
"merge_systems: qtmp1_dev"
,
successCUDA
)
successCUDA
=
cuda
_host_unregister
(
int
(
loc
(
qtmp2
),
kind
=
c_intptr_t
))
successCUDA
=
gpu
_host_unregister
(
int
(
loc
(
qtmp2
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"merge_systems: qtmp2"
,
successCUDA
)
successCUDA
=
cuda
_free
(
qtmp2_dev
)
successCUDA
=
gpu
_free
(
qtmp2_dev
)
check_dealloc_cuda
(
"merge_systems: qtmp2_dev"
,
successCUDA
)
successCUDA
=
cuda
_host_unregister
(
int
(
loc
(
ev
),
kind
=
c_intptr_t
))
successCUDA
=
gpu
_host_unregister
(
int
(
loc
(
ev
),
kind
=
c_intptr_t
))
check_host_unregister_cuda
(
"merge_systems: ev"
,
successCUDA
)
successCUDA
=
cuda
_free
(
ev_dev
)
successCUDA
=
gpu
_free
(
ev_dev
)
check_dealloc_cuda
(
"merge_systems: ev_dev"
,
successCUDA
)
endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment