Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
3e3b9706
Commit
3e3b9706
authored
Feb 26, 2021
by
Andreas Marek
Browse files
Start to implement vendor agnostic layer in elpa1_tridiag
parent
ebc9ed9b
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
3e3b9706
...
...
@@ -45,6 +45,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa2/elpa2_compute.F90
\
src/elpa2/kernels/mod_single_hh_trafo_real.F90
\
src/GPU/check_for_gpu.F90
\
src/GPU/mod_vendor_agnostic_layer.F90
\
src/GPU/CUDA/mod_cuda.F90
\
src/GPU/ROCm/mod_hip.F90
\
src/elpa2/GPU/CUDA/interface_c_kernel.F90
\
...
...
src/GPU/CUDA/mod_cuda.F90
View file @
3e3b9706
...
...
@@ -268,7 +268,7 @@ module cuda_functions
implicit
none
integer
(
kind
=
C_intptr_T
)
::
a
integer
(
kind
=
c_intptr_t
),
intent
(
in
),
value
::
width_height
integer
(
kind
=
c_intptr_t
),
intent
(
in
),
value
::
width_height
integer
(
kind
=
C_INT
)
::
istat
end
function
cuda_malloc_c
...
...
@@ -635,8 +635,8 @@ module cuda_functions
use
,
intrinsic
::
iso_c_binding
implicit
none
integer
(
kind
=
C
_intptr_t
)
::
a
integer
(
kind
=
c_intptr_t
),
intent
(
in
)
::
width_height
integer
(
kind
=
c
_intptr_t
)
::
a
integer
(
kind
=
c_intptr_t
),
intent
(
in
)
::
width_height
logical
::
success
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cuda_malloc_c
(
a
,
width_height
)
/
=
0
...
...
@@ -664,8 +664,8 @@ module cuda_functions
use
,
intrinsic
::
iso_c_binding
implicit
none
type
(
c_ptr
)
::
a
integer
(
kind
=
c_intptr_t
),
intent
(
in
)
::
width_height
type
(
c_ptr
)
::
a
integer
(
kind
=
c_intptr_t
),
intent
(
in
)
::
width_height
logical
::
success
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cuda_malloc_host_c
(
a
,
width_height
)
/
=
0
...
...
src/GPU/ROCm/mod_hip.F90
View file @
3e3b9706
...
...
@@ -820,7 +820,7 @@ module hip_functions
#endif
end
function
hip_memcpy2d
function
cuda_hip
_register
(
a
,
size
,
flag
)
result
(
success
)
function
hip_host
_register
(
a
,
size
,
flag
)
result
(
success
)
use
,
intrinsic
::
iso_c_binding
...
...
@@ -837,7 +837,7 @@ module hip_functions
#endif
end
function
function
cuda_hip
_unregister
(
a
)
result
(
success
)
function
hip_host
_unregister
(
a
)
result
(
success
)
use
,
intrinsic
::
iso_c_binding
...
...
src/GPU/check_for_gpu.F90
View file @
3e3b9706
...
...
@@ -50,6 +50,7 @@ module mod_check_for_gpu
function
check_for_gpu
(
obj
,
myid
,
numberOfDevices
,
wantDebug
)
result
(
gpuAvailable
)
use
cuda_functions
use
hip_functions
use
precision
use
elpa_mpi
use
elpa_abstract_impl
...
...
@@ -109,19 +110,38 @@ module mod_check_for_gpu
endif
endif
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cuda_setdevice
(
use_gpu_id
)
#endif
#ifdef WITH_AMD_GPU_VERSION
success
=
hip_setdevice
(
use_gpu_id
)
#endif
if
(
.not.
(
success
))
then
#ifdef WITH_NVIDIA_GPU_VERSION
print
*
,
"Cannot set CudaDevice"
#endif
#ifdef WITH_AMD_GPU_VERSION
print
*
,
"Cannot set HIPDevice"
#endif
stop
1
endif
if
(
wantDebugMessage
)
then
print
'(3(a,i0))'
,
'MPI rank '
,
myid
,
' uses GPU #'
,
deviceNumber
endif
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cublas_create
(
cublasHandle
)
#endif
#ifdef WITH_AMD_GPU_VERSION
success
=
rocblas_create
(
cublasHandle
)
#endif
if
(
.not.
(
success
))
then
#ifdef WITH_NVIDIA_GPU_VERSION
print
*
,
"Cannot create cublas handle"
#endif
#ifdef WITH_AMD_GPU_VERSION
print
*
,
"Cannot create rocblas handle"
#endif
stop
1
endif
else
...
...
@@ -139,11 +159,21 @@ module mod_check_for_gpu
endif
endif
#ifdef WITH_NVIDIA_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success
=
cuda_getdevicecount
(
numberOfDevices
)
#endif
#ifdef WITH_AMD_GPU_VERSION
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success
=
hip_getdevicecount
(
numberOfDevices
)
#endif
if
(
.not.
(
success
))
then
#ifdef WITH_NVIDIA_GPU_VERSION
print
*
,
"error in cuda_getdevicecount"
#endif
#ifdef WITH_AMD_GPU_VERSION
print
*
,
"error in hip_getdevicecount"
#endif
stop
1
endif
...
...
@@ -171,19 +201,39 @@ module mod_check_for_gpu
endif
deviceNumber
=
mod
(
myid
,
numberOfDevices
)
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cuda_setdevice
(
deviceNumber
)
#endif
#ifdef WITH_AMD_GPU_VERSION
success
=
hip_setdevice
(
deviceNumber
)
#endif
if
(
.not.
(
success
))
then
#ifdef WITH_NVIDIA_GPU_VERSION
print
*
,
"Cannot set CudaDevice"
#endif
#ifdef WITH_AMD_GPU_VERSION
print
*
,
"Cannot set hipDevice"
#endif
stop
1
endif
if
(
wantDebugMessage
)
then
print
'(3(a,i0))'
,
'MPI rank '
,
myid
,
' uses GPU #'
,
deviceNumber
endif
#ifdef WITH_NVIDIA_GPU_VERSION
success
=
cublas_create
(
cublasHandle
)
#endif
#ifdef WITH_AMD_GPU_VERSION
success
=
rocblas_create
(
cublasHandle
)
#endif
if
(
.not.
(
success
))
then
#ifdef WITH_NVIDIA_GPU_VERSION
print
*
,
"Cannot create cublas handle"
#endif
#ifdef WITH_AMD_GPU_VERSION
print
*
,
"Cannot create rocblas handle"
#endif
stop
1
endif
...
...
src/GPU/mod_vendor_agnostic_layer.F90
0 → 100644
View file @
3e3b9706
#include "config-f90.h"
module elpa_gpu
  ! Vendor-agnostic GPU abstraction layer for ELPA.
  !
  ! Dispatches each GPU runtime operation to the backend selected at
  ! configure time: NVIDIA/CUDA (WITH_NVIDIA_GPU_VERSION) or AMD/HIP
  ! (WITH_AMD_GPU_VERSION). Callers use the gpu_* wrappers and the gpu*
  ! constants below instead of the cuda_*/hip_* symbols directly.
  !
  ! Usage protocol (as exercised by elpa1_template.F90 in this commit):
  !   1. call gpu_vendor() once — caches the vendor in use_gpu_vendor
  !   2. call set_gpu_parameters() — mirrors backend constants into gpu*
  !   3. use the gpu_* wrappers
  use iso_c_binding

  ! Vendor identifiers returned by gpu_vendor() / stored in use_gpu_vendor.
  integer(kind=c_int), parameter :: nvidia_gpu =  1
  integer(kind=c_int), parameter :: amd_gpu    =  2
  integer(kind=c_int), parameter :: no_gpu     = -1

  ! Active vendor; written by gpu_vendor(), read by every wrapper below.
  integer(kind=c_int) :: use_gpu_vendor

  ! Vendor-agnostic mirrors of the backend runtime's enum values;
  ! filled in by set_gpu_parameters().
  integer(kind=c_int) :: gpuHostRegisterDefault
  integer(kind=c_int) :: gpuMemcpyHostToDevice
  integer(kind=c_int) :: gpuMemcpyDeviceToHost
  integer(kind=c_int) :: gpuMemcpyDeviceToDevice
  integer(kind=c_int) :: gpuHostRegisterMapped
  integer(kind=c_int) :: gpuHostRegisterPortable

  contains

    ! Determine the GPU vendor compiled into this build and cache it in
    ! use_gpu_vendor. Returns nvidia_gpu, amd_gpu, or no_gpu.
    ! NOTE(review): if both WITH_NVIDIA_GPU_VERSION and WITH_AMD_GPU_VERSION
    ! were ever defined together, AMD would win (last assignment) — presumably
    ! the build system makes them mutually exclusive; confirm.
    function gpu_vendor() result(vendor)
      use precision
      implicit none
      integer(kind=c_int) :: vendor

      ! default: no GPU support compiled in
      vendor = no_gpu
#ifdef WITH_NVIDIA_GPU_VERSION
      vendor = nvidia_gpu
#endif
#ifdef WITH_AMD_GPU_VERSION
      vendor = amd_gpu
#endif
      use_gpu_vendor = vendor
      return
    end function

    ! Query the active backend for its memcpy-direction and host-register
    ! constants and mirror them into the vendor-agnostic gpu* module
    ! variables. Must be called after gpu_vendor() has set use_gpu_vendor.
    subroutine set_gpu_parameters
      use cuda_functions
      use hip_functions
      implicit none

      if (use_gpu_vendor == nvidia_gpu) then
        cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
        gpuMemcpyHostToDevice    = cudaMemcpyHostToDevice
        cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
        gpuMemcpyDeviceToHost    = cudaMemcpyDeviceToHost
        cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
        gpuMemcpyDeviceToDevice  = cudaMemcpyDeviceToDevice
        cudaHostRegisterPortable = cuda_hostRegisterPortable()
        gpuHostRegisterPortable  = cudaHostRegisterPortable
        cudaHostRegisterMapped   = cuda_hostRegisterMapped()
        gpuHostRegisterMapped    = cudaHostRegisterMapped
        cudaHostRegisterDefault  = cuda_hostRegisterDefault()
        gpuHostRegisterDefault   = cudaHostRegisterDefault
      endif

      if (use_gpu_vendor == amd_gpu) then
        hipMemcpyHostToDevice    = hip_memcpyHostToDevice()
        gpuMemcpyHostToDevice    = hipMemcpyHostToDevice
        hipMemcpyDeviceToHost    = hip_memcpyDeviceToHost()
        gpuMemcpyDeviceToHost    = hipMemcpyDeviceToHost
        hipMemcpyDeviceToDevice  = hip_memcpyDeviceToDevice()
        gpuMemcpyDeviceToDevice  = hipMemcpyDeviceToDevice
        hipHostRegisterPortable  = hip_hostRegisterPortable()
        gpuHostRegisterPortable  = hipHostRegisterPortable
        hipHostRegisterMapped    = hip_hostRegisterMapped()
        gpuHostRegisterMapped    = hipHostRegisterMapped
        hipHostRegisterDefault   = hip_hostRegisterDefault()
        gpuHostRegisterDefault   = hipHostRegisterDefault
      endif
    end subroutine

    ! Allocate `elements` bytes of page-locked (pinned) host memory;
    ! the resulting pointer is returned in `array`.
    ! Returns .true. on success, .false. on failure or when no GPU
    ! backend is active.
    function gpu_malloc_host(array, elements) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      type(c_ptr)                          :: array
      integer(kind=c_intptr_t), intent(in) :: elements
      logical                              :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_malloc_host(array, elements)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_host_malloc(array, elements)
      endif
    end function

    ! Allocate `elements` bytes of device memory; the device address is
    ! returned in `array` (carried as an integer of kind c_intptr_t).
    ! Returns .true. on success, .false. on failure or when no GPU
    ! backend is active.
    function gpu_malloc(array, elements) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=C_intptr_T)             :: array
      integer(kind=c_intptr_t), intent(in) :: elements
      logical                              :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_malloc(array, elements)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_malloc(array, elements)
      endif
    end function

    ! Page-lock (register) an existing host buffer of `elements` bytes at
    ! address `array`, using registration mode `flag` (one of the
    ! gpuHostRegister* constants). Returns .true. on success.
    function gpu_host_register(array, elements, flag) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=C_intptr_t)             :: array
      integer(kind=c_intptr_t), intent(in) :: elements
      integer(kind=C_INT), intent(in)      :: flag
      logical                              :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_host_register(array, elements, flag)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_host_register(array, elements, flag)
      endif
    end function

    ! Copy `size` bytes from `src` to `dst` in direction `dir` (one of the
    ! gpuMemcpy* constants). Addresses are raw c_intptr_t values; whether
    ! each side is host or device memory is determined by `dir`.
    ! Returns .true. on success.
    function gpu_memcpy(dst, src, size, dir) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=C_intptr_t)             :: dst
      integer(kind=C_intptr_t)             :: src
      integer(kind=c_intptr_t), intent(in) :: size
      integer(kind=C_INT), intent(in)      :: dir
      logical                              :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_memcpy(dst, src, size, dir)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_memcpy(dst, src, size, dir)
      endif
    end function

    ! Set `size` bytes of device memory at address `a` to byte value `val`
    ! (byte-wise fill, like cudaMemset/hipMemset). Returns .true. on success.
    function gpu_memset(a, val, size) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t)             :: a
      integer(kind=ik)                     :: val
      integer(kind=c_intptr_t), intent(in) :: size
      logical                              :: success
      ! fix: removed unused local `istat` (declared but never assigned or read)

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_memset(a, val, size)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_memset(a, val, size)
      endif
    end function

    ! Free device memory previously allocated with gpu_malloc.
    ! Returns .true. on success.
    function gpu_free(a) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t) :: a
      logical                  :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_free(a)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_free(a)
      endif
    end function

    ! Free pinned host memory previously allocated with gpu_malloc_host.
    ! Returns .true. on success.
    function gpu_free_host(a) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      type(c_ptr), value :: a
      logical            :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_free_host(a)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_host_free(a)
      endif
    end function

    ! Unregister (unpin) a host buffer previously pinned with
    ! gpu_host_register. Returns .true. on success.
    function gpu_host_unregister(a) result(success)
      use, intrinsic :: iso_c_binding
      use cuda_functions
      use hip_functions
      implicit none
      integer(kind=c_intptr_t) :: a
      logical                  :: success

      ! fix: success was previously left undefined when use_gpu_vendor == no_gpu
      success = .false.
      if (use_gpu_vendor == nvidia_gpu) then
        success = cuda_host_unregister(a)
      endif
      if (use_gpu_vendor == amd_gpu) then
        success = hip_host_unregister(a)
      endif
    end function

end module
src/elpa1/elpa1_template.F90
View file @
3e3b9706
...
...
@@ -73,6 +73,8 @@ function elpa_solve_evp_&
#endif
use
precision
use
cuda_functions
use
hip_functions
use
elpa_gpu
use
mod_check_for_gpu
use
,
intrinsic
::
iso_c_binding
use
elpa_abstract_impl
...
...
@@ -294,11 +296,22 @@ function elpa_solve_evp_&
obj
%
eigenvalues_only
=
.true.
endif
call
obj
%
get
(
"nvidia-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for gpu. Aborting..."
stop
if
(
gpu_vendor
()
==
NVIDIA_GPU
)
then
call
obj
%
get
(
"nvidia-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else
if
(
gpu_vendor
()
==
AMD_GPU
)
then
call
obj
%
get
(
"amd-gpu"
,
gpu
,
error
)
if
(
error
.ne.
ELPA_OK
)
then
print
*
,
"Problem getting option for AMD GPU. Aborting..."
stop
endif
else
gpu
=
0
endif
if
(
gpu
.eq.
1
)
then
useGPU
=
.true.
else
...
...
@@ -344,11 +357,7 @@ function elpa_solve_evp_&
if
(
check_for_gpu
(
obj
,
my_pe
,
numberOfGPUDevices
,
wantDebug
=
wantDebug
))
then
do_useGPU
=
.true.
! set the neccessary parameters
cudaMemcpyHostToDevice
=
cuda_memcpyHostToDevice
()
cudaMemcpyDeviceToHost
=
cuda_memcpyDeviceToHost
()
cudaMemcpyDeviceToDevice
=
cuda_memcpyDeviceToDevice
()
cudaHostRegisterPortable
=
cuda_hostRegisterPortable
()
cudaHostRegisterMapped
=
cuda_hostRegisterMapped
()
call
set_gpu_parameters
()
else
print
*
,
"GPUs are requested but not detected! Aborting..."
success
=
.false.
...
...
src/elpa1/elpa1_tridiag_template.F90
View file @
3e3b9706
...
...
@@ -104,6 +104,7 @@ subroutine tridiag_&
use
matrix_plot
use
elpa_omp
use
elpa_blas_interfaces
use
elpa_gpu
implicit
none
#include "../general/precision_kinds.F90"
...
...
@@ -289,33 +290,33 @@ subroutine tridiag_&
if
(
useGPU
)
then
num
=
(
max_local_rows
+1
)
*
size_of_datatype
successCUDA
=
cuda
_malloc_host
(
v_row_host
,
num
)
successCUDA
=
gpu
_malloc_host
(
v_row_host
,
num
)
check_host_alloc_cuda
(
"tridiag: v_row_host"
,
successCUDA
)
call
c_f_pointer
(
v_row_host
,
v_row
,(/(
max_local_rows
+1
)/))
num
=
(
max_local_cols
)
*
size_of_datatype
successCUDA
=
cuda
_malloc_host
(
v_col_host
,
num
)
successCUDA
=
gpu
_malloc_host
(
v_col_host
,
num
)
check_host_alloc_cuda
(
"tridiag: v_col_host"
,
successCUDA
)
call
c_f_pointer
(
v_col_host
,
v_col
,(/(
max_local_cols
)/))
num
=
(
max_local_cols
)
*
size_of_datatype
successCUDA
=
cuda
_malloc_host
(
u_col_host
,
num
)
successCUDA
=
gpu
_malloc_host
(
u_col_host
,
num
)
check_host_alloc_cuda
(
"tridiag: u_col_host"
,
successCUDA
)
call
c_f_pointer
(
u_col_host
,
u_col
,(/(
max_local_cols
)/))
num
=
(
max_local_rows
)
*
size_of_datatype
successCUDA
=
cuda
_malloc_host
(
u_row_host
,
num
)
successCUDA
=
gpu
_malloc_host
(
u_row_host
,
num
)
check_host_alloc_cuda
(
"tridiag: u_row_host"
,
successCUDA
)
call
c_f_pointer
(
u_row_host
,
u_row
,(/(
max_local_rows
)/))
num
=
(
max_local_rows
*
2
*
max_stored_uv
)
*
size_of_datatype
successCUDA
=
cuda
_host_register
(
int
(
loc
(
vu_stored_rows
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
vu_stored_rows
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"tridiag: vu_stored_rows"
,
successCUDA
)
num
=
(
max_local_cols
*
2
*
max_stored_uv
)
*
size_of_datatype
successCUDA
=
cuda
_host_register
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
uv_stored_cols
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"tridiag: uv_stored_cols"
,
successCUDA
)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
...
...
@@ -323,8 +324,8 @@ subroutine tridiag_&
#else
num
=
na
*
4
#endif
successCUDA
=
cuda
_host_register
(
int
(
loc
(
e_vec
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
e_vec
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"tridiag: e_vec"
,
successCUDA
)
#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
...
...
@@ -332,8 +333,8 @@ subroutine tridiag_&
#else
num
=
na
*
4
#endif
successCUDA
=
cuda
_host_register
(
int
(
loc
(
d_vec
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
d_vec
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"tridiag: d_vec"
,
successCUDA
)
else
...
...
@@ -372,23 +373,23 @@ subroutine tridiag_&
u_col
=
0
if
(
useGPU
)
then
successCUDA
=
cuda
_malloc
(
v_row_dev
,
max_local_rows
*
size_of_datatype
)
successCUDA
=
gpu
_malloc
(
v_row_dev
,
max_local_rows
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: v_row_dev"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
u_row_dev
,
max_local_rows
*
size_of_datatype
)
successCUDA
=
gpu
_malloc
(
u_row_dev
,
max_local_rows
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: u_row_dev"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
v_col_dev
,
max_local_cols
*
size_of_datatype
)
successCUDA
=
gpu
_malloc
(
v_col_dev
,
max_local_cols
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: v_col_dev"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
u_col_dev
,
max_local_cols
*
size_of_datatype
)
successCUDA
=
gpu
_malloc
(
u_col_dev
,
max_local_cols
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: u_col_dev"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
vu_stored_rows_dev
,
max_local_rows
*
2
*
max_stored_uv
*
size_of_datatype
)
successCUDA
=
gpu
_malloc
(
vu_stored_rows_dev
,
max_local_rows
*
2
*
max_stored_uv
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: vu_stored_rows_dev"
,
successCUDA
)
successCUDA
=
cuda
_malloc
(
uv_stored_cols_dev
,
max_local_cols
*
2
*
max_stored_uv
*
size_of_datatype
)
successCUDA
=
gpu
_malloc
(
uv_stored_cols_dev
,
max_local_cols
*
2
*
max_stored_uv
*
size_of_datatype
)
check_alloc_cuda
(
"tridiag: vu_stored_rows_dev"
,
successCUDA
)
endif
!useGPU
...
...
@@ -415,15 +416,15 @@ subroutine tridiag_&
num
=
matrixRows
*
matrixCols
*
size_of_datatype
successCUDA
=
cuda
_malloc
(
a_dev
,
num
)
successCUDA
=
gpu
_malloc
(
a_dev
,
num
)
check_alloc_cuda
(
"tridiag: a_dev"
,
successCUDA
)
successCUDA
=
cuda
_host_register
(
int
(
loc
(
a_mat
),
kind
=
c_intptr_t
),
num
,&
cuda
HostRegisterDefault
)
successCUDA
=
gpu
_host_register
(
int
(
loc
(
a_mat
),
kind
=
c_intptr_t
),
num
,&
gpu
HostRegisterDefault
)
check_host_register_cuda
(
"tridiag: a_mat"
,
successCUDA
)
successCUDA
=
cuda
_memcpy
(
a_dev
,
int
(
loc
(
a_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
num
,
cuda
MemcpyHostToDevice
)
successCUDA
=
gpu
_memcpy
(
a_dev
,
int
(
loc
(
a_mat
(
1
,
1
)),
kind
=
c_intptr_t
),
&
num
,
gpu
MemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: a_dev"
,
successCUDA
)
endif
...
...
@@ -450,8 +451,8 @@ subroutine tridiag_&
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA
=
cuda
_memcpy
(
int
(
loc
(
v_row
),
kind
=
c_intptr_t
),
&
a_dev
+
a_offset
,
(
l_rows
)
*
size_of_datatype
,
cuda
MemcpyDeviceToHost
)
successCUDA
=
gpu
_memcpy
(
int
(
loc
(
v_row
),
kind
=
c_intptr_t
),
&
a_dev
+
a_offset
,
(
l_rows
)
*
size_of_datatype
,
gpu
MemcpyDeviceToHost
)
check_memcpy_cuda
(
"tridiag a_dev 1"
,
successCUDA
)
else
v_row
(
1
:
l_rows
)
=
a_mat
(
1
:
l_rows
,
l_cols
+1
)
...
...
@@ -563,19 +564,19 @@ subroutine tridiag_&
u_row
(
1
:
l_rows
)
=
0
if
(
l_rows
>
0
.and.
l_cols
>
0
)
then
if
(
useGPU
)
then
successCUDA
=
cuda
_memset
(
u_col_dev
,
0
,
l_cols
*
size_of_datatype
)
successCUDA
=
gpu
_memset
(
u_col_dev
,
0
,
l_cols
*
size_of_datatype
)
check_memcpy_cuda
(
"tridiag: u_col_dev"
,
successCUDA
)
successCUDA
=
cuda
_memset
(
u_row_dev
,
0
,
l_rows
*
size_of_datatype
)
successCUDA
=
gpu
_memset
(
u_row_dev
,
0
,
l_rows
*
size_of_datatype
)
check_memcpy_cuda
(
"tridiag: u_row_dev"
,
successCUDA
)
successCUDA
=
cuda
_memcpy
(
v_col_dev
,
int
(
loc
(
v_col
(
1
)),
kind
=
c_intptr_t
),
&
l_cols
*
size_of_datatype
,
cuda
MemcpyHostToDevice
)
successCUDA
=
gpu
_memcpy
(
v_col_dev
,
int
(
loc
(
v_col
(
1
)),
kind
=
c_intptr_t
),
&
l_cols
*
size_of_datatype
,
gpu
MemcpyHostToDevice
)
check_memcpy_cuda
(
"tridiag: v_col_dev"
,
successCUDA
)