Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
ba80938c
Commit
ba80938c
authored
Dec 14, 2021
by
Andreas Marek
Browse files
Merge branch 'cusolver_passing_ci' into master_pre_stage
parents
1bbf7864
6b96dfde
Changes
5
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/cholesky/GPU/CUDA/elpa_cholesky_cuda.cu
View file @
ba80938c
...
...
@@ -62,7 +62,7 @@
// Copy one block of the local matrix a_dev into the (transposed) work array
// tmatc_dev for the Cholesky factorization.
//
// Indexing is 1-based (Fortran convention), hence the "+1"/"-1" adjustments.
// Expected launch layout (inferred from the index ranges below — confirm
// against the host wrapper, which is outside this view):
//   blockDim.x = nblk              -> ii_index in 1..nblk (row within block)
//   gridDim.x  = l_cols - l_colx + 1 -> jj_index in 1..l_cols-l_colx+1 (column)
// a_dev is the local matrix with leading dimension matrixRows; tmatc_dev has
// leading dimension l_cols. No bounds check is needed as long as the launch
// configuration matches the ranges above.
//
// NOTE(review): the scraped diff showed jj_index declared twice (old and new
// line of the diff); a single declaration with the corrected range comment is
// what the committed file must contain.
__global__ void copy_double_a_tmatc_kernel(double *a_dev, double *tmatc_dev,
                                           const int l_cols, const int matrixRows,
                                           const int l_colx, const int l_row1,
                                           const int nblk) {

  int ii_index = threadIdx.x + 1; // range 1..nblk
  int jj_index = blockIdx.x + 1;  // range 1..l_cols-l_colx+1

  tmatc_dev[l_colx - 1 + jj_index - 1 + (ii_index - 1) * l_cols] =
      a_dev[l_row1 - 1 + ii_index - 1 + (l_colx - 1 + jj_index - 1) * matrixRows];
}
...
...
@@ -86,7 +86,7 @@ extern "C" void copy_double_a_tmatc_FromC(double *a_dev, double *tmatc_dev, int
// Single-precision variant of copy_double_a_tmatc_kernel: copy one block of
// the local matrix a_dev into the (transposed) work array tmatc_dev.
//
// Indexing is 1-based (Fortran convention), hence the "+1"/"-1" adjustments.
// Expected launch layout (inferred from the index ranges below — confirm
// against the host wrapper, which is outside this view):
//   blockDim.x = nblk              -> ii_index in 1..nblk (row within block)
//   gridDim.x  = l_cols - l_colx + 1 -> jj_index in 1..l_cols-l_colx+1 (column)
// a_dev has leading dimension matrixRows; tmatc_dev has leading dimension
// l_cols.
//
// NOTE(review): the scraped diff showed jj_index declared twice (old and new
// line of the diff); a single declaration with the corrected range comment is
// what the committed file must contain.
__global__ void copy_float_a_tmatc_kernel(float *a_dev, float *tmatc_dev,
                                          const int l_cols, const int matrixRows,
                                          const int l_colx, const int l_row1,
                                          const int nblk) {

  int ii_index = threadIdx.x + 1; // range 1..nblk
  int jj_index = blockIdx.x + 1;  // range 1..l_cols-l_colx+1

  tmatc_dev[l_colx - 1 + jj_index - 1 + (ii_index - 1) * l_cols] =
      a_dev[l_row1 - 1 + ii_index - 1 + (l_colx - 1 + jj_index - 1) * matrixRows];
}
...
...
@@ -110,7 +110,7 @@ extern "C" void copy_float_a_tmatc_FromC(float *a_dev, float *tmatc_dev, int *nb
// Double-complex variant: copy one block of the local matrix a_dev into the
// work array tmatc_dev, CONJUGATING each element on the way (cuConj) — the
// complex analogue of the transpose-copy done by the real kernels.
//
// Indexing is 1-based (Fortran convention), hence the "+1"/"-1" adjustments.
// Expected launch layout (inferred from the index ranges below — confirm
// against the host wrapper, which is outside this view):
//   blockDim.x = nblk (per the range comment) -> ii_index in 1..nblk
//   gridDim.x  = l_cols - l_colx + 1          -> jj_index in 1..l_cols-l_colx+1
// Unlike the real-valued kernels this signature carries no nblk parameter;
// the thread range alone fixes ii_index.
//
// NOTE(review): the scraped diff showed jj_index declared twice (old and new
// line of the diff); a single declaration with the corrected range comment is
// what the committed file must contain.
__global__ void copy_double_complex_a_tmatc_kernel(cuDoubleComplex *a_dev,
                                                   cuDoubleComplex *tmatc_dev,
                                                   const int l_cols, const int matrixRows,
                                                   const int l_colx, const int l_row1) {

  int ii_index = threadIdx.x + 1; // range 1..nblk
  int jj_index = blockIdx.x + 1;  // range 1..l_cols-l_colx+1

  tmatc_dev[l_colx - 1 + jj_index - 1 + (ii_index - 1) * l_cols] =
      cuConj(a_dev[l_row1 - 1 + ii_index - 1 + (l_colx - 1 + jj_index - 1) * matrixRows]);
}
...
...
@@ -137,7 +137,7 @@ extern "C" void copy_double_complex_a_tmatc_FromC(double _Complex *a_dev, double
// Single-complex variant: copy one block of the local matrix a_dev into the
// work array tmatc_dev, CONJUGATING each element on the way (cuConjf).
//
// Indexing is 1-based (Fortran convention), hence the "+1"/"-1" adjustments.
// Expected launch layout (inferred from the index ranges below — confirm
// against the host wrapper, which is outside this view):
//   blockDim.x = nblk (per the range comment) -> ii_index in 1..nblk
//   gridDim.x  = l_cols - l_colx + 1          -> jj_index in 1..l_cols-l_colx+1
// Like the double-complex kernel, this signature carries no nblk parameter;
// the thread range alone fixes ii_index.
//
// NOTE(review): the scraped diff showed jj_index declared twice (old and new
// line of the diff); a single declaration with the corrected range comment is
// what the committed file must contain.
__global__ void copy_float_complex_a_tmatc_kernel(cuFloatComplex *a_dev,
                                                  cuFloatComplex *tmatc_dev,
                                                  const int l_cols, const int matrixRows,
                                                  const int l_colx, const int l_row1) {

  int ii_index = threadIdx.x + 1; // range 1..nblk
  int jj_index = blockIdx.x + 1;  // range 1..l_cols-l_colx+1

  tmatc_dev[l_colx - 1 + jj_index - 1 + (ii_index - 1) * l_cols] =
      cuConjf(a_dev[l_row1 - 1 + ii_index - 1 + (l_colx - 1 + jj_index - 1) * matrixRows]);
}
...
...
src/cholesky/elpa_cholesky_template.F90
View file @
ba80938c
...
...
@@ -55,7 +55,6 @@
use
mod_check_for_gpu
use
invert_trm_cuda
,
only
:
copy_PRECISION_tmp1_tmp2
,
&
copy_PRECISION_a_tmp1
use
cholesky_cuda
implicit
none
#include "../general/precision_kinds.F90"
...
...
@@ -498,8 +497,11 @@
#else
tmp1_mpi_dev
=
transfer
(
tmp1_dev
,
tmp1_mpi_dev
)
! and associate a fortran pointer
call
c_f_pointer
(
tmp1_mpi_dev
,
tmp1_mpi_fortran_ptr
,
[
nblk
*
nblk
])
call
c_f_pointer
(
tmp1_mpi_dev
,
tmp1_mpi_fortran_ptr
,
[
nblk
,
nblk
])
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cuda_aware_device_synchronize"
)
successGPU
=
gpu_devicesynchronize
()
check_memcpy_gpu
(
"cholesky: device_synchronize"
,
successGPU
)
if
(
wantDebug
)
call
obj
%
timer
%
stop
(
"cuda_aware_device_synchronize"
)
call
obj
%
timer
%
start
(
"mpi_cuda_communication"
)
call
MPI_Bcast
(
tmp1_mpi_fortran_ptr
,
int
(
nblk
*
(
nblk
+1
)/
2
,
kind
=
MPI_KIND
),
&
...
...
@@ -559,7 +561,9 @@
if
(
useGPU
)
then
if
(
my_prow
==
prow
(
n
,
nblk
,
np_rows
))
then
call
copy_PRECISION_a_tmatc
(
a_dev
,
tmatc_dev
,
nblk
,
matrixRows
,
l_cols
,
l_colx
,
l_row1
)
! if l_cols-l_colx+1 == 0 kernel launch with 0 blocks => raises error
if
(
l_cols
-
l_colx
+1
>
0
)
&
call
copy_PRECISION_a_tmatc
(
a_dev
,
tmatc_dev
,
nblk
,
matrixRows
,
l_cols
,
l_colx
,
l_row1
)
endif
else
! useGPU
do
i
=
1
,
nblk
...
...
@@ -600,6 +604,11 @@
tmatc_mpi_dev
=
transfer
(
tmatc_dev
,
tmatc_mpi_dev
)
! and associate a fortran pointer
call
c_f_pointer
(
tmatc_mpi_dev
,
tmatc_mpi_fortran_ptr
,
[
l_cols
,
nblk
])
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"cuda_aware_device_synchronize"
)
successGPU
=
gpu_devicesynchronize
()
check_memcpy_gpu
(
"cholesky: device_synchronize"
,
successGPU
)
if
(
wantDebug
)
call
obj
%
timer
%
stop
(
"cuda_aware_device_synchronize"
)
do
i
=
1
,
nblk
call
obj
%
timer
%
start
(
"mpi_cuda_communication"
)
...
...
src/invert_trm/GPU/CUDA/elpa_invert_trm_cuda.cu
View file @
ba80938c
...
...
@@ -614,3 +614,4 @@ extern "C" void copy_float_complex_a_tmp1_FromC(float _Complex *a_dev, float _Co
printf
(
"Error in executing copy_float_complex_a_tmp1_kernel: %s
\n
"
,
cudaGetErrorString
(
cuerr
));
}
}
src/invert_trm/elpa_invert_trm_template.F90
View file @
ba80938c
...
...
@@ -94,13 +94,13 @@
integer
(
kind
=
c_int
)
::
gpu
,
numGPU
integer
(
kind
=
c_intptr_t
)
::
tmat1_dev
,
tmat2_dev
,
a_dev
,
tmp1_dev
,
tmp2_dev
,
zero_dev
type
(
c_ptr
)
::
tmp1_mpi_dev
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
tmp1_mpi_fortran_ptr
(:
,:
)
type
(
c_ptr
)
::
tmat1_mpi_dev
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
tmat1_mpi_fortran_ptr
(:,:)
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
tmp1_mpi_fortran_ptr
(:)
type
(
c_ptr
)
::
tmat1_mpi_dev
,
tmat2_mpi_dev
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
tmat1_mpi_fortran_ptr
(:,:)
,
tmat2_mpi_fortran_ptr
(:,:)
type
(
c_ptr
)
::
tmp2_mpi_dev
,
a_mpi_dev
integer
(
kind
=
c_intptr_t
)
::
a_off
,
tmat2_off
,
tmp1_off
,
tmp2_off
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
a_mpi_deviceptr
(:,:)
MATH_DATATYPE
(
kind
=
rck
),
pointer
::
a_mpi_deviceptr
(:,:)
,
initializer_ptr
(:)
!DEB
integer
(
kind
=
c_intptr_t
)
::
num
integer
(
kind
=
c_int
)
::
gpu_invert_trm
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
...
...
@@ -399,9 +399,16 @@
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("
mpi_communication
")
#else
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk*nblk])
if (wantDebug) call obj%timer%start("
cuda_aware_device_synchronize
")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("
invert_trm
:
device_synchronize
", successGPU)
if (wantDebug) call obj%timer%stop("
cuda_aware_device_synchronize
")
if (wantDebug) call obj%timer%start("
cuda_mpi_communication
")
call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nb*(nb+1)/2,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
...
...
@@ -437,7 +444,6 @@
call gpublas_PRECISION_TRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, ONE, tmp2_dev, &
nblk, a_dev+a_off, matrixRows)
!successGPU = gpu_devicesynchronize()
endif
call obj%timer%stop("
gpublas
")
...
...
@@ -497,7 +503,10 @@
tmat1_mpi_dev = transfer(tmat1_dev, tmat1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmat1_mpi_dev, tmat1_mpi_fortran_ptr, [l_rows,nblk])
if (wantDebug) call obj%timer%start("
cuda_aware_device_synchronize
")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("
invert_trm
:
device_synchronize
", successGPU)
if (wantDebug) call obj%timer%stop("
cuda_aware_device_synchronize
")
call obj%timer%start("
mpi_cuda_communication
")
do i=1,nb
call MPI_Bcast(tmat1_mpi_fortran_ptr(1,i), int(l_row1-1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
...
...
@@ -534,9 +543,6 @@
check_memcpy_gpu("
elpa_invert_trm
:
tmat2_dev
to
tmat2
", successGPU)
endif
endif
#else
#error "
not
yet
implemented
"
#endif
call obj%timer%start("
mpi_communication
")
if (l_cols-l_col1+1 > 0) &
...
...
@@ -545,7 +551,6 @@
call obj%timer%stop("
mpi_communication
")
#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
if (l_cols-l_col1+1 > 0) then
num = nblk*l_cols*size_of_datatype
...
...
@@ -553,11 +558,23 @@
gpuMemcpyHostToDevice)
check_memcpy_gpu("
elpa_invert_trm
:
tmat2
to
tmat2_dev
", successGPU)
endif
endif
#else
#error "
not
yet
implemented
"
#endif
tmat2_mpi_dev = transfer(tmat2_dev, tmat2_mpi_dev)
call c_f_pointer(tmat2_mpi_dev, tmat2_mpi_fortran_ptr, [nblk,l_cols])
if (wantDebug) call obj%timer%start("
cuda_aware_device_synchronize
")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("
invert_trm
:
device_synchronize
", successGPU)
if (wantDebug) call obj%timer%stop("
cuda_aware_device_synchronize
")
call obj%timer%start("
mpi_cuda_communication
")
if (l_cols-l_col1+1 > 0) &
call MPI_Bcast(tmat2_mpi_fortran_ptr(1,l_col1), int((l_cols-l_col1+1)*nblk,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION, int(prow(n, nblk, np_rows),kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("
mpi_cuda_communication
")
endif
#
endif
#endif /* WITH_MPI */
if (useGPU) then
...
...
src/invert_trm/mod_invert_trm_cuda.F90
View file @
ba80938c
...
...
@@ -492,6 +492,5 @@ module invert_trm_cuda
#endif
end
subroutine
end
module
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment