Commit 03ef1ee4
Authored Jun 09, 2016 by Andreas Marek
Remove some unnecessary cuda_memcpy in the real case

The same should be done for the complex case.
parent 29f31c21
Changes 2    Pipelines 1
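In short, the change keeps matrices resident on the GPU between the ELPA2 compute steps: the device buffers a_dev, tmat_dev and q_dev become dummy arguments of bandred_real, trans_ev_tridi_to_band_real and trans_ev_band_to_full_real, so a buffer that one step has already allocated and filled on the device no longer has to be copied back to the host, freed, re-allocated and copied over again by the next step. The sketch below illustrates that hand-over pattern. It is not ELPA code: step_one and step_two are hypothetical stand-ins for two consecutive compute steps, the cuda_malloc/cuda_memcpy/cuda_free wrappers and the cudaMemcpy* direction constants are assumed to behave as they do in the diff further down, and error checks on successCUDA are omitted for brevity.

      ! Hedged sketch of the "device buffer passed between steps" pattern (not ELPA code).
      subroutine step_one(n, a, a_dev)
        use iso_c_binding
        implicit none
        integer, intent(in)        :: n
        real(kind=c_double)        :: a(n,n)   ! host matrix
        integer(kind=c_intptr_t)   :: a_dev    ! device copy, created in this step
        logical                    :: successCUDA

        successCUDA = cuda_malloc(a_dev, n*n*c_sizeof(a(1,1)))
        successCUDA = cuda_memcpy(a_dev, loc(a), n*n*c_sizeof(a(1,1)), cudaMemcpyHostToDevice)
        ! ... GPU work on a_dev ...
        ! No DeviceToHost copy and no cuda_free here: a_dev stays allocated
        ! and is handed to the next step as an argument.
      end subroutine step_one

      subroutine step_two(n, a, a_dev)
        use iso_c_binding
        implicit none
        integer, intent(in)        :: n
        real(kind=c_double)        :: a(n,n)
        integer(kind=c_intptr_t)   :: a_dev    ! arrives already populated by step_one
        logical                    :: successCUDA

        ! The cuda_malloc(a_dev, ...) and HostToDevice copy that used to live
        ! here are the kind of redundant transfers this commit removes.
        ! ... more GPU work on a_dev ...
        successCUDA = cuda_memcpy(loc(a), a_dev, n*n*c_sizeof(a(1,1)), cudaMemcpyDeviceToHost)   ! single final transfer
        successCUDA = cuda_free(a_dev)
      end subroutine step_two

In the diff itself this shows up as follows: a_dev and tmat_dev are created in bandred_real and passed on, and the cuda_malloc of tmat_dev and q_dev plus several host/device copies in trans_ev_band_to_full_real are commented out because those buffers now arrive as arguments, already resident on the device.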
src/elpa2.F90
@@ -183,6 +183,7 @@ contains
      integer(kind=ik)              :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: nbw, num_blocks
      real(kind=rk8), allocatable   :: tmat(:,:,:), e(:)
      integer(kind=c_intptr_t)      :: tmat_dev, q_dev, a_dev
      real(kind=c_double)           :: ttt0, ttt1, ttts   ! MPI_WTIME always needs double
      integer(kind=ik)              :: i
      logical                       :: success
@@ -322,11 +323,11 @@ contains
      ttt0 = MPI_Wtime()
      ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
      call bandred_real_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, wantDebug, useGPU, success, useQRActual)
      call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#else
      call bandred_real_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, wantDebug, useGPU, success, useQRActual)
      call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#endif
      if (.not.(success)) return
      ttt1 = MPI_Wtime()
@@ -395,11 +396,11 @@ contains
      ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
      call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
      call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                              mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                              THIS_REAL_ELPA_KERNEL)
#else
      call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
      call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                              mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                              THIS_REAL_ELPA_KERNEL)
#endif
@@ -421,10 +422,10 @@ contains
      print *, "useGPU== ", useGPU
      ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
      call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
      call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                             mpi_comm_cols, useGPU, useQRActual)
#else
      call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
      call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                             mpi_comm_cols, useGPU, useQRActual)
#endif
@@ -534,6 +535,7 @@ contains
      integer(kind=ik)              :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)              :: nbw, num_blocks
      real(kind=rk4), allocatable   :: tmat(:,:,:), e(:)
      integer(kind=c_intptr_t)      :: tmat_dev, q_dev, a_dev
      real(kind=c_double)           :: ttt0, ttt1, ttts   ! MPI_WTIME always needs double
      integer(kind=ik)              :: i
      logical                       :: success
@@ -672,11 +674,11 @@ contains
      ttt0 = MPI_Wtime()
      ttts = ttt0
#ifdef DOUBLE_PRECISION_REAL
      call bandred_real_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, wantDebug, useGPU, success, useQRActual)
      call bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#else
      call bandred_real_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, wantDebug, useGPU, success, useQRActual)
      call bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
                               tmat, tmat_dev, wantDebug, useGPU, success, useQRActual)
#endif
      if (.not.(success)) return
      ttt1 = MPI_Wtime()
@@ -745,11 +747,11 @@ contains
      ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
      call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
      call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                              mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                              THIS_REAL_ELPA_KERNEL)
#else
      call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
      call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                              mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                              THIS_REAL_ELPA_KERNEL)
#endif
@@ -771,10 +773,10 @@ contains
      print *, "useGPU== ", useGPU
      ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
      call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
      call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                             mpi_comm_cols, useGPU, useQRActual)
#else
      call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
      call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, num_blocks, mpi_comm_rows, &
                                             mpi_comm_cols, useGPU, useQRActual)
#endif
src/elpa2_compute_real_template.X90
@@ -62,11 +62,11 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
    subroutine bandred_real_double(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                                   tmat, wantDebug, useGPU, success, useQR)
    subroutine bandred_real_double(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                                   tmat, tmat_dev, wantDebug, useGPU, success, useQR)
#else
    subroutine bandred_real_single(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                                   tmat, wantDebug, useGPU, success, useQR)
    subroutine bandred_real_single(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
                                   tmat, tmat_dev, wantDebug, useGPU, success, useQR)
#endif
!-------------------------------------------------------------------------------
@@ -135,6 +135,7 @@
real(kind=REAL_DATATYPE) :: dwork_size(1)
real(kind=REAL_DATATYPE), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
! a_dev is passed from bandred_real to trans_ev_band
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
@@ -1240,7 +1241,9 @@
1.0_rk4, (a_dev+(lcs-1)*lda*size_of_single_real_datatype), lda)
#endif
enddo
else ! do not useGPU
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
#ifdef DOUBLE_PRECISION_REAL
@@ -1422,6 +1425,7 @@
enddo ! istep
if (useGPU) then
! this is not needed since a_dev is passed along from one subroutine to the other
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy ( loc (a), a_dev, lda*na_cols*size_of_double_real_datatype,cudaMemcpyDeviceToHost)
#else
@@ -1438,11 +1442,12 @@
stop
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree"
stop
endif
! this is not necessart tmat_dev is passed (unchanged) from one routine to the other
! successCUDA = cuda_free(tmat_dev)
! if (.not.(successCUDA)) then
! print *,"bandred_real: error in cudaFree"
! stop
! endif
successCUDA = cuda_free(vav_dev)
if (.not.(successCUDA)) then
@@ -1609,11 +1614,11 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
    subroutine trans_ev_band_to_full_real_double(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, mpi_comm_rows, &
                                                 mpi_comm_cols, useGPU, useQR)
    subroutine trans_ev_band_to_full_real_double(na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, &
                                                 numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU, useQR)
#else
    subroutine trans_ev_band_to_full_real_single(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, mpi_comm_rows, &
                                                 mpi_comm_cols, useGPU, useQR)
    subroutine trans_ev_band_to_full_real_single(na, nqc, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, matrixCols, &
                                                 numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU, useQR)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real:
@@ -1663,6 +1668,9 @@
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
max_local_cols
@@ -1671,6 +1679,10 @@
real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
! hvm_dev is fist used and set in this routine
! q is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
! tmat_dev is passed along from bandred_real
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmp_dev, tmat_dev
integer(kind=ik) :: i
@@ -1743,37 +1755,45 @@
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_real_datatype)
#else
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
! already existent on GPU
!#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
!#else
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_dev already living on device
!#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_real_datatype)
!#else
! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_real_datatype)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
!#ifdef DOUBLE_PRECISION_REAL
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_double_real_datatype, cudaMemcpyHostToDevice)
!#else
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_single_real_datatype, cudaMemcpyHostToDevice)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_double_real_datatype, cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_single_real_datatype, cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_double_real_datatype)
#else
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_single_real_datatype)
@@ -1868,7 +1888,11 @@
q_dev, ldq , 0.0_rk4, tmp_dev, n_cols)
#endif
#ifdef WITH_MPI
! copy data from device to host for a later MPI_ALLREDUCE
#ifdef DOUBLE_PRECISION_REAL
! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_double_real_datatype, cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_single_real_datatype, cudaMemcpyDeviceToHost)
@@ -1877,19 +1901,31 @@
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif /* WITH_MPI */
else
!#ifdef WITH_GPU_VERSION
! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_real_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_real: error in cudaMemset"
! stop
! endif
!
!#else
else ! l_rows>0
#ifdef WITH_MPI
tmp1(1:l_cols*n_cols) = 0
!#endif
#else
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
#ifdef WITH_GPU_VERSION
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_double_real_datatype)
#else
successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemset"
stop
endif
#endif
#endif /* WITH_MPI */
endif ! l_rows>0
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost)
@@ -1918,7 +1954,10 @@
!#endif
if (l_rows>0) then
#ifdef WITH_MPI
! after the allreduce we have to copy back to the device
#ifdef DOUBLE_PRECISION_REAL
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_double_real_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_single_real_datatype,cudaMemcpyHostToDevice)
@@ -1927,15 +1966,21 @@
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif /* WITH_MPI */
! already existend on GPU
!#ifdef DOUBLE_PRECISION_REAL
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
!
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
!#else
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
#ifdef DOUBLE_PRECISION_REAL
call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk8, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk8, hvm_dev, max_local_rows, &
@@ -1946,15 +1991,17 @@
tmp_dev, n_cols, 1.0_rk4, q_dev, ldq)
#endif
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
!#ifdef DOUBLE_PRECISION_REAL
! ! copy to host maybe this can be avoided
! ! this is not necessary hvm is not used anymore
! successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
!#else
! successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
endif ! l_rows > 0
!#ifdef WITH_GPU_VERSION
@@ -2108,13 +2155,15 @@
q, ldq, 0.0_rk4, tmp1, n_cols)
#endif
else
else ! l_rows>0
#ifdef DOUBLE_PRECISION_REAL
tmp1(1:l_cols*n_cols) = 0._rk8
#else
tmp1(1:l_cols*n_cols) = 0._rk4
#endif
endif
endif ! l_rows>0
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
@@ -2167,6 +2216,7 @@
stop
endif
#ifdef DOUBLE_PRECISION_REAL
! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*size_of_double_real_datatype, cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*size_of_single_real_datatype, cudaMemcpyDeviceToHost)
@@ -3336,11 +3386,11 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
    subroutine trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
    subroutine trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                                  mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                                  THIS_REAL_ELPA_KERNEL)
#else
    subroutine trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, &
    subroutine trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, hh_trans_real, &
                                                  mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
                                                  THIS_REAL_ELPA_KERNEL)
#endif
@@ -3389,6 +3439,9 @@
#else
real(kind=REAL_DATATYPE) :: q(ldq,matrixCols)
#endif
integer(kind=c_intptr_t) :: q_dev
real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
@@ -3411,12 +3464,12 @@
      logical                                 :: flag
#ifdef WITH_OPENMP
      real(kind=REAL_DATATYPE), pointer       :: a(:,:,:,:)
      real(kind=REAL_DATATYPE), pointer       :: aIntern(:,:,:,:)
#else
      real(kind=REAL_DATATYPE), pointer       :: a(:,:,:)
      real(kind=REAL_DATATYPE), pointer       :: aIntern(:,:,:)
#endif
      real(kind=REAL_DATATYPE)                :: a_real
      type(c_ptr)                             :: a_ptr
      type(c_ptr)                             :: aIntern_ptr
      real(kind=REAL_DATATYPE), allocatable   :: row(:)
      real(kind=REAL_DATATYPE), allocatable   :: row_group(:,:)
@@ -3438,7 +3491,7 @@
!     real*8, allocatable, device :: hh_dot_dev(:)
!     real*8, allocatable, device :: hh_tau_dev(:)
      integer(kind=c_intptr_t)                :: a_dev
      integer(kind=c_intptr_t)                :: aIntern_dev
      integer(kind=c_intptr_t)                :: bcast_buffer_dev
      integer(kind=c_size_t)                  :: num
      integer(kind=c_size_t)                  :: dev_offset, dev_offset_1
@@ -3588,17 +3641,17 @@
      if (useGPU) then
#ifdef DOUBLE_PRECISION_REAL
        num = (stripe_width*a_dim2*stripe_count)*size_of_double_real_datatype
        successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_double_real_datatype)
        successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*size_of_double_real_datatype)
#else
        num = (stripe_width*a_dim2*stripe_count)*size_of_single_real_datatype
        successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_single_real_datatype)
        successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*size_of_single_real_datatype)
#endif
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage
          stop
        endif
        successCUDA = cuda_memset(a_dev , 0, num)
        successCUDA = cuda_memset(aIntern_dev , 0, num)
        if (.not.(successCUDA)) then
          print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
          stop
@@ -3606,32 +3659,32 @@
      else ! GPUs are not used
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: a
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif
#ifdef WITH_OPENMP
        if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then
          print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
        if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_real)) /= 0) then
          print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
          stop
        endif
        call c_f_pointer(a_ptr, a, [stripe_width,a_dim2,stripe_count,max_threads])
        ! allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
        call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
        ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
        ! a(:,:,:,:) should be set to 0 in a parallel region, not here!
        ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
        if (posix_memalign(a_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then
          print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
        if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_real)) /= 0) then
          print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
          stop
        endif
        call c_f_pointer(a_ptr, a,[stripe_width,a_dim2,stripe_count] )
        !allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
        call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
        !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
#ifdef DOUBLE_PRECISION_REAL
        a(:,:,:) = 0._rk8
        aIntern(:,:,:) = 0._rk8
#else
        a(:,:,:) = 0._rk4
        aIntern(:,:,:) = 0._rk4
#endif
#endif
@@ -3715,9 +3768,9 @@
      !$omp parallel do private(my_thread), schedule(static, 1)
      do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
        a(:,:,:,my_thread) = 0.0_rk8 ! if possible, do first touch allocation!
        aIntern(:,:,:,my_thread) = 0.0_rk8 ! if possible, do first touch allocation!
#else
        a(:,:,:,my_thread) = 0.0_rk4 ! if possible, do first touch allocation!
        aIntern(:,:,:,my_thread) = 0.0_rk4 ! if possible, do first touch allocation!
#endif
      enddo
      !$omp end parallel do
@@ -3766,10 +3819,10 @@
          !$omp parallel do private(my_thread), schedule(static, 1)
          do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
            call unpack_row_real_cpu_openmp_double(a, row, i-limits(ip), my_thread, stripe_count, &
            call unpack_row_real_cpu_openmp_double(aIntern, row, i-limits(ip), my_thread, stripe_count, &
                                                   thread_width, stripe_width, l_nev)
#else
            call unpack_row_real_cpu_openmp_single(a, row, i-limits(ip), my_thread, stripe_count, &
            call unpack_row_real_cpu_openmp_single(aIntern, row, i-limits(ip), my_thread, stripe_count, &
                                                   thread_width, stripe_width, l_nev)
#endif
          enddo
@@ -3788,11 +3841,11 @@
          if (useGPU) then
            ! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
            call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, a_dev, stripe_count, &
            call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, &
                                                              stripe_width, last_stripe_width, a_dim2, l_nev,&
                                                              row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#else
            call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, a_dev, stripe_count, &
            call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
                                                              stripe_width, last_stripe_width, a_dim2, l_nev,&
                                                              row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#endif
@@ -3823,9 +3876,9 @@
#endif /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
            call unpack_row_real_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
            call unpack_row_real_cpu_double(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#else
            call unpack_row_real_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
            call unpack_row_real_cpu_single(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
          endif
@@ -3852,10 +3905,10 @@
          !$omp parallel do private(my_thread), schedule(static, 1)
          do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
            call unpack_row_real_cpu_openmp_double(a, row, i-limits(ip), my_thread, &
            call unpack_row_real_cpu_openmp_double(aIntern, row, i-limits(ip), my_thread, &
                                                   stripe_count, thread_width, stripe_width, l_nev)
#else
            call unpack_row_real_cpu_openmp_single(a, row, i-limits(ip), my_thread, &
            call unpack_row_real_cpu_openmp_single(aIntern, row, i-limits(ip), my_thread, &
                                                   stripe_count, thread_width, stripe_width, l_nev)
#endif
          enddo
@@ -3873,11 +3926,11 @@
          if (useGPU) then
            ! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
            call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, a_dev, stripe_count, &
            call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, &
                                                              stripe_width, last_stripe_width, a_dim2, l_nev,&
                                                              row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
#else
            call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, a_dev, stripe_count, &
            call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
                                                              stripe_width, last_stripe_width, a_dim2, l_nev,&
                                                              row_group_size, nblk, unpack_idx, i - limits(ip), .false.)
@@ -3885,9 +3938,9 @@
            row_group(:, row_group_size) = q(src_offset, 1:l_nev)
          else
#ifdef DOUBLE_PRECISION_REAL
            call unpack_row_real_cpu_double(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
            call unpack_row_real_cpu_double(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#else
            call unpack_row_real_cpu_single(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
            call unpack_row_real_cpu_single(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
          endif
#endif /* WITH_OPENMP */
@@ -3966,10 +4019,10 @@
          !$omp parallel do private(my_thread), schedule(static, 1)
          do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
            call unpack_row_real_cpu_openmp_double(a, row, i-limits(my_prow), my_thread, &
            call unpack_row_real_cpu_openmp_double(aIntern, row, i-limits(my_prow), my_thread, &
                                                   stripe_count, thread_width, stripe_width, l_nev)
#else
            call unpack_row_real_cpu_openmp_single(a, row, i-limits(my_prow), my_thread, &
            call unpack_row_real_cpu_openmp_single(aIntern, row, i-limits(my_prow), my_thread, &
                                                   stripe_count, thread_width, stripe_width, l_nev)
#endif
          enddo
@@ -3986,11 +4039,11 @@
          if (useGPU) then
            ! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
            call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
            call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
                                                              last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
                                                              unpack_idx, i - limits(my_prow), .false.)