elpa / elpa / Commits / ff3f445a

Commit ff3f445a authored Feb 22, 2021 by Andreas Marek

Change dimension of buffer arrays in trans_ev_tridi_to_band

parent f0bdb04a
Changes 3    Pipelines 1
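All three files below serve one change: in trans_ev_tridi_to_band the four border communication buffers become flattened 2-dimensional arrays (stripe_width*nbw or stripe_width*nbw*max_threads by stripe_count) instead of 3-dimensional (stripe_width, nbw, stripe_count) arrays, and the copy sites move data between the flattened buffers and aIntern with reshape. The stand-alone sketch below is not part of the commit (array sizes and the plain real data type are chosen only for illustration); it merely shows that the flattening is a pure layout change in column-major Fortran and that reshape recovers the original slices.

! Minimal, self-contained sketch (not ELPA code): it illustrates how a
! 3-dimensional buffer buf3(stripe_width, nbw, stripe_count) maps onto the
! flattened 2-dimensional layout buf2(stripe_width*nbw, stripe_count)
! introduced by this commit, and that reshape() recovers the original slices.
program buffer_flattening_sketch
  implicit none
  integer, parameter :: stripe_width = 4, nbw = 3, stripe_count = 2
  real :: buf3(stripe_width, nbw, stripe_count)
  real :: buf2(stripe_width*nbw, stripe_count)
  integer :: i, j, k

  ! fill the 3D buffer with recognizable values
  do i = 1, stripe_count
    do j = 1, nbw
      do k = 1, stripe_width
        buf3(k, j, i) = 100*i + 10*j + k
      end do
    end do
  end do

  ! flatten stripe by stripe: in column-major order the first two dimensions
  ! are contiguous, so a plain reshape is enough
  do i = 1, stripe_count
    buf2(1:stripe_width*nbw, i) = reshape(buf3(:, :, i), (/stripe_width*nbw/))
  end do

  ! going back: the pattern used at the copy sites in the diff
  if (all(reshape(buf2(1:stripe_width*nbw, 1), (/stripe_width, nbw/)) == buf3(:, :, 1))) then
    print *, "flattened 2D layout is element-for-element identical to the 3D layout"
  end if
end program buffer_flattening_sketch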
src/elpa2/elpa2_bandred_template.F90
...
...
@@ -629,65 +629,65 @@ max_threads)
      aux1 = 0.0_rck

#ifdef WITH_OPENMP_TRADITIONAL
#if 0
      ! original complex implementation without openmp. check performance
      nlc = 0 ! number of local columns
      do j=1,lc-1
        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
        if (lcx>0) then
          nlc = nlc+1
          aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
        endif
      enddo

      ! Get global dot products
#ifdef WITH_MPI
      if (wantDebug) call obj%timer%start("mpi_communication")
      if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
                                    int(mpi_comm_rows,kind=MPI_KIND), mpierr)

      ! Transform
      nlc = 0
      do j=1,lc-1
        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
        if (lcx>0) then
          nlc = nlc+1
          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
        endif
      enddo

      if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
      ! Transform
      nlc = 0
      do j=1,lc-1
        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
        if (lcx>0) then
          nlc = nlc+1
          a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
        endif
      enddo
#endif /* WITH_MPI */
!#if 0
! ! original complex implementation without openmp. check performance
! nlc = 0 ! number of local columns
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
! endif
! enddo
!
! ! Transform
! ! Get global dot products
!#ifdef WITH_MPI
! if (wantDebug) call obj%timer%start("mpi_communication")
! if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
! int(mpi_comm_rows,kind=MPI_KIND), mpierr)
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
#endif /* if 0 */
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!
! endif
! enddo
!
!
! if (wantDebug) call obj%timer%stop("mpi_communication")
!
!#else /* WITH_MPI */
!
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
! endif
! enddo
!
!#endif /* WITH_MPI */
!!
!! ! Transform
!!
!! nlc = 0
!! do j=1,lc-1
!! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!! if (lcx>0) then
!! nlc = nlc+1
!! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!
!! endif
!! enddo
!#endif /* if 0 */
!Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
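The two comment lines above describe keeping one long-lived OpenMP region and letting a single thread issue the MPI reduction, which forces two barriers around that call. The subroutine below is a generic sketch of that pattern only, with made-up names and real(kind=8) data instead of ELPA's MATH_DATATYPE; it is meant to be called from inside an existing !$omp parallel region and is not the ELPA implementation.

! Generic sketch (not ELPA code): inside one long-lived parallel region the
! MPI_Allreduce must be executed by a single thread, so it is fenced by two
! barriers -- one so that all threads have finished writing their partial
! results, one so that no thread reads the reduced values before they exist.
subroutine allreduce_inside_parallel_region(aux1, aux2, n, comm)
  use mpi
  implicit none
  integer, intent(in)       :: n, comm
  real(kind=8), intent(in)  :: aux1(n)
  real(kind=8), intent(out) :: aux2(n)
  integer :: ierr

  !$omp barrier        ! every thread must have finished writing its part of aux1
  !$omp master
  call MPI_Allreduce(aux1, aux2, n, MPI_REAL8, MPI_SUM, comm, ierr)
  !$omp end master
  !$omp barrier        ! no thread may read aux2 before the master has the result
end subroutine allreduce_inside_parallel_region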
...
...
@@ -909,35 +909,35 @@ max_threads)
        ! of the tiles, so we can use strips of the matrix
#if 0
        ! original complex implemetation check for performance
        umcCPU(1:l_cols,1:n_cols) = 0.0_rck
        vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck

        if (l_cols>0 .and. l_rows>0) then
          do i=0,(istep*nbw-1)/tile_size

            lcs = i*l_cols_tile+1
            lce = min(l_cols,(i+1)*l_cols_tile)
            if (lce<lcs) cycle

            lre = min(l_rows,(i+1)*l_rows_tile)

            call obj%timer%start("blas")
            call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
                                vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
            call obj%timer%stop("blas")

            if (i==0) cycle
            lre = min(l_rows,i*l_rows_tile)
            call obj%timer%start("blas")
            call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
                                umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
            call obj%timer%stop("blas")
          enddo
        endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */
!Code for Algorithm 4
...
...
@@ -1396,7 +1396,11 @@ max_threads)
      ! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP_TRADITIONAL
-      !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
+      !$omp parallel &
+      !$omp default(none) &
+      !$omp private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) &
+      !$omp shared(n_threads, istep, tile_size, nbw, n_cols, obj, vmrcpu, l_cols_tile, l_rows, l_rows_tile, &
+      !$omp& umccpu, l_cols, a_dev, vmr_dev, useGPU, cur_l_rows, umc_dev, cur_l_cols, lda )
      n_threads = omp_get_num_threads()
      if (mod(n_threads, 2) == 0) then
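The replacement directive adds default(none) together with an explicit shared list, so every variable referenced in the parallel region must be scoped by hand; anything forgotten becomes a compile-time error instead of silently defaulting to shared. A minimal stand-alone illustration of that behaviour (generic names, not ELPA code):

program default_none_sketch
  use omp_lib
  implicit none
  integer :: i
  integer :: counts(64)

  counts = 0
  ! default(none) forces an explicit scoping decision for every variable that
  ! appears in the region; forgetting one is a compile-time error instead of a
  ! silent (and possibly racy) default(shared)
  !$omp parallel default(none) private(i) shared(counts)
  i = omp_get_thread_num() + 1
  if (i <= size(counts)) counts(i) = 1
  !$omp end parallel

  print *, 'active thread slots:', count(counts == 1)
end program default_none_sketch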
...
...
@@ -1424,13 +1428,30 @@ max_threads)
          myend  = mystart + work_per_thread - 1
          if (myend > lre) myend = lre
          if (myend-mystart+1 < 1) cycle
-          call obj%timer%start("blas")
-          call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
-                              int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
-                              vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
-                              umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
-                              ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND))
-          call obj%timer%stop("blas")
+          if (useGPU) then
+            if (n_way .gt. 1) then
+              print *, "error more than 1 openmp thread used in GPU part of elpa2_bandred"
+              print *, "this should never happen"
+              stop
+            endif
+            call obj%timer%start("cublas")
+            call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
+                                       lce-lcs+1, 2*n_cols, -ONE, &
+                                       vmr_dev, cur_l_rows, (umc_dev+(lcs-1)* &
+                                       size_of_datatype), &
+                                       cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
+                                       size_of_datatype), lda)
+            call obj%timer%stop("cublas")
+          else
+            call obj%timer%start("blas")
+            call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
+                                int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
+                                vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
+                                umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
+                                ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND))
+            call obj%timer%stop("blas")
+          endif
        enddo
!$omp end parallel
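The comment at the top of this hunk states the goal, A = A - V*U**T - U*V**T, yet each thread issues only one (cu)BLAS GEMM per tile, with inner dimension 2*n_cols. That is consistent with the V and U blocks being stored side by side in vmrCPU and umcCPU, so a single product covers both terms; the stand-alone sketch below (generic names, real(kind=8) data, not taken from ELPA) checks the underlying block-matrix identity with matmul.

! Stand-alone sketch (not ELPA code) of the trick behind a single GEMM with
! inner dimension 2*n_cols: if the columns of V and U are stored side by side,
! one product [V U] * [U V]**T equals V*U**T + U*V**T, so the two-sided update
! A = A - V*U**T - U*V**T needs only one GEMM call.
program fused_rank_2k_sketch
  implicit none
  integer, parameter :: m = 5, n_cols = 3
  real(kind=8) :: V(m, n_cols), U(m, n_cols)
  real(kind=8) :: VU(m, 2*n_cols), UV(m, 2*n_cols)
  real(kind=8) :: two_gemms(m, m), one_gemm(m, m)

  call random_number(V)
  call random_number(U)

  VU(:, 1:n_cols)          = V
  VU(:, n_cols+1:2*n_cols) = U
  UV(:, 1:n_cols)          = U
  UV(:, n_cols+1:2*n_cols) = V

  two_gemms = matmul(V, transpose(U)) + matmul(U, transpose(V))
  one_gemm  = matmul(VU, transpose(UV))

  print *, 'max difference between fused and separate updates:', &
           maxval(abs(one_gemm - two_gemms))
end program fused_rank_2k_sketch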
...
...
src/elpa2/elpa2_template.F90
...
...
@@ -391,6 +391,18 @@
        success = .false.
        return
      endif
+#ifdef WITH_OPENMP_TRADITIONAL
+      ! check the number of threads that ELPA should use internally
+      ! in the GPU case at the moment only _1_ thread internally is allowed
+      call obj%get("omp_threads", nrThreads, error)
+      if (nrThreads .ne. 1) then
+        print *, "Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
+        print *, "setting 1 openmp thread now"
+        call obj%set("omp_threads", 1, error)
+        nrThreads = 1
+        call omp_set_num_threads(nrThreads)
+      endif
+#endif
      call obj%timer%stop("check_for_gpu")
    endif
...
...
@@ -518,7 +530,7 @@
endif
#endif
#endif
#endif /* REALCASE == 1 */
! consistency check: is user set kernel still identical with "kernel" or did
! we change it above? This is a mess and should be cleaned up
...
...
@@ -614,9 +626,7 @@
      else
        useQR = .false.
      endif
#endif
#endif /* REALCASE == 1 */
#if REALCASE == 1
      useQRActual = .false.
...
...
@@ -1009,8 +1019,6 @@
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
      call omp_set_num_threads(omp_threads_caller)
#endif
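Together with the nrThreads handling added in the first hunk of this file, this restores the caller's OpenMP thread count after ELPA is done. A generic stand-alone sketch of that save/override/restore pattern (made-up variable names, not the ELPA template itself):

program thread_count_restore_sketch
  use omp_lib
  implicit none
  integer :: omp_threads_caller

  ! remember what the calling code had configured ...
  omp_threads_caller = omp_get_max_threads()

  ! ... temporarily force a single thread, e.g. for a GPU code path ...
  call omp_set_num_threads(1)
  !$omp parallel
  !$omp single
  print *, 'threads inside the restricted section:', omp_get_num_threads()
  !$omp end single
  !$omp end parallel

  ! ... and restore the caller's setting afterwards
  call omp_set_num_threads(omp_threads_caller)
  print *, 'threads restored to:', omp_get_max_threads()
end program thread_count_restore_sketch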
...
...
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
...
...
@@ -141,16 +141,20 @@ subroutine trans_ev_tridi_to_band_&
    MATH_DATATYPE(kind=rck), allocatable :: row(:)
    MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)

-#ifdef WITH_OPENMP_TRADITIONAL
    MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
    MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
    MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
    MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
+#ifdef WITH_OPENMP_TRADITIONAL
+    !MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
+    !MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
+    !MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
+    !MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
#else
-    MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
-    MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
-    MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
-    MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
+    ! MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
+    ! MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
+    ! MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
+    ! MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
#endif
    integer(kind=c_intptr_t) :: aIntern_dev
...
...
@@ -987,22 +991,34 @@ subroutine trans_ev_tridi_to_band_&
#else /* WITH_OPENMP_TRADITIONAL */
-    allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    allocate(top_border_send_buffer(stripe_width*nbw, stripe_count), stat=istat, errmsg=errorMessage)
    check_allocate("trans_ev_tridi_to_band: top_border_send_buffer", istat, errorMessage)
+    !allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    !check_allocate("trans_ev_tridi_to_band: top_border_send_buffer", istat, errorMessage)

-    allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
    check_allocate("trans_ev_tridi_to_band: top_border_recv_buffer", istat, errorMessage)
+    !allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    !check_allocate("trans_ev_tridi_to_band: top_border_recv_buffer", istat, errorMessage)

-    allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
    check_allocate("trans_ev_tridi_to_band: bottom_border_send_buffer", istat, errorMessage)
+    !allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    !check_allocate("trans_ev_tridi_to_band: bottom_border_send_buffer", istat, errorMessage)

-    allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
    check_allocate("trans_ev_tridi_to_band: bottom_border_recv_buffer", istat, errorMessage)
+    !allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
+    !check_allocate("trans_ev_tridi_to_band: bottom_border_recv_buffer", istat, errorMessage)

-    top_border_send_buffer(:,:,:) = 0.0_rck
-    top_border_recv_buffer(:,:,:) = 0.0_rck
-    bottom_border_send_buffer(:,:,:) = 0.0_rck
-    bottom_border_recv_buffer(:,:,:) = 0.0_rck
+    top_border_send_buffer(:,:) = 0.0_rck
+    top_border_recv_buffer(:,:) = 0.0_rck
+    bottom_border_send_buffer(:,:) = 0.0_rck
+    bottom_border_recv_buffer(:,:) = 0.0_rck
+    !top_border_send_buffer(:,:,:) = 0.0_rck
+    !top_border_recv_buffer(:,:,:) = 0.0_rck
+    !bottom_border_send_buffer(:,:,:) = 0.0_rck
+    !bottom_border_recv_buffer(:,:,:) = 0.0_rck

    if (useGPU) then
      successCUDA = cuda_host_register(int(loc(top_border_send_buffer),kind=c_intptr_t), &
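The new allocations carry a factor max_threads in their first dimension, which suggests that one flattened column of such a buffer provides a stripe_width*nbw block per OpenMP thread. The sketch below shows one way a column could be carved into per-thread chunks; it is purely illustrative (made-up sizes, default real), and how ELPA actually indexes these buffers is not visible in this hunk.

! Purely illustrative sketch -- NOT taken from the ELPA sources.
program per_thread_chunk_sketch
  implicit none
  integer, parameter :: stripe_width = 4, nbw = 3, max_threads = 2, stripe_count = 2
  real    :: buf(stripe_width*nbw*max_threads, stripe_count)
  integer :: thread, first, last

  buf = 0.0
  do thread = 0, max_threads-1
    first = thread*stripe_width*nbw + 1
    last  = first + stripe_width*nbw - 1
    ! each hypothetical thread owns a contiguous stripe_width*nbw slice
    buf(first:last, 1) = real(thread + 1)
  end do
  print *, 'chunk owners along one column:', int(buf(:, 1))
end program per_thread_chunk_sketch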
...
...
@@ -1120,7 +1136,7 @@ subroutine trans_ev_tridi_to_band_&
#else /* WITH_OPENMP_TRADITIONAL */
#ifdef WITH_MPI
-          call MPI_Irecv(bottom_border_recv_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), &
+          call MPI_Irecv(bottom_border_recv_buffer(1,i), int(nbw*stripe_width,kind=MPI_KIND), &
                         MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), &
                         int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), &
                         bottom_recv_request(i), mpierr)
...
...
@@ -1244,13 +1260,14 @@ subroutine trans_ev_tridi_to_band_&
          if (useGPU) then
            dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
            successCUDA = cuda_memcpy(aIntern_dev + dev_offset, &
-                         int(loc(bottom_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
+                         int(loc(bottom_border_recv_buffer(1,i)),kind=c_intptr_t), &
                         stripe_width*nbw*size_of_datatype, &
                         cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev_tridi_to_band: bottom_border_recv_buffer -> aIntern_dev", successCUDA)
          else
-            aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
+            aIntern(:,n_off+1:n_off+nbw,i) = reshape( &
+                   bottom_border_recv_buffer(1:stripe_width*nbw,i), (/stripe_width,nbw/))
          endif
#endif /* WITH_OPENMP_TRADITIONAL */
...
...
@@ -1281,7 +1298,7 @@ subroutine trans_ev_tridi_to_band_&
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
-          call MPI_Irecv(bottom_border_recv_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), &
+          call MPI_Irecv(bottom_border_recv_buffer(1,i), int(nbw*stripe_width,kind=MPI_KIND), &
                         MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), &
                         int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), &
                         bottom_recv_request(i), mpierr)
...
...
@@ -1328,12 +1345,13 @@ subroutine trans_ev_tridi_to_band_&
          if (useGPU) then
            dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
            ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
-            successCUDA = cuda_memcpy(aIntern_dev + dev_offset, int(loc(top_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
+            successCUDA = cuda_memcpy(aIntern_dev + dev_offset, int(loc(top_border_recv_buffer(1,i)),kind=c_intptr_t), &
                                      stripe_width*top_msg_length*size_of_datatype, &
                                      cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev_tridi_to_band: top_border_recv_buffer -> aIntern_dev", successCUDA)
          else ! useGPU
-            aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
+            aIntern(:,a_off+1:a_off+top_msg_length,i) = &
+            reshape(top_border_recv_buffer(1:stripe_width*top_msg_length,i),(/stripe_width,top_msg_length/))
          endif ! useGPU
#endif /* WITH_OPENMP_TRADITIONAL */
        endif ! top_msg_length
...
...
@@ -1411,24 +1429,25 @@ subroutine trans_ev_tridi_to_band_&
          if (useGPU) then
            dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-            successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
+            successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
                                      stripe_width*bottom_msg_length*size_of_datatype, &
                                      cudaMemcpyDeviceToHost)
            check_memcpy_cuda("trans_ev_tridi_to_band: aIntern_dev -> bottom_border_send_buffer", successCUDA)
          else
-            bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
+            bottom_border_send_buffer(1:stripe_width*bottom_msg_length,i) = reshape(&
+            aIntern(:,n_off+1:n_off+bottom_msg_length,i),(/stripe_width*bottom_msg_length/))
          endif
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
-          call MPI_Isend(bottom_border_send_buffer(1,1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
+          call MPI_Isend(bottom_border_send_buffer(1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
                         MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), &
                         int(mpi_comm_rows,kind=MPI_KIND), bottom_send_request(i), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          if (next_top_msg_length > 0) then
-            top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
-            bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
+            top_border_recv_buffer(1:stripe_width*next_top_msg_length,i) = &
+            bottom_border_send_buffer(1:stripe_width*next_top_msg_length,i)
          endif
#endif /* WITH_MPI */
...
...
@@ -1511,24 +1530,25 @@ subroutine trans_ev_tridi_to_band_&
          if (useGPU) then
            dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-            successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
+            successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
                                      stripe_width*bottom_msg_length*size_of_datatype, &
                                      cudaMemcpyDeviceToHost)
            check_memcpy_cuda("trans_ev_tridi_to_band: aIntern_dev -> bottom_border_send_buffer", successCUDA)
          else
-            bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
+            bottom_border_send_buffer(1:stripe_width*bottom_msg_length,i) = reshape(&
+            aIntern(:,n_off+1:n_off+bottom_msg_length,i),(/stripe_width*bottom_msg_length/))
          endif
#ifdef WITH_MPI
          if (wantDebug) call obj%timer%start("mpi_communication")
-          call MPI_Isend(bottom_border_send_buffer(1,1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
+          call MPI_Isend(bottom_border_send_buffer(1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
                         MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), &
                         int(mpi_comm_rows,kind=MPI_KIND), bottom_send_request(i), mpierr)
          if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
          if (next_top_msg_length > 0) then
-            top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
-            bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
+            top_border_recv_buffer(1:stripe_width*next_top_msg_length,i) = &
+            bottom_border_send_buffer(1:stripe_width*next_top_msg_length,i)
          endif
#endif /* WITH_MPI */
...
...
@@ -1597,12 +1617,13 @@ subroutine trans_ev_tridi_to_band_&
#endif
          if (useGPU) then
            dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-            successCUDA = cuda_memcpy(aIntern_dev + dev_offset, int(loc(top_border_recv_buffer(:,1,i)),kind=c_intptr_t), &
+            successCUDA = cuda_memcpy(aIntern_dev + dev_offset, int(loc(top_border_recv_buffer(:,i)),kind=c_intptr_t), &
                                      stripe_width*top_msg_length*size_of_datatype, &
                                      cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev_tridi_to_band: top_border_recv_buffer -> aIntern_dev", successCUDA)
          else
-            aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
+            aIntern(:,a_off+1:a_off+top_msg_length,i) = &
+            reshape(top_border_recv_buffer(1:stripe_width*top_msg_length,i),(/stripe_width,top_msg_length/))
          endif
#endif /* WITH_OPENMP_TRADITIONAL */
        endif
...
...
@@ -1667,7 +1688,7 @@ subroutine trans_ev_tridi_to_band_&
#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
-        call MPI_Irecv(top_border_recv_buffer(1,1,i), int(next_top_msg_length*stripe_width,kind=MPI_KIND), &
+        call MPI_Irecv(top_border_recv_buffer(1,i), int(next_top_msg_length*stripe_width,kind=MPI_KIND), &
                       MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow-1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), &
                       int(mpi_comm_rows,kind=MPI_KIND), top_recv_request(i), mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
...
...
@@ -1717,25 +1738,25 @@ subroutine trans_ev_tridi_to_band_&
#endif
        if (useGPU) then
          dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
-          successCUDA = cuda_memcpy(int(loc(top_border_send_buffer(:,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
+          successCUDA = cuda_memcpy(int(loc(top_border_send_buffer(:,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
                                    stripe_width*nbw*size_of_datatype, &
                                    cudaMemcpyDeviceToHost)
          check_memcpy_cuda("trans_ev_tridi_to_band: aIntern_dev -> top_border_send_buffer", successCUDA)
        else
-          top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i)
+          top_border_send_buffer(:,i) = reshape(aIntern(:,a_off+1:a_off+nbw,i),(/stripe_width*nbw/))
        endif
#ifdef WITH_MPI
        if (wantDebug) call obj%timer%start("mpi_communication")
-        call MPI_Isend(top_border_send_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
+        call MPI_Isend(top_border_send_buffer(1,i), int(nbw*stripe_width,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
                       int(my_prow-1,kind=MPI_KIND), int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), &
                       top_send_request(i), mpierr)
        if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
        if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
-          bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
+          bottom_border_recv_buffer(1:nbw*stripe_width,i) = top_border_send_buffer(1:nbw*stripe_width,i)
        endif
        if (next_n_end < next_n) then
-          bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
+          bottom_border_recv_buffer(1:stripe_width*nbw,i) = top_border_send_buffer(1:stripe_width*nbw,i)
        endif
#endif /* WITH_MPI */
...
...