Commit ff3f445a authored by Andreas Marek

Change dimension of buffer arrays in trans_ev_tridi_to_band

parent f0bdb04a
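A minimal standalone sketch (hypothetical sizes, plain double precision instead of MATH_DATATYPE) of the shape change made below: the border send/receive buffers lose their third dimension, the first two dimensions being fused into one contiguous stripe, and most of them additionally gain a max_threads factor (the top send buffer keeps stripe_width*nbw), so that one layout can serve both the OpenMP and the non-OpenMP code path.

program buffer_shape_sketch
  implicit none
  integer, parameter :: stripe_width = 4, nbw = 3, stripe_count = 2, max_threads = 2
  real(kind=kind(0.0d0)), allocatable :: top_border_send_buffer(:,:)
  real(kind=kind(0.0d0)), allocatable :: bottom_border_recv_buffer(:,:)
  integer :: istat

  ! before this commit: allocate(top_border_send_buffer(stripe_width, nbw, stripe_count))
  ! after: the first two dimensions are fused into one contiguous stripe
  allocate(top_border_send_buffer(stripe_width*nbw, stripe_count), stat=istat)

  ! the receive buffers (and the bottom send buffer) also get one slot per OpenMP thread
  allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat)

  top_border_send_buffer(:,:)    = 0.0d0
  bottom_border_recv_buffer(:,:) = 0.0d0

  print *, shape(top_border_send_buffer), shape(bottom_border_recv_buffer)
  deallocate(top_border_send_buffer, bottom_border_recv_buffer)
end program buffer_shape_sketch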
......@@ -629,65 +629,65 @@ max_threads)
aux1 = 0.0_rck
#ifdef WITH_OPENMP_TRADITIONAL
#if 0
! original complex implementation without openmp. check performance
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_MPI */
!#if 0
! ! original complex implementation without openmp. check performance
! nlc = 0 ! number of local columns
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
! endif
! enddo
!
! ! Transform
! ! Get global dot products
!#ifdef WITH_MPI
! if (wantDebug) call obj%timer%start("mpi_communication")
! if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
! int(mpi_comm_rows,kind=MPI_KIND), mpierr)
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
#endif /* if 0 */
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!
! endif
! enddo
!
!
! if (wantDebug) call obj%timer%stop("mpi_communication")
!
!#else /* WITH_MPI */
!
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
! endif
! enddo
!
!#endif /* WITH_MPI */
!!
!! ! Transform
!!
!! nlc = 0
!! do j=1,lc-1
!! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!! if (lcx>0) then
!! nlc = nlc+1
!! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!
!! endif
!! enddo
!#endif /* if 0 */
! Open up one OpenMP region to avoid paying the OpenMP overhead.
! This does not help performance due to the addition of two OpenMP barriers around the MPI call,
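A minimal standalone sketch of where those two barriers come from (assumed names aux1/aux2, double precision complex instead of MATH_DATATYPE; not the ELPA routine itself): the work-sharing loop ends with an implicit barrier so aux1 is complete on every thread, and the single construct ends with another one so no thread reads aux2 before the reduction has finished.

program barriers_around_allreduce
  use mpi
  use omp_lib
  implicit none
  integer, parameter :: nlc = 8
  complex(kind=kind(0.0d0)) :: aux1(nlc), aux2(nlc)
  integer :: ierr, provided, j

  call MPI_Init_thread(MPI_THREAD_SERIALIZED, provided, ierr)
  aux1 = (0.0d0, 0.0d0)

  !$omp parallel private(j)
  !$omp do
  do j = 1, nlc
    aux1(j) = cmplx(j, 0, kind=kind(0.0d0))   ! every thread fills its share of aux1
  end do
  !$omp end do        ! barrier 1: aux1 must be complete before the reduction
  !$omp single
  call MPI_Allreduce(aux1, aux2, nlc, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD, ierr)
  !$omp end single    ! barrier 2: no thread may use aux2 before the reduction is done
  !$omp end parallel

  call MPI_Finalize(ierr)
end program barriers_around_allreduce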
......@@ -909,35 +909,35 @@ max_threads)
! of the tiles, so we can use strips of the matrix
#if 0
! original complex implementation, check for performance
umcCPU(1:l_cols,1:n_cols) = 0.0_rck
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call obj%timer%start("blas")
call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call obj%timer%stop("blas")
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call obj%timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call obj%timer%stop("blas")
enddo
endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */
!#if 0
! ! original complex implementation, check for performance
! umcCPU(1:l_cols,1:n_cols) = 0.0_rck
! vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
!
! if (l_cols>0 .and. l_rows>0) then
! do i=0,(istep*nbw-1)/tile_size
!
! lcs = i*l_cols_tile+1
! lce = min(l_cols,(i+1)*l_cols_tile)
! if (lce<lcs) cycle
!
! lre = min(l_rows,(i+1)*l_rows_tile)
!
! call obj%timer%start("blas")
! call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
! vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
! call obj%timer%stop("blas")
!
! if (i==0) cycle
! lre = min(l_rows,i*l_rows_tile)
! call obj%timer%start("blas")
! call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
! umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
! call obj%timer%stop("blas")
! enddo
!
! endif ! (l_cols>0 .and. l_rows>0)
!#endif /* if 0 */
!Code for Algorithm 4
......@@ -1396,7 +1396,11 @@ max_threads)
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
!$omp parallel &
!$omp default(none) &
!$omp private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) &
!$omp shared(n_threads, istep, tile_size, nbw, n_cols, obj, vmrcpu, l_cols_tile, l_rows, l_rows_tile, &
!$omp& umccpu, l_cols, a_dev, vmr_dev, useGPU, cur_l_rows, umc_dev, cur_l_cols, lda )
n_threads = omp_get_num_threads()
if (mod(n_threads, 2) == 0) then
......@@ -1424,13 +1428,30 @@ max_threads)
myend = mystart + work_per_thread - 1
if ( myend > lre ) myend = lre
if ( myend-mystart+1 < 1) cycle
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
if (useGPU) then
if (n_way .gt. 1) then
print *,"error more than 1 openmp thread used in GPU part of elpa2_bandred"
print *,"this should never happen"
stop
endif
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
lce-lcs+1, 2*n_cols, -ONE, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)* &
size_of_datatype), &
cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
size_of_datatype), lda)
call obj%timer%stop("cublas")
else
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
endif
enddo
!$omp end parallel
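In the GPU branch above, a_dev, vmr_dev and umc_dev are raw device addresses of type integer(c_intptr_t), so the sub-matrix arguments are formed by adding byte offsets: element (1,lcs) of a column-major matrix with leading dimension lda lies (lcs-1)*lda*size_of_datatype bytes past the base address, while element (lcs,1) lies (lcs-1)*size_of_datatype bytes past it. A minimal host-only sketch of that arithmetic (hypothetical sizes and addresses, no GPU needed):

program device_offset_sketch
  use iso_c_binding, only: c_intptr_t
  implicit none
  integer(kind=c_intptr_t), parameter :: size_of_datatype = 8_c_intptr_t   ! bytes per double precision element
  integer(kind=c_intptr_t) :: a_dev, umc_dev
  integer :: lcs, lda

  lcs = 5
  lda = 1024
  a_dev   = 4096_c_intptr_t      ! stands in for an address returned by the CUDA allocator
  umc_dev = 8192_c_intptr_t

  ! a_mat(1,lcs) on the device: skip lcs-1 full columns of lda elements each
  print *, 'a_dev   + offset =', a_dev   + int(lcs-1, c_intptr_t) * int(lda, c_intptr_t) * size_of_datatype
  ! umcCPU(lcs,1) on the device: skip lcs-1 elements within the first column
  print *, 'umc_dev + offset =', umc_dev + int(lcs-1, c_intptr_t) * size_of_datatype
end program device_offset_sketch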
......
......@@ -391,6 +391,18 @@
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case, at the moment, only _1_ thread is allowed internally
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
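The block above forces ELPA to run with a single internal OpenMP thread whenever the GPU path is active; the caller's setting is restored later through omp_set_num_threads(omp_threads_caller). A minimal standalone sketch of that save/force/restore pattern (assumed variable names):

program omp_save_force_restore
  use omp_lib
  implicit none
  integer :: omp_threads_caller, nrThreads

  omp_threads_caller = omp_get_max_threads()    ! remember the caller's setting
  nrThreads = 1
  call omp_set_num_threads(nrThreads)           ! GPU path: only one internal thread allowed

  !$omp parallel
  !$omp single
  print *, 'threads while forced:', omp_get_num_threads()
  !$omp end single
  !$omp end parallel

  call omp_set_num_threads(omp_threads_caller)  ! restore the caller's setting on the way out
end program omp_save_force_restore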
......@@ -518,7 +530,7 @@
endif
#endif
#endif
#endif /* REALCASE == 1 */
! consistency check: is user set kernel still identical with "kernel" or did
! we change it above? This is a mess and should be cleaned up
......@@ -614,9 +626,7 @@
else
useQR = .false.
endif
#endif
#endif /* REALCASE == 1 */
#if REALCASE == 1
useQRActual = .false.
......@@ -1009,8 +1019,6 @@
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
call omp_set_num_threads(omp_threads_caller)
#endif
......
......@@ -141,16 +141,20 @@ subroutine trans_ev_tridi_to_band_&
MATH_DATATYPE(kind=rck), allocatable :: row(:)
MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)
#ifdef WITH_OPENMP_TRADITIONAL
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
#ifdef WITH_OPENMP_TRADITIONAL
!MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
!MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
!MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
!MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
#else
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
!MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
!MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
!MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
!MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
......@@ -987,22 +991,34 @@ subroutine trans_ev_tridi_to_band_&
#else /* WITH_OPENMP_TRADITIONAL */
allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
allocate(top_border_send_buffer(stripe_width*nbw, stripe_count), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_tridi_to_band: top_border_send_buffer", istat, errorMessage)
!allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
!check_allocate("trans_ev_tridi_to_band: top_border_send_buffer", istat, errorMessage)
allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_tridi_to_band: top_border_recv_buffer", istat, errorMessage)
!allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
!check_allocate("trans_ev_tridi_to_band: top_border_recv_buffer", istat, errorMessage)
allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_tridi_to_band: bottom_border_send_buffer", istat, errorMessage)
!allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
!check_allocate("trans_ev_tridi_to_band: bottom_border_send_buffer", istat, errorMessage)
allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
check_allocate("trans_ev_tridi_to_band: bottom_border_recv_buffer", istat, errorMessage)
!allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
!check_allocate("trans_ev_tridi_to_band: bottom_border_recv_buffer", istat, errorMessage)
top_border_send_buffer(:,:,:) = 0.0_rck
top_border_recv_buffer(:,:,:) = 0.0_rck
bottom_border_send_buffer(:,:,:) = 0.0_rck
bottom_border_recv_buffer(:,:,:) = 0.0_rck
top_border_send_buffer(:,:) = 0.0_rck
top_border_recv_buffer(:,:) = 0.0_rck
bottom_border_send_buffer(:,:) = 0.0_rck
bottom_border_recv_buffer(:,:) = 0.0_rck
!top_border_send_buffer(:,:,:) = 0.0_rck
!top_border_recv_buffer(:,:,:) = 0.0_rck
!bottom_border_send_buffer(:,:,:) = 0.0_rck
!bottom_border_recv_buffer(:,:,:) = 0.0_rck
if (useGPU) then
successCUDA = cuda_host_register(int(loc(top_border_send_buffer),kind=c_intptr_t), &
......@@ -1120,7 +1136,7 @@ subroutine trans_ev_tridi_to_band_&
#else /* WITH_OPENMP_TRADITIONAL */
#ifdef WITH_MPI
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), &
call MPI_Irecv(bottom_border_recv_buffer(1,i), int(nbw*stripe_width,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), &
int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), &
bottom_recv_request(i), mpierr)
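With the flattened buffers the MPI calls keep their element counts; only the starting address changes from buffer(1,1,i) to buffer(1,i), which in both layouts is the first element of a contiguous block of nbw*stripe_width values. A minimal self-contained sketch of that addressing (hypothetical sizes, a single rank sending to itself), assuming an MPI installation:

program contiguous_stripe_sketch
  use mpi
  implicit none
  integer, parameter :: stripe_width = 4, nbw = 3, stripe_count = 2
  real(kind=kind(0.0d0)) :: send_buffer(stripe_width*nbw, stripe_count)
  real(kind=kind(0.0d0)) :: recv_buffer(stripe_width*nbw, stripe_count)
  integer :: ierr, my_rank, requests(2)

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr)

  send_buffer = 1.0d0
  recv_buffer = 0.0d0

  ! stripe i=2: buffer(1,i) is the first element of a contiguous block of
  ! stripe_width*nbw values, exactly as buffer(1,1,i) was in the 3-D layout
  call MPI_Irecv(recv_buffer(1,2), stripe_width*nbw, MPI_DOUBLE_PRECISION, my_rank, 0, &
                 MPI_COMM_WORLD, requests(1), ierr)
  call MPI_Isend(send_buffer(1,2), stripe_width*nbw, MPI_DOUBLE_PRECISION, my_rank, 0, &
                 MPI_COMM_WORLD, requests(2), ierr)
  call MPI_Waitall(2, requests, MPI_STATUSES_IGNORE, ierr)

  print *, 'received stripe 2:', recv_buffer(1,2), recv_buffer(stripe_width*nbw,2)
  call MPI_Finalize(ierr)
end program contiguous_stripe_sketch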
......@@ -1244,13 +1260,14 @@ subroutine trans_ev_tridi_to_band_&
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , &
int(loc(bottom_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
int(loc(bottom_border_recv_buffer(1,i)),kind=c_intptr_t), &
stripe_width*nbw* size_of_datatype, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev_tridi_to_band: bottom_border_recv_buffer -> aIntern_dev", successCUDA)
else
aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
aIntern(:,n_off+1:n_off+nbw,i) = reshape( &
bottom_border_recv_buffer(1:stripe_width*nbw,i),(/stripe_width,nbw/))
endif
#endif /* WITH_OPENMP_TRADITIONAL */
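Because each stripe of the buffer is now one contiguous 1-D slice, copies into and out of the 3-D work array use reshape, as in the aIntern update above. A minimal standalone sketch of the unpack/pack pattern (hypothetical sizes, plain double precision instead of MATH_DATATYPE):

program reshape_buffer_sketch
  implicit none
  integer, parameter :: stripe_width = 4, nbw = 3
  real(kind=kind(0.0d0)) :: recv_buffer(stripe_width*nbw)   ! one stripe of the flattened buffer
  real(kind=kind(0.0d0)) :: slab(stripe_width, nbw)         ! the matching slab of the work array
  integer :: k

  recv_buffer = [(real(k, kind=kind(0.0d0)), k = 1, stripe_width*nbw)]

  ! unpack a received stripe: 1-D buffer -> 2-D slab (column-major, same memory order)
  slab = reshape(recv_buffer, (/stripe_width, nbw/))

  ! pack a stripe for sending: 2-D slab -> 1-D buffer
  recv_buffer = reshape(slab, (/stripe_width*nbw/))

  print *, slab(1,1), slab(stripe_width, nbw)
end program reshape_buffer_sketch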
......@@ -1281,7 +1298,7 @@ subroutine trans_ev_tridi_to_band_&
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), &
call MPI_Irecv(bottom_border_recv_buffer(1,i), int(nbw*stripe_width,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), &
int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), &
bottom_recv_request(i), mpierr)
......@@ -1328,12 +1345,13 @@ subroutine trans_ev_tridi_to_band_&
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , int(loc(top_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , int(loc(top_border_recv_buffer(1,i)),kind=c_intptr_t), &
stripe_width*top_msg_length* size_of_datatype, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev_tridi_to_band: top_border_recv_buffer -> aIntern_dev", successCUDA)
else ! useGPU
aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
aIntern(:,a_off+1:a_off+top_msg_length,i) = &
reshape(top_border_recv_buffer(1:stripe_width*top_msg_length,i),(/stripe_width,top_msg_length/))
endif ! useGPU
#endif /* WITH_OPENMP_TRADITIONAL */
endif ! top_msg_length
......@@ -1411,24 +1429,25 @@ subroutine trans_ev_tridi_to_band_&
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy( int(loc(bottom_border_send_buffer(1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_datatype, &
cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev_tridi_to_band: aIntern_dev -> bottom_border_send_buffer", successCUDA)
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
bottom_border_send_buffer(1:stripe_width*bottom_msg_length,i) = reshape(&
aIntern(:,n_off+1:n_off+bottom_msg_length,i),(/stripe_width*bottom_msg_length/))
endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Isend(bottom_border_send_buffer(1,1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
call MPI_Isend(bottom_border_send_buffer(1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), bottom_send_request(i), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
top_border_recv_buffer(1:stripe_width*next_top_msg_length,i) = &
bottom_border_send_buffer(1:stripe_width*next_top_msg_length,i)
endif
#endif /* WITH_MPI */
......@@ -1511,24 +1530,25 @@ subroutine trans_ev_tridi_to_band_&
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width*bottom_msg_length* size_of_datatype, &
cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev_tridi_to_band: aIntern_dev -> bottom_border_send_buffer", successCUDA)
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
bottom_border_send_buffer(1:stripe_width*bottom_msg_length,i) = reshape(&
aIntern(:,n_off+1:n_off+bottom_msg_length,i),(/stripe_width*bottom_msg_length/))
endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Isend(bottom_border_send_buffer(1,1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
call MPI_Isend(bottom_border_send_buffer(1,i), int(bottom_msg_length*stripe_width,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow+1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), bottom_send_request(i), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (next_top_msg_length > 0) then
top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = &
bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
top_border_recv_buffer(1:stripe_width*next_top_msg_length,i) = &
bottom_border_send_buffer(1:stripe_width*next_top_msg_length,i)
endif
#endif /* WITH_MPI */
......@@ -1597,12 +1617,13 @@ subroutine trans_ev_tridi_to_band_&
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset ,int(loc( top_border_recv_buffer(:,1,i)),kind=c_intptr_t), &
successCUDA = cuda_memcpy( aIntern_dev + dev_offset ,int(loc( top_border_recv_buffer(:,i)),kind=c_intptr_t), &
stripe_width * top_msg_length * size_of_datatype, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev_tridi_to_band: top_border_recv_buffer -> aIntern_dev", successCUDA)
else
aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
aIntern(:,a_off+1:a_off+top_msg_length,i) = &
reshape(top_border_recv_buffer(1:stripe_width*top_msg_length,i),(/stripe_width,top_msg_length/))
endif
#endif /* WITH_OPENMP_TRADITIONAL */
endif
......@@ -1667,7 +1688,7 @@ subroutine trans_ev_tridi_to_band_&
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Irecv(top_border_recv_buffer(1,1,i), int(next_top_msg_length*stripe_width,kind=MPI_KIND), &
call MPI_Irecv(top_border_recv_buffer(1,i), int(next_top_msg_length*stripe_width,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION_EXPL, int(my_prow-1,kind=MPI_KIND), int(top_recv_tag,kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), top_recv_request(i), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
......@@ -1717,25 +1738,25 @@ subroutine trans_ev_tridi_to_band_&
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( int(loc(top_border_send_buffer(:,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy( int(loc(top_border_send_buffer(:,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width*nbw * size_of_datatype, &
cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev_tridi_to_band: aIntern_dev -> top_border_send_buffer", successCUDA)
else
top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i)
top_border_send_buffer(:,i) = reshape(aIntern(:,a_off+1:a_off+nbw,i),(/stripe_width*nbw/))
endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Isend(top_border_send_buffer(1,1,i), int(nbw*stripe_width,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
call MPI_Isend(top_border_send_buffer(1,i), int(nbw*stripe_width,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION_EXPL, &
int(my_prow-1,kind=MPI_KIND), int(bottom_recv_tag,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), &
top_send_request(i), mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
bottom_border_recv_buffer(1:nbw*stripe_width,i) = top_border_send_buffer(1:nbw*stripe_width,i)
endif
if (next_n_end < next_n) then
bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
bottom_border_recv_buffer(1:stripe_width*nbw,i) = top_border_send_buffer(1:stripe_width*nbw,i)
endif
#endif /* WITH_MPI */
......