Commit c21d968a authored by Andreas Marek

Keep aMatrix on device until redist_band

parent f64b038b
......@@ -1937,37 +1937,37 @@
enddo ! istep
if (useGPU) then
! this is not needed since a_dev is passed along from one subroutine to the other
successCUDA = cuda_memcpy ( &
#if REALCASE == 1
loc(a), &
#endif
#if COMPLEXCASE == 1
loc(a(1,1)), &
#endif
a_dev, lda*na_cols* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE ==1
size_of_PRECISION_complex,&
#endif
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
successCUDA = cuda_free(a_dev)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree"
stop
endif
! ! this is not needed since a_dev is passed along from one subroutine to the other
!
! successCUDA = cuda_memcpy ( &
!#if REALCASE == 1
! loc(a), &
!#endif
!#if COMPLEXCASE == 1
! loc(a(1,1)), &
!#endif
! a_dev, lda*na_cols* &
!#if REALCASE == 1
! size_of_PRECISION_real, &
!#endif
!#if COMPLEXCASE ==1
! size_of_PRECISION_complex,&
!#endif
! cudaMemcpyDeviceToHost)
! if (.not.(successCUDA)) then
! print *,"bandred_&
! &MATH_DATATYPE&
! &: error in cudaMemcpy"
! stop
! endif
!
! successCUDA = cuda_free(a_dev)
! if (.not.(successCUDA)) then
! print *,"bandred_&
! &MATH_DATATYPE&
! &: error in cudaFree"
! stop
! endif
!#ifdef WITH_MPI
!! it should be possible to keep tmat dev on the device and not copy it around
......
......@@ -323,7 +323,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
(na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
(na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, useGPU)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
......
......@@ -63,7 +63,7 @@
#if COMPLEXCASE == 1
hh_trans_complex, &
#endif
mpi_comm_rows, mpi_comm_cols, mpi_comm)
mpi_comm_rows, mpi_comm_cols, mpi_comm, useGPU)
!-------------------------------------------------------------------------------
! tridiag_band_real/complex:
! Reduces a real symmetric band matrix to tridiagonal form
......@@ -101,7 +101,8 @@
use iso_c_binding
implicit none
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(in) :: aMatrix(lda,*)
......@@ -269,7 +270,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
&(aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)
&(aMatrix, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab, useGPU)
! Calculate the workload for each sweep in the back transformation
! and the space requirements to hold the HH vectors
......
......@@ -60,12 +60,12 @@ subroutine redist_band_&
#endif
a_dev, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, &
#if REALCASE == 1
r_ab)
r_ab, &
#endif
#if COMPLEXCASE == 1
c_ab)
c_ab, &
#endif
useGPU)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
......@@ -74,8 +74,10 @@ subroutine redist_band_&
use elpa2_workload
use precision
use iso_c_binding
use cuda_functions
implicit none
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#if REALCASE == 1
real(kind=REAL_DATATYPE), intent(in) :: r_a(lda, matrixCols)
......@@ -105,12 +107,46 @@ subroutine redist_band_&
nfact, np, npr, npc, mpierr, is, js
integer(kind=ik) :: nblocks_total, il, jl, l_rows, l_cols, n_off
logical :: successCUDA
call timer%start("redist_band_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
if (useGPU) then
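! copy the matrix from the device (a_dev) back into the host array r_a / c_a
! (presumably the array passed as aMatrix by the caller)
! NOTE(review): r_a is declared intent(in) in this routine although it is the
! destination of this device-to-host copy - confirm the intent attribute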
successCUDA = cuda_memcpy ( &
#if REALCASE == 1
loc(r_a), &
#endif
#if COMPLEXCASE == 1
loc(c_a(1,1)), &
#endif
a_dev, lda*matrixCols* &
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
size_of_double_real_datatype, &
#else
size_of_single_real_datatype, &
#endif
#endif
#if COMPLEXCASE ==1
#ifdef DOUBLE_PRECISION_COMPLEX
size_of_double_complex_datatype,&
#else
size_of_single_complex_datatype,&
#endif
#endif
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"redist_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
endif ! useGPU
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment