Commit af7bb4a0 authored by Wenzhe Yu's avatar Wenzhe Yu 😎
Browse files

GPU memory optimization in ELPA2

* Removed redundant malloc, memset and memcpy
* Use pinned host memory
* Implemented blocking for GPU code path in step5
* Removed unused code
parent 6e5c03a6
......@@ -233,7 +233,7 @@
call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
call obj%timer%stop("blas")
! Calculate the allowable deflation tolerance
! Calculate the allowable deflation tolerance
zmax = maxval(abs(z))
dmax = maxval(abs(d))
......@@ -618,12 +618,6 @@
successCUDA = cuda_malloc(qtmp2_dev, gemm_dim_k * gemm_dim_m * size_of_datatype)
check_alloc_cuda("merge_systems: qtmp2_dev", successCUDA)
successCUDA = cuda_memset(qtmp1_dev, 0, gemm_dim_k * gemm_dim_l * size_of_datatype)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
successCUDA = cuda_memset(qtmp2_dev, 0, gemm_dim_k * gemm_dim_m * size_of_datatype)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
! Gather nonzero upper/lower components of old matrix Q
......@@ -784,20 +778,6 @@
endif ! useGPU
endif
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
!TODO: actually this will be done after the second mutiplication
!TODO or actually maybe I should copy the half of the qtmp2 array
!here and the rest after the next gemm
!TODO either copy only half of the matrix here, and half after the
!second gemm, or copy whole array after the next gemm
! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q:
......@@ -874,7 +854,7 @@
check_dealloc_cuda("merge_systems: ev_dev", successCUDA)
endif
endif !very outer test (na1==1 .or. na1==2)
endif !very outer test (na1==1 .or. na1==2)
#ifdef WITH_OPENMP
deallocate(z_p, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......
......@@ -257,9 +257,6 @@ subroutine solve_tridi_&
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: np_off, nprocs
integer(kind=ik) :: np1, np2, noff, nlen, nmid, n
#ifdef WITH_MPI
! integer(kind=ik) :: my_mpi_status(mpi_status_size)
#endif
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
......@@ -580,7 +577,7 @@ subroutine solve_tridi_&
end subroutine solve_tridi_col_&
&PRECISION_AND_SUFFIX
recursive subroutine solve_tridi_single_problem_&
subroutine solve_tridi_single_problem_&
&PRECISION_AND_SUFFIX &
(obj, nlen, d, e, q, ldq, wantDebug, success)
......
This diff is collapsed.
......@@ -100,8 +100,6 @@
MATH_DATATYPE(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
integer(kind=c_int) :: i
logical :: success, successCUDA
logical :: wantDebug
......@@ -546,18 +544,6 @@
stop 1
endif
! if either of full->band or band->full steps are to be done on GPU,
! allocate also corresponding array on GPU.
if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* size_of_datatype)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc tmat_dev 1"
stop 1
endif
endif
do_bandred = .true.
do_solve_tridi = .true.
do_trans_to_band = .true.
......@@ -577,8 +563,8 @@
&_&
&PRECISION &
(obj, na, a, &
a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, do_useGPU_bandred, success, &
lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
wantDebug, do_useGPU_bandred, success, &
#if REALCASE == 1
useQRActual, &
#endif
......@@ -610,7 +596,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
(obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
(obj, na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
do_useGPU_tridiag_band, wantDebug, nrThreads)
#ifdef WITH_MPI
......@@ -723,7 +709,6 @@
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, q, &
q_dev, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
nrThreads, success=success, kernel=kernel)
#ifdef HAVE_LIKWID
......@@ -744,47 +729,13 @@
endif
endif ! do_trans_to_band
! the array q might reside on device or host, depending on whether GPU is
! used or not. We thus have to transfer he data manually, if one of the
! routines is run on GPU and the other not.
! first deal with the situation that first backward step was on GPU
if(do_useGPU_trans_ev_tridi_to_band) then
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
endif
endif
! if the last step is not required at all, or will be performed on CPU,
! release the memmory allocated on the device
if((.not. do_trans_to_full) .or. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_free(q_dev)
endif
endif
!TODO check that the memory is properly dealocated on the host in case that
!the last step is not required
! the array q (currently) always resides on host even when using GPU
if (do_trans_to_full) then
call obj%timer%start("trans_ev_to_full")
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("trans_ev_to_full")
#endif
if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
endif
endif
! Backtransform stage 2
......@@ -792,9 +743,7 @@
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q, &
q_dev, &
(obj, na, nev, nblk, nbw, a, lda, tmat, q, &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1
, useQRActual &
......@@ -816,16 +765,6 @@
call obj%timer%stop("trans_ev_to_full")
endif ! do_trans_to_full
if(do_bandred .or. do_trans_to_full) then
if (do_useGPU_bandred .or. do_useGPU_trans_ev_band_to_full) then
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"elpa2_template: error in cudaFree, tmat_dev"
stop 1
endif
endif
endif
if (obj%eigenvalues_only) then
deallocate(q_dummy, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......
......@@ -56,7 +56,7 @@
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols, &
(obj, na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, max_threads, success, &
kernel)
......@@ -78,8 +78,6 @@
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! q_dev GPU device pointer to q
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
......@@ -114,7 +112,6 @@
#endif
MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:)
integer(kind=c_intptr_t) :: q_dev
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
......@@ -139,8 +136,8 @@
type(c_ptr) :: aIntern_ptr
MATH_DATATYPE(kind=rck) , allocatable :: row(:)
MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:)
MATH_DATATYPE(kind=rck), allocatable :: row(:)
MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
......@@ -162,11 +159,13 @@
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=ik) :: row_group_size, unpack_idx
type(c_ptr) :: row_group_host, bcast_buffer_host
integer(kind=ik) :: n_times
integer(kind=ik) :: chunk, this_chunk
MATH_DATATYPE(kind=rck), allocatable :: result_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bcast_buffer(:,:)
MATH_DATATYPE(kind=rck), pointer :: bcast_buffer(:,:)
integer(kind=ik) :: n_off
......@@ -497,7 +496,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"//errorMessage
&: error in cudaMalloc aIntern_dev "//errorMessage
stop 1
endif
......@@ -505,26 +504,28 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"//errorMessage
&: error in cudaMemset aIntern_dev "//errorMessage
stop 1
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when allocating row_group"//errorMessage
successCUDA = cuda_malloc_host(row_group_host,l_nev*nblk*size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMallocHost row_group_host"
stop 1
endif
call c_f_pointer(row_group_host, row_group, (/l_nev,nblk/))
row_group(:, :) = 0.0_rck
num = (l_nev*nblk)* size_of_datatype
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"//errorMessage
&: error in cudaMalloc row_group_dev "//errorMessage
stop 1
endif
......@@ -532,7 +533,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"//errorMessage
&: error in cudaMemset row_group_dev "//errorMessage
stop 1
endif
......@@ -1056,22 +1057,35 @@
! Initialize broadcast buffer
allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when allocating bcast_buffer"//errorMessage
stop 1
endif
if (useGPU) then
successCUDA = cuda_malloc_host(bcast_buffer_host,nbw*max_blk_size*size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMallocHost bcast_buffer_host"
stop 1
endif
call c_f_pointer(bcast_buffer_host, bcast_buffer, (/nbw,max_blk_size/))
else
allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when allocating bcast_buffer"//errorMessage
stop 1
endif
endif
bcast_buffer = 0.0_rck
if (useGPU) then
num = ( nbw * max_blk_size) * size_of_datatype
successCUDA = cuda_malloc(bcast_buffer_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
&: error in cudaMalloc bcast_buffer_dev"
stop 1
endif
......@@ -1079,7 +1093,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"
&: error in cudaMemset bcast_buffer_dev"
stop 1
endif
......@@ -1088,7 +1102,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
&: error in cudaMalloc hh_tau_dev"
stop 1
endif
......@@ -1096,7 +1110,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"
&: error in cudaMemset hh_tau_dev"
stop 1
endif
endif ! useGPU
......@@ -1201,7 +1215,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy bcast_buffer_dev H2D"
stop 1
endif
......@@ -1222,7 +1236,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"
&: error in cudaMemset bcast_buffer_dev"
stop 1
endif
......@@ -1301,7 +1315,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy aIntern_dev H2D"
stop 1
endif
......@@ -1386,7 +1400,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy aIntern_dev H2D"
stop 1
endif
else ! useGPU
......@@ -1473,7 +1487,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy aIntern_dev D2H"
stop 1
endif
else
......@@ -1576,7 +1590,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error cudaMemcpy"
&: error cudaMemcpy aIntern_dev D2H"
stop 1
endif
else
......@@ -1666,7 +1680,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy aIntern_dev H2D"
stop 1
endif
else
......@@ -1788,7 +1802,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
&: error in cudaMemcpy aIntern_dev D2H"
stop 1
endif
......@@ -2060,7 +2074,7 @@
if (.not. successCUDA) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error cudaMemcpy"
&: error cudaMemcpy aIntern_dev D2D"
stop 1
end if
end do
......@@ -2119,28 +2133,6 @@
if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
if (useGPU) then
! copy q to q_dev needed in trans_ev_band_to_full
successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! endif
endif !use GPU
! deallocate all working space
if (.not.(useGPU)) then
......@@ -2220,12 +2212,24 @@
stop 1
endif
deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when deallocating bcast_buffer "//errorMessage
stop 1
if (useGPU) then
nullify(bcast_buffer)
successCUDA = cuda_free_host(bcast_buffer_host)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaFreeHost bcast_buffer_host"
stop 1
endif
else
deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when deallocating bcast_buffer "//errorMessage
stop 1
endif
endif
deallocate(top_send_request, stat=istat, errmsg=errorMessage)
......@@ -2263,7 +2267,7 @@
if (useGPU) then
successCUDA = cuda_free(aIntern_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
print *,"trans_ev_tridi_to_band_complex: error in cudaFree aIntern_dev"
stop 1
endif
......@@ -2271,15 +2275,17 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
&: error in cudaFree hh_tau_dev"
stop 1
endif
deallocate(row_group, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
nullify(row_group)
successCUDA = cuda_free_host(row_group_host)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error when deallocating row_group "//errorMessage
&: error in cudaFreeHost row_group_host"
stop 1
endif
......@@ -2287,7 +2293,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
&: error in cudaFree row_group_dev"
stop 1
endif
......@@ -2295,7 +2301,7 @@
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
&: error in cudaFree bcast_buffer_dev"
stop 1
endif
endif ! useGPU
......
......@@ -55,7 +55,7 @@
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nb, nblk, a_mat, a_dev, lda, d, e, matrixCols, &
(obj, na, nb, nblk, a_mat, lda, d, e, matrixCols, &
hh_trans, mpi_comm_rows, mpi_comm_cols, communicator, useGPU, wantDebug, nrThreads)
!-------------------------------------------------------------------------------
! tridiag_band_real/complex:
......@@ -104,7 +104,6 @@
#else
MATH_DATATYPE(kind=rck), intent(in) :: a_mat(lda,matrixCols)
#endif
integer(kind=c_intptr_t) :: a_dev
real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
MATH_DATATYPE(kind=rck), intent(out), allocatable :: hh_trans(:,:)
......@@ -240,7 +239,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
&(obj,a_mat, a_dev, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
&(obj,a_mat, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
! Calculate the workload for each sweep in the back transformation
! and the space requirements to hold the HH vectors
......
......@@ -51,7 +51,7 @@ subroutine redist_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, a_mat, a_dev, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
(obj, a_mat, lda, na, nblk, nbw, matrixCols, mpi_comm_rows, mpi_comm_cols, communicator, ab, useGPU)
use elpa_abstract_impl
use elpa2_workload