Commit 2f62f542 authored by Pavel Kus's avatar Pavel Kus

individual routines of ELPA 2 can run on GPU

parent 0735c1a6
......@@ -125,6 +125,7 @@
real(kind=rk) :: eps
#endif
logical, intent(in) :: useGPU
character(20) :: gpuString
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows
......@@ -188,12 +189,18 @@
logical :: useGPU_reduction_lower_block_to_tridiagonal
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
PRECISION_SUFFIX // &
gpuString )
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
......@@ -1701,8 +1708,8 @@
call obj%timer%stop("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX //&
gpuString)
end subroutine bandred_&
&MATH_DATATYPE&
......
......@@ -106,7 +106,7 @@
integer(kind=c_int) :: istat, gpu, debug, qr
character(200) :: errorMessage
logical :: do_useGPU, do_useGPU_bandred, &
do_useGPU_tridi_band, do_useGPU_solve_tridi, &
do_useGPU_tridiag_band, do_useGPU_solve_tridi, &
do_useGPU_trans_ev_tridi_to_band, &
do_useGPU_trans_ev_band_to_full
integer(kind=c_int) :: numberOfGPUDevices
......@@ -214,38 +214,17 @@
print *,"Problem getting option. Aborting..."
stop
endif
! check consistency between request for GPUs and defined kernel
! GPU settings
call obj%get("gpu", gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
if (gpu == 1) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
else if (nblk .ne. 128) then
kernel = GENERIC_KERNEL
endif
endif
if (kernel .eq. GPU_KERNEL) then
if (gpu .ne. 1) then
write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel &
&is defined as the GPU kernel! Aborting..."
stop
!TODO do error handling properly
endif
endif
if (gpu .eq. 1) then
useGPU = .true.
else
useGPU = .false.
endif
do_useGPU = .false.
do_useGPU_trans_ev_tridi_to_band =.false.
useGPU = (gpu == 1)
do_useGPU = .false.
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
......@@ -264,21 +243,92 @@
endif
endif
do_useGPU_bandred = do_useGPU
do_useGPU_tridiag_band = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev_tridi_to_band = do_useGPU
do_useGPU_trans_ev_band_to_full = do_useGPU
! only if we want (and can) use GPU in general, look what are the
! requirements for individual routines. Implicitly they are all set to 1, so
! unles specified otherwise by the user, GPU versions of all individual
! routines should be used
if(do_useGPU) then
call obj%get("gpu_bandred", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_bandred = (gpu == 1)
call obj%get("gpu_tridiag_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_tridiag_band = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev_tridi_to_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_trans_ev_tridi_to_band = (gpu == 1)
call obj%get("gpu_trans_ev_band_to_full", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_trans_ev_band_to_full = (gpu == 1)
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
if (nblk .ne. 128) then
! cannot run on GPU with this blocksize
! disable GPU usage for trans_ev_tridi
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
else
if (kernel .eq. GPU_KERNEL) then
do_useGPU_trans_ev_tridi_to_band = .true.
else
do_useGPU_trans_ev_tridi_to_band = .false.
endif
else if (nblk .ne. 128) then
write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
kernel = GENERIC_KERNEL
endif
endif
! check again, now kernel and do_useGPU_trans_ev_tridi_to_band sould be
! finally consistent
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
endif
else
if (kernel .eq. GPU_KERNEL) then
! combination not allowed
write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel &
&is defined as the GPU kernel! Aborting..."
stop
!TODO do error handling properly
endif
endif
#if REALCASE == 1
#ifdef SINGLE_PRECISION_REAL
! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
......@@ -446,7 +496,7 @@
&PRECISION &
(obj, na, a, &
a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, do_useGPU, success &
tmat_dev, wantDebug, do_useGPU_bandred, success &
#if REALCASE == 1
, useQRActual &
#endif
......@@ -473,7 +523,7 @@
&_&
&PRECISION&
(obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
do_useGPU, wantDebug)
do_useGPU_tridiag_band, wantDebug)
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
......@@ -510,7 +560,7 @@
#if COMPLEXCASE == 1
q_real, ubound(q_real,dim=1), &
#endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU, wantDebug, success)
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success)
call obj%timer%stop("solve")
if (.not.(success)) return
endif ! do_solve_tridi
......@@ -592,9 +642,31 @@
endif
endif ! do_trans_to_band
! the array q might reside on device or host, depending on whether GPU is
! used or not. We thus have to transfer he data manually, if one of the
! routines is run on GPU and the other not.
! first deal with the situation that first backward step was on GPU
if(do_useGPU_trans_ev_tridi_to_band) then
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
endif
! if the last step is not required at all, or will be performed on CPU,
! release the memmory allocated on the device
if((.not. do_trans_to_full) .or. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_free(q_dev)
endif
endif
!TODO check that the memory is properly dealocated on the host in case that
!the last step is not required
if (do_trans_to_full) then
call obj%timer%start("trans_ev_to_full")
if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
......@@ -610,7 +682,7 @@
(obj, na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q, &
q_dev, &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1
, useQRActual &
#endif
......
......@@ -138,17 +138,26 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
integer :: blocking_factor, error
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX //&
gpuString)
#ifdef BAND_TO_FULL_BLOCKING
call obj%get("blocking_in_band_to_full",blocking_factor,error)
if (error .ne. ELPA_OK) then
......@@ -832,8 +841,8 @@
call obj%timer%stop("trans_ev_band_to_full_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX&
)
&PRECISION_SUFFIX //&
gpuString)
end subroutine trans_ev_band_to_full_&
&MATH_DATATYPE&
......
......@@ -189,6 +189,7 @@
logical :: success
integer(kind=ik) :: istat, print_flops
character(200) :: errorMessage
character(20) :: gpuString
logical :: successCUDA
#ifndef WITH_MPI
integer(kind=ik) :: j1
......@@ -198,11 +199,17 @@
&_&
&MATH_DATATYPE
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX //&
gpuString)
n_times = 0
if (useGPU) then
......@@ -2376,8 +2383,8 @@
call obj%timer%stop("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX&
)
&PRECISION_SUFFIX //&
gpuString)
return
!#if COMPLEXCASE == 1
......
......@@ -129,15 +129,24 @@
MATH_DATATYPE(kind=rck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
integer :: istat
character(200) :: errorMessage
character(20) :: gpuString
#ifndef WITH_MPI
integer(kind=ik) :: startAddr
#endif
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("tridiag_band_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX //&
gpuString)
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(communicator,my_pe,mpierr)
call mpi_comm_size(communicator,n_pes,mpierr)
......@@ -1198,8 +1207,8 @@
call obj%timer%stop("tridiag_band_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX&
)
&PRECISION_SUFFIX //&
gpuString)
! intel compiler bug makes these ifdefs necessary
#if REALCASE == 1
......
......@@ -157,6 +157,14 @@ static const elpa_index_int_entry_t int_entries[] = {
INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
number_of_real_kernels, real_kernel_enumerate, \
real_kernel_is_valid, real_kernel_name),
INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \
number_of_complex_kernels, complex_kernel_enumerate, \
complex_kernel_is_valid, complex_kernel_name),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment