Commit 2f62f542 authored by Pavel Kus's avatar Pavel Kus
Browse files

individual routines of ELPA 2 can run on GPU

parent 0735c1a6
...@@ -125,6 +125,7 @@ ...@@ -125,6 +125,7 @@
real(kind=rk) :: eps real(kind=rk) :: eps
#endif #endif
logical, intent(in) :: useGPU logical, intent(in) :: useGPU
character(20) :: gpuString
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows integer(kind=ik) :: l_cols, l_rows
...@@ -188,12 +189,18 @@ ...@@ -188,12 +189,18 @@
logical :: useGPU_reduction_lower_block_to_tridiagonal logical :: useGPU_reduction_lower_block_to_tridiagonal
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("bandred_& call obj%timer%start("bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & PRECISION_SUFFIX // &
) gpuString )
useGPU_reduction_lower_block_to_tridiagonal = .false. useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then if (useGPU) then
...@@ -1701,8 +1708,8 @@ ...@@ -1701,8 +1708,8 @@
call obj%timer%stop("bandred_& call obj%timer%stop("bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX //&
) gpuString)
end subroutine bandred_& end subroutine bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
......
...@@ -106,7 +106,7 @@ ...@@ -106,7 +106,7 @@
integer(kind=c_int) :: istat, gpu, debug, qr integer(kind=c_int) :: istat, gpu, debug, qr
character(200) :: errorMessage character(200) :: errorMessage
logical :: do_useGPU, do_useGPU_bandred, & logical :: do_useGPU, do_useGPU_bandred, &
do_useGPU_tridi_band, do_useGPU_solve_tridi, & do_useGPU_tridiag_band, do_useGPU_solve_tridi, &
do_useGPU_trans_ev_tridi_to_band, & do_useGPU_trans_ev_tridi_to_band, &
do_useGPU_trans_ev_band_to_full do_useGPU_trans_ev_band_to_full
integer(kind=c_int) :: numberOfGPUDevices integer(kind=c_int) :: numberOfGPUDevices
...@@ -214,38 +214,17 @@ ...@@ -214,38 +214,17 @@
print *,"Problem getting option. Aborting..." print *,"Problem getting option. Aborting..."
stop stop
endif endif
! check consistency between request for GPUs and defined kernel
! GPU settings
call obj%get("gpu", gpu,error) call obj%get("gpu", gpu,error)
if (error .ne. ELPA_OK) then if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..." print *,"Problem getting option. Aborting..."
stop stop
endif endif
if (gpu == 1) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
else if (nblk .ne. 128) then
kernel = GENERIC_KERNEL
endif
endif
if (kernel .eq. GPU_KERNEL) then
if (gpu .ne. 1) then
write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel &
&is defined as the GPU kernel! Aborting..."
stop
!TODO do error handling properly
endif
endif
if (gpu .eq. 1) then
useGPU = .true.
else
useGPU = .false.
endif
do_useGPU = .false. useGPU = (gpu == 1)
do_useGPU_trans_ev_tridi_to_band =.false.
do_useGPU = .false.
if (useGPU) then if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
...@@ -264,21 +243,92 @@ ...@@ -264,21 +243,92 @@
endif endif
endif endif
do_useGPU_bandred = do_useGPU
do_useGPU_tridiag_band = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev_tridi_to_band = do_useGPU
do_useGPU_trans_ev_band_to_full = do_useGPU
! only if we want (and can) use GPU in general, look at what the
! requirements for individual routines are. Implicitly they are all set to 1, so
! unless specified otherwise by the user, GPU versions of all individual
! routines should be used
if(do_useGPU) then
call obj%get("gpu_bandred", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_bandred = (gpu == 1)
call obj%get("gpu_tridiag_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_tridiag_band = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev_tridi_to_band", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_trans_ev_tridi_to_band = (gpu == 1)
call obj%get("gpu_trans_ev_band_to_full", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
do_useGPU_trans_ev_band_to_full = (gpu == 1)
endif
! check consistency between request for GPUs and defined kernel ! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
if (nblk .ne. 128) then if (do_useGPU_trans_ev_tridi_to_band) then
! cannot run on GPU with this blocksize if (kernel .ne. GPU_KERNEL) then
! disable GPU usage for trans_ev_tridi write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false. do_useGPU_trans_ev_tridi_to_band = .false.
else else if (nblk .ne. 128) then
if (kernel .eq. GPU_KERNEL) then write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
do_useGPU_trans_ev_tridi_to_band = .true. write(error_unit,*) "The compute kernel will be executed on CPUs!"
else do_useGPU_trans_ev_tridi_to_band = .false.
do_useGPU_trans_ev_tridi_to_band = .false. kernel = GENERIC_KERNEL
endif
endif endif
endif endif
! check again, now kernel and do_useGPU_trans_ev_tridi_to_band should be
! finally consistent
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
endif
else
if (kernel .eq. GPU_KERNEL) then
! combination not allowed
write(error_unit,*) "ELPA: Warning, GPU usage has NOT been requested but compute kernel &
&is defined as the GPU kernel! Aborting..."
stop
!TODO do error handling properly
endif
endif
#if REALCASE == 1 #if REALCASE == 1
#ifdef SINGLE_PRECISION_REAL #ifdef SINGLE_PRECISION_REAL
! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now ! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
...@@ -446,7 +496,7 @@ ...@@ -446,7 +496,7 @@
&PRECISION & &PRECISION &
(obj, na, a, & (obj, na, a, &
a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, & a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, do_useGPU, success & tmat_dev, wantDebug, do_useGPU_bandred, success &
#if REALCASE == 1 #if REALCASE == 1
, useQRActual & , useQRActual &
#endif #endif
...@@ -473,7 +523,7 @@ ...@@ -473,7 +523,7 @@
&_& &_&
&PRECISION& &PRECISION&
(obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, & (obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
do_useGPU, wantDebug) do_useGPU_tridiag_band, wantDebug)
#ifdef WITH_MPI #ifdef WITH_MPI
call obj%timer%start("mpi_communication") call obj%timer%start("mpi_communication")
...@@ -510,7 +560,7 @@ ...@@ -510,7 +560,7 @@
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
q_real, ubound(q_real,dim=1), & q_real, ubound(q_real,dim=1), &
#endif #endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU, wantDebug, success) nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success)
call obj%timer%stop("solve") call obj%timer%stop("solve")
if (.not.(success)) return if (.not.(success)) return
endif ! do_solve_tridi endif ! do_solve_tridi
...@@ -592,9 +642,31 @@ ...@@ -592,9 +642,31 @@
endif endif
endif ! do_trans_to_band endif ! do_trans_to_band
! the array q might reside on device or host, depending on whether GPU is
! used or not. We thus have to transfer the data manually, if one of the
! routines is run on GPU and the other not.
! first deal with the situation that first backward step was on GPU
if(do_useGPU_trans_ev_tridi_to_band) then
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
endif
! if the last step is not required at all, or will be performed on CPU,
! release the memory allocated on the device
if((.not. do_trans_to_full) .or. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_free(q_dev)
endif
endif
!TODO check that the memory is properly deallocated on the host in case that
!the last step is not required
if (do_trans_to_full) then if (do_trans_to_full) then
call obj%timer%start("trans_ev_to_full") call obj%timer%start("trans_ev_to_full")
if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU ! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype) successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
...@@ -610,7 +682,7 @@ ...@@ -610,7 +682,7 @@
(obj, na, nev, nblk, nbw, a, & (obj, na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q, & a_dev, lda, tmat, tmat_dev, q, &
q_dev, & q_dev, &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU & ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1 #if REALCASE == 1
, useQRActual & , useQRActual &
#endif #endif
......
...@@ -138,17 +138,26 @@ ...@@ -138,17 +138,26 @@
integer(kind=ik) :: istat integer(kind=ik) :: istat
character(200) :: errorMessage character(200) :: errorMessage
character(20) :: gpuString
logical :: successCUDA logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION& &PRECISION&
&_& &_&
&MATH_DATATYPE &MATH_DATATYPE
integer :: blocking_factor, error integer :: blocking_factor, error
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("trans_ev_band_to_full_& call obj%timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX //&
) gpuString)
#ifdef BAND_TO_FULL_BLOCKING #ifdef BAND_TO_FULL_BLOCKING
call obj%get("blocking_in_band_to_full",blocking_factor,error) call obj%get("blocking_in_band_to_full",blocking_factor,error)
if (error .ne. ELPA_OK) then if (error .ne. ELPA_OK) then
...@@ -832,8 +841,8 @@ ...@@ -832,8 +841,8 @@
call obj%timer%stop("trans_ev_band_to_full_& call obj%timer%stop("trans_ev_band_to_full_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX& &PRECISION_SUFFIX //&
) gpuString)
end subroutine trans_ev_band_to_full_& end subroutine trans_ev_band_to_full_&
&MATH_DATATYPE& &MATH_DATATYPE&
......
...@@ -189,6 +189,7 @@ ...@@ -189,6 +189,7 @@
logical :: success logical :: success
integer(kind=ik) :: istat, print_flops integer(kind=ik) :: istat, print_flops
character(200) :: errorMessage character(200) :: errorMessage
character(20) :: gpuString
logical :: successCUDA logical :: successCUDA
#ifndef WITH_MPI #ifndef WITH_MPI
integer(kind=ik) :: j1 integer(kind=ik) :: j1
...@@ -198,11 +199,17 @@ ...@@ -198,11 +199,17 @@
&_& &_&
&MATH_DATATYPE &MATH_DATATYPE
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("trans_ev_tridi_to_band_& call obj%timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX //&
) gpuString)
n_times = 0 n_times = 0
if (useGPU) then if (useGPU) then
...@@ -2376,8 +2383,8 @@ ...@@ -2376,8 +2383,8 @@
call obj%timer%stop("trans_ev_tridi_to_band_& call obj%timer%stop("trans_ev_tridi_to_band_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX& &PRECISION_SUFFIX //&
) gpuString)
return return
!#if COMPLEXCASE == 1 !#if COMPLEXCASE == 1
......
...@@ -129,15 +129,24 @@ ...@@ -129,15 +129,24 @@
MATH_DATATYPE(kind=rck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) MATH_DATATYPE(kind=rck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
integer :: istat integer :: istat
character(200) :: errorMessage character(200) :: errorMessage
character(20) :: gpuString
#ifndef WITH_MPI #ifndef WITH_MPI
integer(kind=ik) :: startAddr integer(kind=ik) :: startAddr
#endif #endif
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("tridiag_band_& call obj%timer%start("tridiag_band_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX //&
) gpuString)
if (wantDebug) call obj%timer%start("mpi_communication") if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(communicator,my_pe,mpierr) call mpi_comm_rank(communicator,my_pe,mpierr)
call mpi_comm_size(communicator,n_pes,mpierr) call mpi_comm_size(communicator,n_pes,mpierr)
...@@ -1198,8 +1207,8 @@ ...@@ -1198,8 +1207,8 @@
call obj%timer%stop("tridiag_band_& call obj%timer%stop("tridiag_band_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX& &PRECISION_SUFFIX //&
) gpuString)
! intel compiler bug makes these ifdefs necessary ! intel compiler bug makes these ifdefs necessary
#if REALCASE == 1 #if REALCASE == 1
......
...@@ -157,6 +157,14 @@ static const elpa_index_int_entry_t int_entries[] = { ...@@ -157,6 +157,14 @@ static const elpa_index_int_entry_t int_entries[] = {
INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \ INT_ENTRY("real_kernel", "Real kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_REAL_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, \
number_of_real_kernels, real_kernel_enumerate, \ number_of_real_kernels, real_kernel_enumerate, \
real_kernel_is_valid, real_kernel_name), real_kernel_is_valid, real_kernel_name),
INT_ENTRY("gpu_bandred", "Use GPU acceleration for ELPA2 band reduction", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("gpu_tridiag_band", "Use GPU acceleration for ELPA2 tridiagonalization", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("gpu_trans_ev_tridi_to_band", "Use GPU acceleration for ELPA2 trans_ev_tridi_to_band", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("gpu_trans_ev_band_to_full", "Use GPU acceleration for ELPA2 trans_ev_band_to_full", 1, ELPA_AUTOTUNE_NOT_TUNABLE, ELPA_AUTOTUNE_DOMAIN_ANY,
cardinality_bool, enumerate_identity, NULL, NULL),
INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \ INT_ENTRY("complex_kernel", "Complex kernel to use if 'solver' is set to ELPA_SOLVER_2STAGE", ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_COMPLEX, \
number_of_complex_kernels, complex_kernel_enumerate, \ number_of_complex_kernels, complex_kernel_enumerate, \
complex_kernel_is_valid, complex_kernel_name), complex_kernel_is_valid, complex_kernel_name),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment