Commit 0735c1a6 authored by Pavel Kus's avatar Pavel Kus

reordering elpa2_template to bring gpu handling logic together

parent 8225bb5b
......@@ -105,7 +105,10 @@
logical :: wantDebug
integer(kind=c_int) :: istat, gpu, debug, qr
character(200) :: errorMessage
logical :: do_useGPU, do_useGPU_trans_ev_tridi
logical :: do_useGPU, do_useGPU_bandred, &
do_useGPU_tridi_band, do_useGPU_solve_tridi, &
do_useGPU_trans_ev_tridi_to_band, &
do_useGPU_trans_ev_band_to_full
integer(kind=c_int) :: numberOfGPUDevices
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
......@@ -156,6 +159,32 @@
nblk = obj%nblk
matrixCols = obj%local_ncols
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("mpi_comm_parent",mpi_comm_all,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call obj%timer%stop("mpi_communication")
! special case na = 1
if (na .eq. 1) then
#if REALCASE == 1
......@@ -208,6 +237,48 @@
endif
endif
if (gpu .eq. 1) then
useGPU = .true.
else
useGPU = .false.
endif
do_useGPU = .false.
do_useGPU_trans_ev_tridi_to_band =.false.
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
if (nblk .ne. 128) then
! cannot run on GPU with this blocksize
! disable GPU usage for trans_ev_tridi
do_useGPU_trans_ev_tridi_to_band = .false.
else
if (kernel .eq. GPU_KERNEL) then
do_useGPU_trans_ev_tridi_to_band = .true.
else
do_useGPU_trans_ev_tridi_to_band = .false.
endif
endif
endif
#if REALCASE == 1
#ifdef SINGLE_PRECISION_REAL
! special case at the moment NO single precision kernels on POWER 8 -> set GENERIC for now
......@@ -230,27 +301,6 @@
#endif
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("mpi_comm_parent",mpi_comm_all,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
if (gpu .eq. 1) then
useGPU = .true.
else
useGPU = .false.
endif
#if REALCASE == 1
call obj%get("qr",qr,error)
......@@ -265,15 +315,6 @@
endif
#endif
call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call obj%timer%stop("mpi_communication")
call obj%get("debug",debug,error)
if (error .ne. ELPA_OK) then
......@@ -282,8 +323,6 @@
endif
wantDebug = debug == 1
do_useGPU = .false.
do_useGPU_trans_ev_tridi =.false.
#if REALCASE == 1
......@@ -304,39 +343,6 @@
endif
#endif /* REALCASE */
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
if (nblk .ne. 128) then
! cannot run on GPU with this blocksize
! disable GPU usage for trans_ev_tridi
do_useGPU_trans_ev_tridi = .false.
else
if (kernel .eq. GPU_KERNEL) then
do_useGPU_trans_ev_tridi = .true.
else
do_useGPU_trans_ev_tridi = .false.
endif
endif
endif
if (.not. obj%eigenvalues_only) then
......@@ -569,7 +575,7 @@
&PRECISION &
(obj, na, nev, nblk, nbw, q, &
q_dev, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
success=success, kernel=kernel)
call obj%timer%stop("trans_ev_to_band")
......@@ -588,7 +594,7 @@
if (do_trans_to_full) then
call obj%timer%start("trans_ev_to_full")
if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi) ) then
if ( (do_useGPU) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment