Commit 1b92158d authored by Pavel Kus's avatar Pavel Kus

allocating gpu memory and making the logic call the BLAS kernel for GPU

as well. Has to be improved later (since maybe the whole GPU
infrastructure might change)
parent 633fb738
......@@ -128,16 +128,21 @@
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef BLAS_KERNEL
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define BLAS_KERNEL ELPA_2STAGE_REAL_BLAS_BLOCK4
#define KERNEL_STRING "real_kernel"
#endif
#if COMPLEXCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef BLAS_KERNEL
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
! TODO blas complex kernel
#define BLAS_KERNEL ELPA_2STAGE_REAL_BLAS_BLOCK4
#define KERNEL_STRING "complex_kernel"
#endif
......@@ -319,30 +324,34 @@
! check consistency between request for GPUs and defined kernel
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
else if (nblk .ne. 128) then
write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
kernel = GENERIC_KERNEL
endif
if (kernel .ne. BLAS_KERNEL) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
else if (nblk .ne. 128) then
write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
kernel = GENERIC_KERNEL
endif
endif ! blas kernel TODO improve this logic
endif
! check again, now kernel and do_useGPU_trans_ev_tridi_to_band should be
! finally consistent
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
if (kernel .ne. BLAS_KERNEL) then ! for BLAS kernel both GPU and CPU possible TODO maybe should have BLAS_GPU_KERNEL?
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
endif
endif
else
if (kernel .eq. GPU_KERNEL) then
......
......@@ -192,6 +192,10 @@
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
! at the moment, maximal 4 eigenvector at a time
! TODO increase it
integer(kind=ik), parameter :: max_block_blas = 4
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP
......@@ -611,6 +615,49 @@
#endif /* WITH_OPENMP */
endif !useGPU_LEGACY
if(useGPU_BLAS) then
! prepare device memory for the BLAS GPU kernel
! dimensions correspondence:
! nbw = ldh = nb
! stripe_width = ldq = nq = nl
! max_block_blas = 4 (currently)
successCUDA = cuda_malloc( h_dev, max_block_blas * (nbw+3) * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
successCUDA = cuda_malloc( s_dev, max_block_blas * max_block_blas * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
successCUDA = cuda_malloc( q2_dev, stripe_width * (nbw+3) * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
successCUDA = cuda_malloc( w_dev, stripe_width * max_block_blas * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
endif !useGPU_BLAS
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
......@@ -2218,7 +2265,11 @@
if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
if (useGPU_LEGACY) then
!if (useGPU_LEGACY) then
! TODO at the moment, create q on device for both GPU implementations
! it is assumed to be there by the following routines
! TODO revise this
if (useGPU) then
! copy q to q_dev needed in trans_ev_band_to_full
successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype)
if (.not.(successCUDA)) then
......@@ -2417,6 +2468,41 @@
endif
endif ! useGPU_LEGACY
if(useGPU_BLAS) then
successCUDA = cuda_free( h_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
successCUDA = cuda_free( s_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
successCUDA = cuda_free( q2_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
successCUDA = cuda_free( w_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
endif ! useGPU_BLAS
call obj%timer%stop("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment