Commit b48ea9a0 authored by Andreas Marek's avatar Andreas Marek
Browse files

Introduce memcpy

parent 499734d2
......@@ -70,7 +70,7 @@ function elpa_solve_evp_&
&_impl (obj, &
#endif /* ACTIVATE_SKEW */
aExtern, &
ev, &
evExtern, &
qExtern) result(success)
#else /* DEVICE_POINTER */
......@@ -88,7 +88,7 @@ function elpa_solve_evp_&
&_impl (obj, &
#endif /* ACTIVATE_SKEW */
aExtern, &
ev, &
evExtern, &
qExtern) result(success)
#endif /* DEVICE_POINTER */
......@@ -112,11 +112,12 @@ function elpa_solve_evp_&
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
#ifdef DEVICE_POINTER
type(c_ptr) :: ev
type(c_ptr) :: evExtern
#else
real(kind=REAL_DATATYPE), intent(out) :: ev(obj%na)
real(kind=REAL_DATATYPE), target, intent(out) :: evExtern(obj%na)
#endif
real(kind=REAL_DATATYPE), pointer :: ev(:)
#ifdef DEVICE_POINTER
!#ifdef REDISTRIBUTE_MATRIX
......@@ -242,6 +243,13 @@ function elpa_solve_evp_&
logical :: reDistributeMatrix, doRedistributeMatrix
logical :: successGPU
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
#ifdef ACTIVATE_SKEW
call obj%timer%start("elpa_solve_skew_evp_&
#else
......@@ -252,11 +260,144 @@ function elpa_solve_evp_&
&PRECISION&
&")
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
#endif
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#endif
#else
nrThreads = 1
#endif /* WITH_OPENMP_TRADITIONAL */
if (gpu_vendor() == NVIDIA_GPU) then
call obj%get("gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for GPU. Aborting..."
stop
endif
if (gpu .eq. 1) then
print *,"You still use the deprecated option 'gpu', consider switching to 'nvidia-gpu'. Will set the new keyword &
& 'nvidia-gpu' now"
call obj%set("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for NVIDIA GPU. Aborting..."
stop
endif
endif
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else if (gpu_vendor() == AMD_GPU) then
call obj%get("amd-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu = 0
endif
if (gpu .eq. 1) then
useGPU =.true.
else
#ifdef DEVICE_POINTER
print *,"You used the interface for device pointers but did not specify GPU usage!. Aborting..."
stop
#endif
useGPU = .false.
endif
do_useGPU = .false.
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
call set_gpu_parameters()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case at the moment only _1_ thread internally is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
do_useGPU_tridiag = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev = do_useGPU
! only if we want (and can) use GPU in general, look what are the
! requirements for individual routines. Implicitly they are all set to 1, so
! unles specified otherwise by the user, GPU versions of all individual
! routines should be used
if(do_useGPU) then
call obj%get("gpu_tridiag", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_tridiag. Aborting..."
stop
endif
do_useGPU_tridiag = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_solve_tridi. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_trans_ev. Aborting..."
stop
endif
do_useGPU_trans_ev = (gpu == 1)
endif
! for elpa1 the easy thing is, that the individual phases of the algorithm
! do not share any data on the GPU.
#ifdef DEVICE_POINTER
#ifndef REDISTRIBUTE_MATRIX
! in case that aExtern and qExtern are device pointers
! we have to allocate aIntern and qIntern and
! point a and q to this
allocate(aIntern(1:obj%local_nrows,1:obj%local_ncols))
a => aIntern(1:obj%local_nrows,1:obj%local_ncols)
......@@ -270,15 +411,25 @@ function elpa_solve_evp_&
#endif
endif
#endif
#endif /* REDISTRIBUTE_MATRIX */
allocate(ev(1:obj%na))
! and copy aExtern to aIntern
!TODO: intel gpu
successGPU = gpu_memcpy(c_loc(aIntern(1,1)), aExtern, obj%local_nrows*obj%local_ncols*size_of_datatype, &
gpuMemcpyDeviceToHost)
check_memcpy_gpu("elpa1: aExtern -> aIntern", successGPU)
#else /* DEVICE_POINTER */
! aIntern, qIntern are normally pointers,
! aIntern, qIntern, are normally pointers,
! in case of redistribute aIntern, qIntern, are arrays storing the internally
! redistributed matrix
! in case of redistribute matrix the pointers will be reassigned
! ev is always a pointer
#ifndef REDISTRIBUTE_MATRIX
aIntern => aExtern(1:obj%local_nrows,1:obj%local_ncols)
a => aIntern(1:obj%local_nrows,1:obj%local_ncols)
......@@ -292,8 +443,9 @@ function elpa_solve_evp_&
q => qIntern(1:obj%local_nrows,1:obj%local_ncols)
#endif
endif
#endif /* REDISTRIBUTE_MATRIX */
#endif /* REDISTRIBUTE_MATRIX */
ev => evExtern(1:obj%na)
#endif /* DEVICE_POINTER */
reDistributeMatrix = .false.
......@@ -310,27 +462,6 @@ function elpa_solve_evp_&
call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
my_pe = int(my_peMPI,kind=c_int)
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
#endif
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#endif
#else
nrThreads = 1
#endif
#ifdef WITH_NVTX
call nvtxRangePush("elpa1")
#endif
......@@ -349,18 +480,11 @@ function elpa_solve_evp_&
endif
success = .true.
#ifndef DEVICE_POINTER
!#ifdef REDISTRIBUTE_MATRIX
if (present(qExtern)) then
!#else
! if (present(q)) then
!#endif
obj%eigenvalues_only = .false.
else
obj%eigenvalues_only = .true.
endif
#endif
na = obj%na
nev = obj%nev
......@@ -383,11 +507,14 @@ function elpa_solve_evp_&
#ifdef REDISTRIBUTE_MATRIX
#include "../helpers/elpa_redistribute_template.F90"
#endif /* REDISTRIBUTE_MATRIX */
#else
#ifdef REDISTRIBUTE_MATRIX
print *,"Device pointer + REDIST"
#endif /* REDISTRIBUTE_MATRIX */
#endif
! special case na = 1
if (na .eq. 1) then
#ifndef DEVICE_POINTER
#if REALCASE == 1
ev(1) = a(1,1)
#endif
......@@ -397,7 +524,6 @@ function elpa_solve_evp_&
if (.not.(obj%eigenvalues_only)) then
q(1,1) = ONE
endif
#endif
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
......@@ -420,48 +546,6 @@ function elpa_solve_evp_&
obj%eigenvalues_only = .true.
endif
if (gpu_vendor() == NVIDIA_GPU) then
call obj%get("gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for GPU. Aborting..."
stop
endif
if (gpu .eq. 1) then
print *,"You still use the deprecated option 'gpu', consider switching to 'nvidia-gpu'. Will set the new keyword &
& 'nvidia-gpu' now"
call obj%set("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for NVIDIA GPU. Aborting..."
stop
endif
endif
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else if (gpu_vendor() == AMD_GPU) then
call obj%get("amd-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu = 0
endif
if (gpu .eq. 1) then
useGPU =.true.
else
useGPU = .false.
endif
#ifdef ACTIVATE_SKEW
!call obj%get("is_skewsymmetric",skewsymmetric,error)
!if (error .ne. ELPA_OK) then
......@@ -496,70 +580,7 @@ function elpa_solve_evp_&
stop
endif
wantDebug = debug == 1
do_useGPU = .false.
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
call set_gpu_parameters()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case at the moment only _1_ thread internally is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
do_useGPU_tridiag = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev = do_useGPU
! only if we want (and can) use GPU in general, look what are the
! requirements for individual routines. Implicitly they are all set to 1, so
! unles specified otherwise by the user, GPU versions of all individual
! routines should be used
if(do_useGPU) then
call obj%get("gpu_tridiag", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_tridiag. Aborting..."
stop
endif
do_useGPU_tridiag = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_solve_tridi. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_trans_ev. Aborting..."
stop
endif
do_useGPU_trans_ev = (gpu == 1)
endif
! for elpa1 the easy thing is, that the individual phases of the algorithm
! do not share any data on the GPU.
#ifndef DEVICE_POINTER
! allocate a dummy q_intern, if eigenvectors should not be commputed and thus q is NOT present
if (.not.(obj%eigenvalues_only)) then
q_actual => q(1:matrixRows,1:matrixCols)
......@@ -568,7 +589,6 @@ function elpa_solve_evp_&
check_allocate("elpa1_template: q_dummy", istat, errorMessage)
q_actual => q_dummy
endif
#endif
#if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
......@@ -597,14 +617,12 @@ function elpa_solve_evp_&
call nvtxRangePush("tridi")
#endif
#ifndef DEVICE_POINTER
call tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, a, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, &
nrThreads, isSkewsymmetric)
#endif
#ifdef WITH_NVTX
call nvtxRangePop()
......@@ -623,7 +641,6 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX
call nvtxRangePush("solve")
#endif
#ifndef DEVICE_POINTER
call solve_tridi_&
&PRECISION&
& (obj, na, nev, ev, e, &
......@@ -635,7 +652,6 @@ function elpa_solve_evp_&
#endif
nblk, matrixCols, mpi_comm_all, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, &
success, nrThreads)
#endif
#ifdef WITH_NVTX
call nvtxRangePop()
......@@ -667,13 +683,11 @@ function elpa_solve_evp_&
endif
check_pd = 0
#ifndef DEVICE_POINTER
do i = 1, na
if (ev(i) .gt. thres_pd) then
check_pd = check_pd + 1
endif
enddo
#endif
if (check_pd .lt. na) then
! not positiv definite => eigenvectors needed
do_trans_ev = .true.
......@@ -685,16 +699,13 @@ function elpa_solve_evp_&
if (do_trans_ev) then
! q must be given thats why from here on we can use q and not q_actual
#ifndef DEVICE_POINTER
#if COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
#endif
#endif
if (isSkewsymmetric) then
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex.
! For now real part of eigenvectors is generated in first half of q, imaginary part in second part.
#ifndef DEVICE_POINTER
q(1:matrixRows, matrixCols+1:2*matrixCols) = 0.0
do i = 1, matrixRows
! global_index = indxl2g(i, nblk, my_prow, 0, np_rows)
......@@ -714,7 +725,6 @@ function elpa_solve_evp_&
q(i,1:matrixCols) = 0
end if
end do
#endif
endif
call obj%timer%start("back")
......@@ -725,7 +735,6 @@ function elpa_solve_evp_&
call nvtxRangePush("trans_ev")
#endif
#ifndef DEVICE_POINTER
! In the skew-symmetric case this transforms the real part
call trans_ev_&
&MATH_DATATYPE&
......@@ -742,7 +751,6 @@ function elpa_solve_evp_&
& (obj, na, nev, a, matrixRows, tau, q(1:matrixRows, matrixCols+1:2*matrixCols), matrixRows, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
endif
#endif
#ifdef WITH_NVTX
call nvtxRangePop()
......@@ -774,7 +782,6 @@ function elpa_solve_evp_&
call omp_set_num_threads(omp_threads_caller)
#endif
#ifndef DEVICE_POINTER
#ifdef REDISTRIBUTE_MATRIX
! redistribute back if necessary
if (doRedistributeMatrix) then
......@@ -814,9 +821,23 @@ function elpa_solve_evp_&
call blacs_gridexit(blacs_ctxt_)
endif
#endif /* REDISTRIBUTE_MATRIX */
#ifdef DEVICE_POINTER
!copy qIntern and ev to provided device pointers
successGPU = gpu_memcpy(qExtern, c_loc(qIntern(1,1)), obj%local_nrows*obj%local_ncols*size_of_datatype, &
gpuMemcpyHostToDevice)
check_memcpy_gpu("elpa1: qIntern -> qExtern", successGPU)
successGPU = gpu_memcpy(evExtern, c_loc(ev(1)), obj%na*size_of_datatype, &
gpuMemcpyHostToDevice)
check_memcpy_gpu("elpa1: ev -> evExtern", successGPU)
deallocate(ev)
#else
nullify(ev)
#endif
#ifdef DEVICE_POINTER
#ifdef REDISTRIBUTE_MATRIX
! in case that aExtern and qExtern are device pointers
......@@ -825,11 +846,7 @@ function elpa_solve_evp_&
deallocate(aIntern)
if (present(qExtern)) then
#ifdef ACTIVATE_SKEW
deallocate(qIntern)
#else
deallocate(qIntern)
#endif
endif
#endif
#endif /* DEVICE_POINTER */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment