Commit b48ea9a0 authored by Andreas Marek's avatar Andreas Marek
Browse files

Introduce memcpy

parent 499734d2
...@@ -70,7 +70,7 @@ function elpa_solve_evp_& ...@@ -70,7 +70,7 @@ function elpa_solve_evp_&
&_impl (obj, & &_impl (obj, &
#endif /* ACTIVATE_SKEW */ #endif /* ACTIVATE_SKEW */
aExtern, & aExtern, &
ev, & evExtern, &
qExtern) result(success) qExtern) result(success)
#else /* DEVICE_POINTER */ #else /* DEVICE_POINTER */
...@@ -88,7 +88,7 @@ function elpa_solve_evp_& ...@@ -88,7 +88,7 @@ function elpa_solve_evp_&
&_impl (obj, & &_impl (obj, &
#endif /* ACTIVATE_SKEW */ #endif /* ACTIVATE_SKEW */
aExtern, & aExtern, &
ev, & evExtern, &
qExtern) result(success) qExtern) result(success)
#endif /* DEVICE_POINTER */ #endif /* DEVICE_POINTER */
...@@ -112,11 +112,12 @@ function elpa_solve_evp_& ...@@ -112,11 +112,12 @@ function elpa_solve_evp_&
#include "../general/precision_kinds.F90" #include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj class(elpa_abstract_impl_t), intent(inout) :: obj
#ifdef DEVICE_POINTER #ifdef DEVICE_POINTER
type(c_ptr) :: ev type(c_ptr) :: evExtern
#else #else
real(kind=REAL_DATATYPE), intent(out) :: ev(obj%na) real(kind=REAL_DATATYPE), target, intent(out) :: evExtern(obj%na)
#endif #endif
real(kind=REAL_DATATYPE), pointer :: ev(:)
#ifdef DEVICE_POINTER #ifdef DEVICE_POINTER
!#ifdef REDISTRIBUTE_MATRIX !#ifdef REDISTRIBUTE_MATRIX
...@@ -242,6 +243,13 @@ function elpa_solve_evp_& ...@@ -242,6 +243,13 @@ function elpa_solve_evp_&
logical :: reDistributeMatrix, doRedistributeMatrix logical :: reDistributeMatrix, doRedistributeMatrix
logical :: successGPU
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
#ifdef ACTIVATE_SKEW #ifdef ACTIVATE_SKEW
call obj%timer%start("elpa_solve_skew_evp_& call obj%timer%start("elpa_solve_skew_evp_&
#else #else
...@@ -252,11 +260,144 @@ function elpa_solve_evp_& ...@@ -252,11 +260,144 @@ function elpa_solve_evp_&
&PRECISION& &PRECISION&
&") &")
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
#endif
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#endif
#else
nrThreads = 1
#endif /* WITH_OPENMP_TRADITIONAL */
if (gpu_vendor() == NVIDIA_GPU) then
call obj%get("gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for GPU. Aborting..."
stop
endif
if (gpu .eq. 1) then
print *,"You still use the deprecated option 'gpu', consider switching to 'nvidia-gpu'. Will set the new keyword &
& 'nvidia-gpu' now"
call obj%set("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for NVIDIA GPU. Aborting..."
stop
endif
endif
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else if (gpu_vendor() == AMD_GPU) then
call obj%get("amd-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu = 0
endif
if (gpu .eq. 1) then
useGPU =.true.
else
#ifdef DEVICE_POINTER
print *,"You used the interface for device pointers but did not specify GPU usage!. Aborting..."
stop
#endif
useGPU = .false.
endif
do_useGPU = .false.
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
call set_gpu_parameters()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case at the moment only _1_ thread internally is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
do_useGPU_tridiag = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev = do_useGPU
! only if we want (and can) use GPU in general, look what are the
! requirements for individual routines. Implicitly they are all set to 1, so
! unles specified otherwise by the user, GPU versions of all individual
! routines should be used
if(do_useGPU) then
call obj%get("gpu_tridiag", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_tridiag. Aborting..."
stop
endif
do_useGPU_tridiag = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_solve_tridi. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_trans_ev. Aborting..."
stop
endif
do_useGPU_trans_ev = (gpu == 1)
endif
! for elpa1 the easy thing is, that the individual phases of the algorithm
! do not share any data on the GPU.
#ifdef DEVICE_POINTER #ifdef DEVICE_POINTER
#ifndef REDISTRIBUTE_MATRIX #ifndef REDISTRIBUTE_MATRIX
! in case that aExtern and qExtern are device pointers ! in case that aExtern and qExtern are device pointers
! we have to allocate aIntern and qIntern and ! we have to allocate aIntern and qIntern and
! point a and q to this ! point a and q to this
allocate(aIntern(1:obj%local_nrows,1:obj%local_ncols)) allocate(aIntern(1:obj%local_nrows,1:obj%local_ncols))
a => aIntern(1:obj%local_nrows,1:obj%local_ncols) a => aIntern(1:obj%local_nrows,1:obj%local_ncols)
...@@ -270,15 +411,25 @@ function elpa_solve_evp_& ...@@ -270,15 +411,25 @@ function elpa_solve_evp_&
#endif #endif
endif endif
#endif #endif /* REDISTRIBUTE_MATRIX */
allocate(ev(1:obj%na))
! and copy aExtern to aIntern
!TODO: intel gpu
successGPU = gpu_memcpy(c_loc(aIntern(1,1)), aExtern, obj%local_nrows*obj%local_ncols*size_of_datatype, &
gpuMemcpyDeviceToHost)
check_memcpy_gpu("elpa1: aExtern -> aIntern", successGPU)
#else /* DEVICE_POINTER */ #else /* DEVICE_POINTER */
! aIntern, qIntern are normally pointers, ! aIntern, qIntern, are normally pointers,
! in case of redistribute aIntern, qIntern, are arrays storing the internally ! in case of redistribute aIntern, qIntern, are arrays storing the internally
! redistributed matrix ! redistributed matrix
! in case of redistribute matrix the pointers will be reassigned ! in case of redistribute matrix the pointers will be reassigned
! ev is always a pointer
#ifndef REDISTRIBUTE_MATRIX #ifndef REDISTRIBUTE_MATRIX
aIntern => aExtern(1:obj%local_nrows,1:obj%local_ncols) aIntern => aExtern(1:obj%local_nrows,1:obj%local_ncols)
a => aIntern(1:obj%local_nrows,1:obj%local_ncols) a => aIntern(1:obj%local_nrows,1:obj%local_ncols)
...@@ -292,8 +443,9 @@ function elpa_solve_evp_& ...@@ -292,8 +443,9 @@ function elpa_solve_evp_&
q => qIntern(1:obj%local_nrows,1:obj%local_ncols) q => qIntern(1:obj%local_nrows,1:obj%local_ncols)
#endif #endif
endif endif
#endif /* REDISTRIBUTE_MATRIX */
#endif /* REDISTRIBUTE_MATRIX */
ev => evExtern(1:obj%na)
#endif /* DEVICE_POINTER */ #endif /* DEVICE_POINTER */
reDistributeMatrix = .false. reDistributeMatrix = .false.
...@@ -310,27 +462,6 @@ function elpa_solve_evp_& ...@@ -310,27 +462,6 @@ function elpa_solve_evp_&
call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr) call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
my_pe = int(my_peMPI,kind=c_int) my_pe = int(my_peMPI,kind=c_int)
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
omp_threads_caller = omp_get_max_threads()
! check the number of threads that ELPA should use internally
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
call obj%get("limit_openmp_threads",limitThreads,error)
if (limitThreads .eq. 0) then
#endif
call obj%get("omp_threads",nrThreads,error)
call omp_set_num_threads(nrThreads)
#if defined(THREADING_SUPPORT_CHECK) && defined(ALLOW_THREAD_LIMITING) && !defined(HAVE_SUFFICIENT_MPI_THREADING_SUPPORT)
else
nrThreads = 1
call omp_set_num_threads(nrThreads)
endif
#endif
#else
nrThreads = 1
#endif
#ifdef WITH_NVTX #ifdef WITH_NVTX
call nvtxRangePush("elpa1") call nvtxRangePush("elpa1")
#endif #endif
...@@ -349,18 +480,11 @@ function elpa_solve_evp_& ...@@ -349,18 +480,11 @@ function elpa_solve_evp_&
endif endif
success = .true. success = .true.
#ifndef DEVICE_POINTER
!#ifdef REDISTRIBUTE_MATRIX
if (present(qExtern)) then if (present(qExtern)) then
!#else
! if (present(q)) then
!#endif
obj%eigenvalues_only = .false. obj%eigenvalues_only = .false.
else else
obj%eigenvalues_only = .true. obj%eigenvalues_only = .true.
endif endif
#endif
na = obj%na na = obj%na
nev = obj%nev nev = obj%nev
...@@ -383,11 +507,14 @@ function elpa_solve_evp_& ...@@ -383,11 +507,14 @@ function elpa_solve_evp_&
#ifdef REDISTRIBUTE_MATRIX #ifdef REDISTRIBUTE_MATRIX
#include "../helpers/elpa_redistribute_template.F90" #include "../helpers/elpa_redistribute_template.F90"
#endif /* REDISTRIBUTE_MATRIX */ #endif /* REDISTRIBUTE_MATRIX */
#else
#ifdef REDISTRIBUTE_MATRIX
print *,"Device pointer + REDIST"
#endif /* REDISTRIBUTE_MATRIX */
#endif #endif
! special case na = 1 ! special case na = 1
if (na .eq. 1) then if (na .eq. 1) then
#ifndef DEVICE_POINTER
#if REALCASE == 1 #if REALCASE == 1
ev(1) = a(1,1) ev(1) = a(1,1)
#endif #endif
...@@ -397,7 +524,6 @@ function elpa_solve_evp_& ...@@ -397,7 +524,6 @@ function elpa_solve_evp_&
if (.not.(obj%eigenvalues_only)) then if (.not.(obj%eigenvalues_only)) then
q(1,1) = ONE q(1,1) = ONE
endif endif
#endif
! restore original OpenMP settings ! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL #ifdef WITH_OPENMP_TRADITIONAL
...@@ -420,48 +546,6 @@ function elpa_solve_evp_& ...@@ -420,48 +546,6 @@ function elpa_solve_evp_&
obj%eigenvalues_only = .true. obj%eigenvalues_only = .true.
endif endif
if (gpu_vendor() == NVIDIA_GPU) then
call obj%get("gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for GPU. Aborting..."
stop
endif
if (gpu .eq. 1) then
print *,"You still use the deprecated option 'gpu', consider switching to 'nvidia-gpu'. Will set the new keyword &
& 'nvidia-gpu' now"
call obj%set("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem setting option for NVIDIA GPU. Aborting..."
stop
endif
endif
call obj%get("nvidia-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for NVIDIA GPU. Aborting..."
stop
endif
else if (gpu_vendor() == AMD_GPU) then
call obj%get("amd-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for AMD GPU. Aborting..."
stop
endif
else if (gpu_vendor() == INTEL_GPU) then
call obj%get("intel-gpu",gpu,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for INTEL GPU. Aborting..."
stop
endif
else
gpu = 0
endif
if (gpu .eq. 1) then
useGPU =.true.
else
useGPU = .false.
endif
#ifdef ACTIVATE_SKEW #ifdef ACTIVATE_SKEW
!call obj%get("is_skewsymmetric",skewsymmetric,error) !call obj%get("is_skewsymmetric",skewsymmetric,error)
!if (error .ne. ELPA_OK) then !if (error .ne. ELPA_OK) then
...@@ -496,70 +580,7 @@ function elpa_solve_evp_& ...@@ -496,70 +580,7 @@ function elpa_solve_evp_&
stop stop
endif endif
wantDebug = debug == 1 wantDebug = debug == 1
do_useGPU = .false.
if (useGPU) then
call obj%timer%start("check_for_gpu")
if (check_for_gpu(obj, my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
call set_gpu_parameters()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case at the moment only _1_ thread internally is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
do_useGPU_tridiag = do_useGPU
do_useGPU_solve_tridi = do_useGPU
do_useGPU_trans_ev = do_useGPU
! only if we want (and can) use GPU in general, look what are the
! requirements for individual routines. Implicitly they are all set to 1, so
! unles specified otherwise by the user, GPU versions of all individual
! routines should be used
if(do_useGPU) then
call obj%get("gpu_tridiag", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_tridiag. Aborting..."
stop
endif
do_useGPU_tridiag = (gpu == 1)
call obj%get("gpu_solve_tridi", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_solve_tridi. Aborting..."
stop
endif
do_useGPU_solve_tridi = (gpu == 1)
call obj%get("gpu_trans_ev", gpu, error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option for gpu_trans_ev. Aborting..."
stop
endif
do_useGPU_trans_ev = (gpu == 1)
endif
! for elpa1 the easy thing is, that the individual phases of the algorithm
! do not share any data on the GPU.
#ifndef DEVICE_POINTER
! allocate a dummy q_intern, if eigenvectors should not be commputed and thus q is NOT present ! allocate a dummy q_intern, if eigenvectors should not be commputed and thus q is NOT present
if (.not.(obj%eigenvalues_only)) then if (.not.(obj%eigenvalues_only)) then
q_actual => q(1:matrixRows,1:matrixCols) q_actual => q(1:matrixRows,1:matrixCols)
...@@ -568,7 +589,6 @@ function elpa_solve_evp_& ...@@ -568,7 +589,6 @@ function elpa_solve_evp_&
check_allocate("elpa1_template: q_dummy", istat, errorMessage) check_allocate("elpa1_template: q_dummy", istat, errorMessage)
q_actual => q_dummy q_actual => q_dummy
endif endif
#endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
...@@ -597,14 +617,12 @@ function elpa_solve_evp_& ...@@ -597,14 +617,12 @@ function elpa_solve_evp_&
call nvtxRangePush("tridi") call nvtxRangePush("tridi")
#endif #endif
#ifndef DEVICE_POINTER
call tridiag_& call tridiag_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION& &PRECISION&
& (obj, na, a, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, & & (obj, na, a, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, &
nrThreads, isSkewsymmetric) nrThreads, isSkewsymmetric)
#endif
#ifdef WITH_NVTX #ifdef WITH_NVTX
call nvtxRangePop() call nvtxRangePop()
...@@ -623,7 +641,6 @@ function elpa_solve_evp_& ...@@ -623,7 +641,6 @@ function elpa_solve_evp_&
#ifdef WITH_NVTX #ifdef WITH_NVTX
call nvtxRangePush("solve") call nvtxRangePush("solve")
#endif #endif
#ifndef DEVICE_POINTER
call solve_tridi_& call solve_tridi_&
&PRECISION& &PRECISION&
& (obj, na, nev, ev, e, & & (obj, na, nev, ev, e, &
...@@ -635,7 +652,6 @@ function elpa_solve_evp_& ...@@ -635,7 +652,6 @@ function elpa_solve_evp_&
#endif #endif
nblk, matrixCols, mpi_comm_all, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, & nblk, matrixCols, mpi_comm_all, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, &
success, nrThreads) success, nrThreads)
#endif
#ifdef WITH_NVTX #ifdef WITH_NVTX
call nvtxRangePop() call nvtxRangePop()
...@@ -667,13 +683,11 @@ function elpa_solve_evp_& ...@@ -667,13 +683,11 @@ function elpa_solve_evp_&
endif endif
check_pd = 0 check_pd = 0
#ifndef DEVICE_POINTER
do i = 1, na do i = 1, na
if (ev(i) .gt. thres_pd) then if (ev(i) .gt. thres_pd) then
check_pd = check_pd + 1 check_pd = check_pd + 1
endif endif
enddo enddo
#endif
if (check_pd .lt. na) then if (check_pd .lt. na) then
! not positiv definite => eigenvectors needed ! not positiv definite => eigenvectors needed
do_trans_ev = .true. do_trans_ev = .true.
...@@ -685,16 +699,13 @@ function elpa_solve_evp_& ...@@ -685,16 +699,13 @@ function elpa_solve_evp_&
if (do_trans_ev) then if (do_trans_ev) then
! q must be given thats why from here on we can use q and not q_actual ! q must be given thats why from here on we can use q and not q_actual
#ifndef DEVICE_POINTER
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
#endif
#endif #endif
if (isSkewsymmetric) then if (isSkewsymmetric) then
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D. ! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex. ! This makes the eigenvectors complex.
! For now real part of eigenvectors is generated in first half of q, imaginary part in second part. ! For now real part of eigenvectors is generated in first half of q, imaginary part in second part.
#ifndef DEVICE_POINTER
q(1:matrixRows, matrixCols+1:2*matrixCols) = 0.0 q(1:matrixRows, matrixCols+1:2*matrixCols) = 0.0
do i = 1, matrixRows do i = 1, matrixRows
! global_index = indxl2g(i, nblk, my_prow, 0, np_rows) ! global_index = indxl2g(i, nblk, my_prow, 0, np_rows)
...@@ -714,7 +725,6 @@ function elpa_solve_evp_& ...@@ -714,7 +725,6 @@ function elpa_solve_evp_&
q(i,1:matrixCols) = 0 q(i,1:matrixCols) = 0
end if end if
end do end do
#endif
endif endif
call obj%timer%start("back") call obj%timer%start("back")
...@@ -725,7 +735,6 @@ function elpa_solve_evp_& ...@@ -725,7 +735,6 @@ function elpa_solve_evp_&
call nvtxRangePush("trans_ev") call nvtxRangePush("trans_ev")
#endif #endif
#ifndef DEVICE_POINTER
! In the skew-symmetric case this transforms the real part