Commit cc266ee2 authored by Andreas Marek

Merge branch 'master_pre_stage' into 'master'

Master pre stage

See merge request !58
parents 1d2b3806 20206932
......@@ -4,7 +4,7 @@ We are very happy and grateful if you are willing to help us improve *ELPA*.
Thus, we would like to make this process as simple as possible for you,
but at the same time still keep it manageable for us.
For recommendations and suggestions, a simple email to us (*elpa-libray at mpcdf.mpg.de*) is sufficient!
For recommendations and suggestions, a simple email to us (*elpa-library at mpcdf.mpg.de*) is sufficient!
If you would like to share with us your improvements, we suggest the following ways:
......
......@@ -2,6 +2,9 @@ Changelog for next release
- not yet decided
Upcoming changes for ELPA 2021.05.001
- allow the user to set the mapping of MPI tasks to GPU id per set/get
Changelog for ELPA 2020.11.001
- this release contains mostly bugfixes:
......
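For the new per-task GPU mapping announced in the changelog entry above, here is a minimal C sketch of how a caller might use it, based on the "gpu" and "use_gpu_id" options documented later in this merge. The handle setup, the error handling, and the rank-modulo-2 device mapping are illustrative assumptions, not part of the commit:

  /* sketch only: assumes an initialized elpa_t handle "handle" and the MPI rank in "my_rank" */
  int error;
  elpa_set(handle, "gpu", 1, &error);                /* enable the GPU code paths, 1=on, 0=off */
  if (error != ELPA_OK) { /* handle the error */ }
  /* assign this MPI task explicitly to a GPU id; a simple round-robin over two
     devices is used here purely as an illustration of the new option */
  elpa_set(handle, "use_gpu_id", my_rank % 2, &error);
  if (error != ELPA_OK) { /* handle the error */ }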
......@@ -138,7 +138,7 @@ libelpa@SUFFIX@_private_la_SOURCES += \
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
libelpa@SUFFIX@_private_la_SOURCES += src/GPU/elpa_index_gpu.cu src/GPU/cudaFunctions.cu src/GPU/cuUtils.cu src/elpa2/GPU/ev_tridi_band_gpu_real.cu src/elpa2/GPU/ev_tridi_band_gpu_complex.cu
endif
if !WITH_MPI
......
......@@ -24,6 +24,10 @@ gpu_flag = {
0: "-DTEST_GPU=0",
1: "-DTEST_GPU=1",
}
gpu_id_flag = {
0: "-DTEST_GPU_SET_ID=0",
1: "-DTEST_GPU_SET_ID=1",
}
matrix_flag = {
"random": "-DTEST_MATRIX_RANDOM",
......@@ -57,9 +61,10 @@ split_comm_flag = {
"by_elpa": ""
}
for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
for lang, m, g, gid, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
sorted(matrix_flag.keys()),
sorted(gpu_flag.keys()),
sorted(gpu_id_flag.keys()),
sorted(qr_flag.keys()),
sorted(test_type_flag.keys()),
sorted(prec_flag.keys()),
......@@ -68,6 +73,9 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
sorted(layout_flag.keys()),
sorted(split_comm_flag.keys())):
if gid == 1 and (g == 0 ):
continue
if lang == "C" and (m == "analytic" or m == "toeplitz" or m == "frank" or lay == "all_layouts"):
continue
......@@ -178,11 +186,12 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
raise Exception("Oh no!")
endifs += 1
name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
name = "validate{langsuffix}_{d}_{p}_{t}_{s}{kernelsuffix}_{gpusuffix}{gpuidsuffix}{qrsuffix}{m}{layoutsuffix}{spl}".format(
langsuffix=language_flag[lang],
d=d, p=p, t=t, s=s,
kernelsuffix="" if kernel == "nokernel" else "_" + kernel,
gpusuffix="gpu_" if g else "",
gpuidsuffix="set_gpu_id_" if gid else "",
qrsuffix="qr_" if q else "",
m=m,
layoutsuffix="_all_layouts" if lay == "all_layouts" else "",
......@@ -227,6 +236,7 @@ for lang, m, g, q, t, p, d, s, lay, spl in product(sorted(language_flag.keys()),
test_type_flag[t],
solver_flag[s],
gpu_flag[g],
gpu_id_flag[gid],
qr_flag[q],
matrix_flag[m]] + extra_flags))
......
......@@ -62,7 +62,7 @@ module mod_check_for_gpu
integer(kind=ik), intent(out) :: numberOfDevices
integer(kind=ik) :: deviceNumber, mpierr, maxNumberOfDevices
logical :: gpuAvailable
integer(kind=ik) :: error, mpi_comm_all
integer(kind=ik) :: error, mpi_comm_all, use_gpu_id, min_use_gpu_id
!character(len=1024) :: envname
if (.not.(present(wantDebug))) then
......@@ -83,42 +83,24 @@ module mod_check_for_gpu
stop
endif
if (cublasHandle .ne. -1) then
gpuAvailable = .true.
numberOfDevices = -1
if (myid == 0 .and. wantDebugMessage) then
print *, "Skipping GPU init, should have already been initialized "
endif
return
else
if (myid == 0 .and. wantDebugMessage) then
print *, "Initializing the GPU devices"
if (obj%is_set("use_gpu_id") == 1) then
call obj%get("use_gpu_id", use_gpu_id, error)
if (use_gpu_id == -99) then
print *,"Problem you did not set which gpu id this task should use"
endif
endif
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices)
if (.not.(success)) then
print *,"error in cuda_getdevicecount"
stop 1
endif
! make sure that all nodes have the same number of GPUs, otherwise
! we run into load-balancing trouble
! check whether a GPU id has been set for each process
#ifdef WITH_MPI
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, mpi_comm_all, mpierr)
! use MPI_MIN so that a task with an unset (negative) GPU id is detected
call mpi_allreduce(use_gpu_id, min_use_gpu_id, 1, MPI_INTEGER, MPI_MIN, mpi_comm_all, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
print *,"Different number of GPU devices on MPI tasks!"
print *,"GPUs will NOT be used!"
gpuAvailable = .false.
return
endif
if (min_use_gpu_id .lt. 0) then
print *,"Not all tasks have set which GPU id should be used"
print *,"GPUs will NOT be used!"
gpuAvailable = .false.
return
endif
#endif
if (numberOfDevices .ne. 0) then
gpuAvailable = .true.
! Usage of GPU is possible since devices have been detected
if (myid==0) then
if (wantDebugMessage) then
......@@ -127,8 +109,7 @@ module mod_check_for_gpu
endif
endif
deviceNumber = mod(myid, numberOfDevices)
success = cuda_setdevice(deviceNumber)
success = cuda_setdevice(use_gpu_id)
if (.not.(success)) then
print *,"Cannot set CudaDevice"
......@@ -137,14 +118,77 @@ module mod_check_for_gpu
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', use_gpu_id
endif
success = cublas_create(cublasHandle)
if (.not.(success)) then
print *,"Cannot create cublas handle"
stop 1
endif
endif
else
if (cublasHandle .ne. -1) then
gpuAvailable = .true.
numberOfDevices = -1
if (myid == 0 .and. wantDebugMessage) then
print *, "Skipping GPU init, should have already been initialized "
endif
return
else
if (myid == 0 .and. wantDebugMessage) then
print *, "Initializing the GPU devices"
endif
endif
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices)
if (.not.(success)) then
print *,"error in cuda_getdevicecount"
stop 1
endif
! make sure that all nodes have the same number of GPUs, otherwise
! we run into load-balancing trouble
#ifdef WITH_MPI
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, mpi_comm_all, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
print *,"Different number of GPU devices on MPI tasks!"
print *,"GPUs will NOT be used!"
gpuAvailable = .false.
return
endif
#endif
if (numberOfDevices .ne. 0) then
gpuAvailable = .true.
! Usage of GPU is possible since devices have been detected
if (myid==0) then
if (wantDebugMessage) then
print *
print '(3(a,i0))','Found ', numberOfDevices, ' GPUs'
endif
endif
deviceNumber = mod(myid, numberOfDevices)
success = cuda_setdevice(deviceNumber)
if (.not.(success)) then
print *,"Cannot set CudaDevice"
stop 1
endif
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif
success = cublas_create(cublasHandle)
if (.not.(success)) then
print *,"Cannot create cublas handle"
stop 1
endif
endif
endif
end function
end module
......@@ -57,7 +57,7 @@
#include "cuUtils_template.cu"
#undef DOUBLE_PRECISION_REAL
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#include "cuUtils_template.cu"
......@@ -71,7 +71,7 @@
#include "cuUtils_template.cu"
#undef DOUBLE_PRECISION_COMPLEX
#if WANT_SINGLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#include "cuUtils_template.cu"
......
extern "C" {
int gpu_count() {
int count;
cudaError_t cuerr = cudaGetDeviceCount(&count);
if (cuerr != cudaSuccess) {
count = -1000;
}
return count;
}
}
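The gpu_count() helper added above returns -1000 when cudaGetDeviceCount fails and the plain device count otherwise. A hedged sketch of how a C caller might use it; only the function name is taken from the hunk above, the rest (prototype placement, messages) is illustrative:

  #include <stdio.h>

  /* prototype assumed from the definition above (it is compiled with C linkage) */
  int gpu_count(void);

  int main(void) {
    int devices = gpu_count();
    if (devices <= 0) {
      /* -1000 signals a CUDA error, 0 means no device was detected */
      fprintf(stderr, "no usable GPU (gpu_count returned %d)\n", devices);
      return 1;
    }
    printf("found %d GPU(s)\n", devices);
    return 0;
  }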
......@@ -143,6 +143,16 @@
!> print *,"Could not setup ELPA object"
!> endif
!>
!> ! settings for GPU
!> call elpaInstance%set("gpu", 1, success) ! 1=on, 2=off
!> ! in case of GPU usage you have the choice whether ELPA
!> ! should automatically assign each MPI task to a certain GPU
!> ! (this is default) or whether you want to set this assignment
!> ! for _each_ task yourself
!> ! set assignment your self (only using one task here and assigning it
!> ! to GPU id 1)
!> if (my_rank .eq. 0) call elpaInstance%set("use_gpu_id", 1, success)
!>
!> ! if desired, set tunable run-time options
!> ! here we want to use the 2-stage solver
!> call elpaInstance%set("solver", ELPA_SOLVER_2STAGE, success)
......@@ -206,6 +216,16 @@
!> /* here we want to use the 2-stage solver */
!> elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error);
!>
!> /* settings for GPU */
!> elpa_set(handle, "gpu", 1, &error); /* 1=on, 2=off */
!> /* in case of GPU usage you have the choice whether ELPA
!> should automatically assign each MPI task to a certain GPU
!> (this is default) or whether you want to set this assignment
!> for _each_ task yourself
!> set assignment your self (only using one task here and assigning it
!> to GPU id 1) */
!> if (my_rank == 0) elpa_set(handle, "use_gpu_id", 1, &error);
!>
!> elpa_set(handle,"real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, &error);
!> \endcode
!> ... set and get all other options that are desired
......
......@@ -137,7 +137,7 @@ module elpa1_auxiliary_impl
#undef DOUBLE_PRECISION
#undef REALCASE
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
......@@ -287,7 +287,7 @@ module elpa1_auxiliary_impl
#undef DOUBLE_PRECISION
#undef REALCASE
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
......
......@@ -279,8 +279,6 @@ function elpa_solve_evp_&
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
call omp_set_num_threads(omp_threads_caller)
#endif
call obj%timer%stop("elpa_solve_evp_&
......@@ -356,6 +354,18 @@ function elpa_solve_evp_&
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case, at the moment only _1_ internal thread is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: using OpenMP together with the GPU code paths requires ELPA to use only _1_ OpenMP thread internally"
print *,"setting 1 OpenMP thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
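Given the restriction above (only one internal OpenMP thread is allowed while the GPU path is active), a caller can avoid the runtime warning by pinning the thread count up front. A hedged C sketch, assuming an OpenMP-enabled build in which the "omp_threads" key is settable and an already initialized handle as in the earlier example:

  /* sketch only: "handle" and "error" are assumed to exist as in the earlier example */
  elpa_set(handle, "gpu", 1, &error);          /* enable the GPU code paths */
  elpa_set(handle, "omp_threads", 1, &error);  /* GPU path currently supports only one internal thread */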
......@@ -589,8 +599,6 @@ function elpa_solve_evp_&
#endif
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
call omp_set_num_threads(omp_threads_caller)
#endif
......
......@@ -612,26 +612,28 @@ subroutine tridiag_&
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP_TRADITIONAL
if (mod(n_iter,n_threads) == my_thread) then
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
if (i/=j) then
if (isSkewsymmetric) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
else
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
if (.not. useGPU) then
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
if (i/=j) then
if (isSkewsymmetric) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
else
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
endif
endif
endif
if (wantDebug) call obj%timer%stop("blas")
......@@ -750,11 +752,12 @@ subroutine tridiag_&
#ifdef WITH_OPENMP_TRADITIONAL
!$OMP END PARALLEL
call obj%timer%stop("OpenMP parallel")
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
enddo
if (.not.(useGPU)) then
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
enddo
endif
#endif /* WITH_OPENMP_TRADITIONAL */
! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
......
......@@ -134,8 +134,11 @@ subroutine elpa_reduce_add_vectors_&
aux2(:) = 0
#ifdef WITH_OPENMP_TRADITIONAL
!call omp_set_num_threads(nrThreads)
!$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
!$omp parallel &
!$omp default(none) &
!$omp private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) &
!$omp shared(nps, npt, lcm_s_t, nblk, vmat_t, vmat_s, myps, mypt, mpierr, obj, &
!$omp& comm_t, nblks_tot, aux2, aux1, nvr, nvc)
#endif
do n = 0, lcm_s_t-1
......
......@@ -148,7 +148,11 @@ subroutine ROUTINE_NAME&
allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
!$omp parallel &
!$omp default(none) &
!$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
!$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
!$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
#endif
do n = 0, lcm_s_t-1
......
......@@ -131,7 +131,11 @@ subroutine elpa_transpose_vectors_ss_&
allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
!$omp parallel &
!$omp default(none) &
!$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
!$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
!$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
#endif
do n = 0, lcm_s_t-1
......
......@@ -58,11 +58,10 @@ l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
#ifdef WITH_OPENMP_TRADITIONAL
my_thread, thread_width, &
my_thread, thread_width, kernel, last_stripe_width)
#else
last_stripe_width, &
last_stripe_width, kernel)
#endif
kernel)
use precision
use elpa_abstract_impl
......@@ -141,6 +140,7 @@ kernel)
#else /* WITH_OPENMP_TRADITIONAL */
integer(kind=ik), intent(in) :: l_nev, thread_width
integer(kind=ik), intent(in), optional :: last_stripe_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
......@@ -221,54 +221,39 @@ kernel)
#ifdef WITH_OPENMP_TRADITIONAL
if (my_thread==1) then
if (my_thread==1) then ! in the calling routine threads go from 1 .. max_threads
#endif
ttt = mpi_wtime()
#ifdef WITH_OPENMP_TRADITIONAL
endif
#endif
#ifdef WITH_OPENMP_TRADITIONAL
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#endif /* WITH_OPENMP_TRADITIONAL */
#ifndef WITH_OPENMP_TRADITIONAL
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else /* WITH_OPENMP_TRADITIONAL */
if (istripe<stripe_count) then
nl = stripe_width
if (present(last_stripe_width)) then
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
else
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then
if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
if (istripe<stripe_count) then
nl = stripe_width
else
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then
if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP_TRADITIONAL
&_openmp" // &
&_openmp" // &
#else
&" // &
&" // &
#endif
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX &
)
return
return
endif
endif
endif
#endif /* not WITH_OPENMP_TRADITIONAL */
......
......@@ -629,65 +629,65 @@ max_threads)
aux1 = 0.0_rck
#ifdef WITH_OPENMP_TRADITIONAL
#if 0
! original complex implementation without openmp. check performance
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! Transform