Commit 20892fc3 authored by Pavel Kus's avatar Pavel Kus

repated code from elpa2.F90 moved to elpa2_template.X90

parent 263b7c83
......@@ -52,6 +52,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
......@@ -903,6 +904,7 @@ EXTRA_DIST = \
src/elpa2_bandred_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
......
......@@ -5,7 +5,6 @@ simple_tokens = [
"PRECISION",
"elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION",
"tridiag_band_NUMBER_PRECISION",
......@@ -16,7 +15,7 @@ simple_tokens = [
"solve_tridi_PRECISION",
"solve_tridi_col_PRECISION",
"solve_tridi_single_problem_PRECISION",
"solve_evp_NUMBER_2stage_PRECISION",
"qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION",
......@@ -53,6 +52,11 @@ simple_tokens = [
"launch_my_unpack_c_kernel_NUMBER_PRECISION",
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION",
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION",
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS",
"UPCASENUMBER_ELPA_KERNEL_GENERIC",
"DEFAULT_UPCASENUMBER_ELPA_KERNEL",
"UPCASENUMBER_ELPA_KERNEL_NAMES",
"UPCASENUMBER_ELPA_KERNEL_GPU",
]
blas_tokens = [
......@@ -100,6 +104,7 @@ explicit_tokens_complex = [
("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
("CONST_COMPLEX_1_0", "1.0_ck8", "1.0_ck4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
("C_DATATYPE_KIND", "c_double", "c_float"),
]
explicit_tokens_real = [
......@@ -111,6 +116,7 @@ explicit_tokens_real = [
("CONST_8_0", "8.0_rk8", "8.0_rk4"),
("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("C_DATATYPE_KIND", "c_double", "c_float"),
]
......@@ -119,6 +125,7 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def print_variant(number, precision, explicit):
for token in simple_tokens:
print "#define ", token, token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number)
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)])
......@@ -127,6 +134,7 @@ def print_variant(number, precision, explicit):
def print_undefs(number, explicit):
for token in simple_tokens:
print "#undef ", token
print "#undef ", token.replace("NUMBER", number)
for token in blas_tokens:
print "#undef ", token
......
This diff is collapsed.
#if REALCASE == 1
function solve_evp_real_2stage_PRECISION (na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_ELPA_KERNEL_API, useQR, useGPU) result(success)
#elif COMPLEXCASE == 1
function solve_evp_complex_2stage_PRECISION(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_ELPA_KERNEL_API, useGPU) result(success)
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa1_utilities, only : gpu_usage_via_environment_variable
use elpa1_compute
use elpa2_compute
use elpa_mpi
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
implicit none
logical, intent(in), optional :: useGPU
#if REALCASE == 1
logical, intent(in), optional :: useQR
#endif
logical :: useQRActual, useQREnvironment
integer(kind=c_int), intent(in), optional :: THIS_ELPA_KERNEL_API
integer(kind=c_int) :: THIS_ELPA_KERNEL
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
mpi_comm_cols, mpi_comm_all
integer(kind=c_int), intent(in) :: nblk
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*), q(ldq,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=c_int) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:)
real(kind=C_DATATYPE_KIND), allocatable :: e(:)
#if COMPLEXCASE == 1
real(kind=C_DATATYPE_KIND), allocatable :: q_real(:,:)
#endif
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=c_int) :: i
logical :: success
logical, save :: firstCall = .true.
logical :: wantDebug
integer(kind=c_int) :: istat
character(200) :: errorMessage
logical :: do_useGPU
integer(kind=c_int) :: numberOfGPUDevices
call timer%start("solve_evp_&
&MATH_DATATYPE&
&_2stage_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call timer%stop("mpi_communication")
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
success = .true.
do_useGPU = .false.
#if REALCASE == 1
useQRActual = .false.
! set usage of qr decomposition via API call
if (present(useQR)) then
if (useQR) useQRActual = .true.
if (.not.(useQR)) useQRACtual = .false.
endif
! overwrite this with environment variable settings
if (qr_decomposition_via_environment_variable(useQREnvironment)) then
useQRActual = useQREnvironment
endif
if (useQRActual) then
if (mod(na,2) .ne. 0) then
if (wantDebug) then
write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
endif
print *, "Do not use QR-decomposition for this matrix and blocksize."
success = .false.
return
endif
endif
#endif
if (present(useGPU)) then
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
if (present(THIS_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_ELPA_KERNEL = THIS_ELPA_KERNEL_API
else
! if kernel is not choosen via api
! check whether set by environment variable
THIS_ELPA_KERNEL = get_actual_&
&MATH_DATATYPE&
&_kernel()
endif
! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_&
&MATH_DATATYPE&
&_kernels(THIS_ELPA_KERNEL)) then
if (my_pe == 0) then
write(error_unit,*) " "
write(error_unit,*) "The choosen kernel ",UPCASENUMBER_ELPA_KERNEL_NAMES(THIS_ELPA_KERNEL)
write(error_unit,*) "is not in the list of the allowed kernels!"
write(error_unit,*) " "
write(error_unit,*) "Allowed kernels are:"
do i=1,size(UPCASENUMBER_ELPA_KERNEL_NAMES(:))
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(i) .ne. 0) then
write(error_unit,*) UPCASENUMBER_ELPA_KERNEL_NAMES(i)
endif
enddo
write(error_unit,*) " "
! check whether generic kernel is defined
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(UPCASENUMBER_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel NUMBER_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",UPCASENUMBER_ELPA_KERNEL_NAMES(DEFAULT_UPCASENUMBER_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(UPCASENUMBER_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_ELPA_KERNEL = UPCASENUMBER_ELPA_KERNEL_GENERIC
else
THIS_ELPA_KERNEL = DEFAULT_UPCASENUMBER_ELPA_KERNEL
endif
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
if (THIS_ELPA_KERNEL .ne. UPCASENUMBER_ELPA_KERNEL_GPU) then
write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
success = .false.
return
endif
endif
if (do_useGPU) then
if (nblk .ne. 128) then
write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
success = .false.
return
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
! on this and maybe allow a run-time optimization here
if (do_useGPU) then
nbw = nblk
else
#if REALCASE == 1
nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
nbw = (31/nblk+1)*nblk
#endif
endif
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when allocating tmat "//errorMessage
stop
endif
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
#if REALCASE == 1
call bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#elif COMPLEXCASE == 1
call bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, do_useGPU, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time bandred_&
&MATH_DATATYPE&
& :',ttt1-ttt0
! Reduction band -> tridiagonal
allocate(e(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when allocating e "//errorMessage
stop
endif
ttt0 = MPI_Wtime()
call tridiag_band_NUMBER_PRECISION(na, nbw, nblk, a, lda, ev, e, matrixCols, &
hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_&
&MATH_DATATYPE&
& :',ttt1-ttt0
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
#if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
stop
endif
#endif
! Solve tridiagonal system
ttt0 = MPI_Wtime()
#if REALCASE == 1
call solve_tridi_PRECISION(na, nev, ev, e, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#elif COMPLEXCASE == 1
call solve_tridi_PRECISION(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
#if REALCASE == 1
deallocate(e, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
stop
endif
#elif COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
deallocate(e, q_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
stop
endif
#endif
! Backtransform stage 1
ttt0 = MPI_Wtime()
#if REALCASE == 1
call trans_ev_tridi_to_band_real_PRECISION(na, nev, nblk, nbw, q, q_dev, ldq, &
matrixCols, hh_trans, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, do_useGPU, success, THIS_ELPA_KERNEL)
#elif COMPLEXCASE == 1
call trans_ev_tridi_to_band_complex_PRECISION(na, nev, nblk, nbw, q, ldq, &
matrixCols, hh_trans, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, do_useGPU, success,THIS_ELPA_KERNEL)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&:',ttt1-ttt0
! We can now deallocate the stored householder vectors
deallocate(hh_trans, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when deallocating hh_trans "//errorMessage
stop
endif
! Backtransform stage 2
ttt0 = MPI_Wtime()
#if REALCASE == 1
call trans_ev_band_to_full_real_PRECISION(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, &
q, q_dev, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU, useQRActual)
#elif COMPLEXCASE == 1
call trans_ev_band_to_full_complex_PRECISION(na, nev, nblk, nbw, a, lda, tmat, &
q, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_band_to_full_&
&MATH_DATATYPE&
& :',ttt1-ttt0
time_evp_back = ttt1-ttts
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when deallocating tmat"//errorMessage
stop
endif
call timer%stop("solve_evp_real_2stage_&
&MATH_DATATYPE&
&")
1 format(a,f10.3)
end function solve_evp_NUMBER_2stage_PRECISION
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment