Commit 74a07ae8 authored by Andreas Marek's avatar Andreas Marek
Browse files

Unify real/complex versions of elpa2_template

Due to a bug in the Intel compiler two stages could not be unified
parent 289c35ac
#if REALCASE == 1 function solve_evp_&
function solve_evp_real_2stage_PRECISION (na, nev, a, lda, ev, q, ldq, nblk, & &MATH_DATATYPE&
matrixCols, mpi_comm_rows, mpi_comm_cols, & &_&
mpi_comm_all, THIS_ELPA_KERNEL_API, useQR, useGPU, bandwidth) result(success) &2stage_&
#elif COMPLEXCASE == 1 &PRECISION &
function solve_evp_complex_2stage_PRECISION(na, nev, a, lda, ev, q, ldq, nblk, & (na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
matrixCols, mpi_comm_rows, mpi_comm_cols, & #if REALCASE == 1
mpi_comm_all, THIS_ELPA_KERNEL_API, useGPU, bandwidth) result(success) THIS_ELPA_KERNEL_API, useQR, &
#endif
#if COMPLEXCASE == 1
THIS_ELPA_KERNEL_API, &
#endif #endif
useGPU, bandwidth) result(success)
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else #else
...@@ -23,7 +28,7 @@ ...@@ -23,7 +28,7 @@
logical, intent(in), optional :: useGPU logical, intent(in), optional :: useGPU
#if REALCASE == 1 #if REALCASE == 1
logical, intent(in), optional :: useQR logical, intent(in), optional :: useQR
#endif #endif
logical :: useQRActual, useQREnvironment logical :: useQRActual, useQREnvironment
integer(kind=c_int), intent(in), optional :: bandwidth integer(kind=c_int), intent(in), optional :: bandwidth
...@@ -62,7 +67,11 @@ ...@@ -62,7 +67,11 @@
logical :: do_useGPU logical :: do_useGPU
integer(kind=c_int) :: numberOfGPUDevices integer(kind=c_int) :: numberOfGPUDevices
call timer%start(solve_evp_NUMBER_2stage_PRECISION_STR) call timer%start("solve_evp_&
&MATH_DATATYPE&
&_2stage_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr) call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
...@@ -108,7 +117,7 @@ ...@@ -108,7 +117,7 @@
return return
endif endif
endif endif
#endif #endif /* REALCASE */
if (present(useGPU)) then if (present(useGPU)) then
if (useGPU) then if (useGPU) then
...@@ -151,34 +160,72 @@ ...@@ -151,34 +160,72 @@
if (my_pe == 0) then if (my_pe == 0) then
write(error_unit,*) " " write(error_unit,*) " "
write(error_unit,*) "The choosen kernel ",UPCASENUMBER_ELPA_KERNEL_NAMES(THIS_ELPA_KERNEL) write(error_unit,*) "The choosen kernel ", &
&MATH_DATATYPE&
&_ELPA_KERNEL_NAMES(THIS_ELPA_KERNEL)
write(error_unit,*) "is not in the list of the allowed kernels!" write(error_unit,*) "is not in the list of the allowed kernels!"
write(error_unit,*) " " write(error_unit,*) " "
write(error_unit,*) "Allowed kernels are:" write(error_unit,*) "Allowed kernels are:"
do i=1,size(UPCASENUMBER_ELPA_KERNEL_NAMES(:)) do i=1,size( &
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(i) .ne. 0) then &MATH_DATATYPE&
write(error_unit,*) UPCASENUMBER_ELPA_KERNEL_NAMES(i) &_ELPA_KERNEL_NAMES(:))
if (AVAILABLE_&
&MATH_DATATYPE&
&_ELPA_KERNELS(i) .ne. 0) then
write(error_unit,*) &
#if REALCASE == 1
REAL_ELPA_KERNEL_NAMES(i)
#endif
#if COMPLEXCASE == 1
COMPLEX_ELPA_KERNEL_NAMES(i)
#endif
endif endif
enddo enddo
write(error_unit,*) " " write(error_unit,*) " "
! check whether generic kernel is defined ! check whether generic kernel is defined
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(UPCASENUMBER_ELPA_KERNEL_GENERIC) .eq. 1) then if (AVAILABLE_&
write(error_unit,*) "The default kernel NUMBER_ELPA_KERNEL_GENERIC will be used !" &MATH_DATATYPE&
&_ELPA_KERNELS( &
#if REALCASE == 1
REAL_ELPA_KERNEL_GENERIC ) .eq. 1) then
#endif
#if COMPLEXCASE == 1
COMPLEX_ELPA_KERNEL_GENERIC ) .eq. 1) then
#endif
write(error_unit,*) "The default kernel &
&MATH_DATATYPE&
&( &
&_ELPA_KERNEL_GENERIC will be used !"
else else
write(error_unit,*) "As default kernel ",UPCASENUMBER_ELPA_KERNEL_NAMES(DEFAULT_UPCASENUMBER_ELPA_KERNEL)," will be used" write(error_unit,*) "As default kernel ", &
&MATH_DATATYPE&
&_ELPA_KERNEL_NAMES(DEFAULT_&
&MATH_DATATYPE&
&_ELPA_KERNEL)," will be used"
endif endif
endif ! my_pe == 0 endif ! my_pe == 0
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(UPCASENUMBER_ELPA_KERNEL_GENERIC) .eq. 1) then if (AVAILABLE_&
THIS_ELPA_KERNEL = UPCASENUMBER_ELPA_KERNEL_GENERIC &MATH_DATATYPE&
&_ELPA_KERNELS( &
&MATH_DATATYPE&
&_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_ELPA_KERNEL = &
&MATH_DATATYPE&
&_ELPA_KERNEL_GENERIC
else else
THIS_ELPA_KERNEL = DEFAULT_UPCASENUMBER_ELPA_KERNEL THIS_ELPA_KERNEL = DEFAULT_&
&MATH_DATATYPE&
&_ELPA_KERNEL
endif endif
endif endif
! check consistency between request for GPUs and defined kernel ! check consistency between request for GPUs and defined kernel
if (do_useGPU) then if (do_useGPU) then
if (THIS_ELPA_KERNEL .ne. UPCASENUMBER_ELPA_KERNEL_GPU) then if (THIS_ELPA_KERNEL .ne. &
&MATH_DATATYPE&
&_ELPA_KERNEL_GPU) then
write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..." write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
success = .false. success = .false.
return return
...@@ -222,12 +269,14 @@ ...@@ -222,12 +269,14 @@
nbw = (31/nblk+1)*nblk nbw = (31/nblk+1)*nblk
#endif #endif
endif endif
num_blocks = (na-1)/nbw + 1 num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage) allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when allocating tmat "//errorMessage print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage_PRECISION " // ": error when allocating tmat "//errorMessage
stop stop
endif endif
...@@ -235,17 +284,29 @@ ...@@ -235,17 +284,29 @@
ttt0 = MPI_Wtime() ttt0 = MPI_Wtime()
ttts = ttt0 ttts = ttt0
call bandred_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, a, &
#if REALCASE == 1 #if REALCASE == 1
call bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & a_dev, &
tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual) #endif
#elif COMPLEXCASE == 1 lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
call bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & #if REALCASE == 1
tmat, wantDebug, do_useGPU, success) tmat_dev, &
#endif #endif
wantDebug, do_useGPU, success &
#if REALCASE == 1
, useQRActual &
#endif
)
if (.not.(success)) return if (.not.(success)) return
ttt1 = MPI_Wtime() ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//bandred_NUMBER_PRECISION_STR//' :',ttt1-ttt0 write(error_unit,*) "Time " // "bandred_&
&MATH_DATATYPE&
&PRECISION " // " :",ttt1-ttt0
end if ! matrix not already banded on input end if ! matrix not already banded on input
...@@ -253,18 +314,27 @@ ...@@ -253,18 +314,27 @@
allocate(e(na), stat=istat, errmsg=errorMessage) allocate(e(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when allocating e "//errorMessage print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage_&
&PRECISION " // ": error when allocating e "//errorMessage
stop stop
endif endif
ttt0 = MPI_Wtime() ttt0 = MPI_Wtime()
call tridiag_band_NUMBER_PRECISION(na, nbw, nblk, a, lda, ev, e, matrixCols, & call tridiag_band_&
hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all) &MATH_DATATYPE&
&_&
&PRECISION&
(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime() ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//tridiag_band_NUMBER_PRECISION_STR//' :',ttt1-ttt0 write(error_unit,*) "Time " // "tridiag_band_&
&MATH_DATATYPE&
&_&
&PRECISION " // " :",ttt1-ttt0
#ifdef WITH_MPI #ifdef WITH_MPI
call timer%start("mpi_communication") call timer%start("mpi_communication")
...@@ -275,28 +345,34 @@ ...@@ -275,28 +345,34 @@
ttt1 = MPI_Wtime() ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts time_evp_fwd = ttt1-ttts
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage) allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when allocating q_real"//errorMessage
stop stop
endif endif
#endif #endif
! Solve tridiagonal system ! Solve tridiagonal system
ttt0 = MPI_Wtime() ttt0 = MPI_Wtime()
call solve_tridi_&
&PRECISION &
(na, nev, ev, e, &
#if REALCASE == 1 #if REALCASE == 1
call solve_tridi_PRECISION(na, nev, ev, e, q, ldq, nblk, matrixCols, & q, ldq, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#elif COMPLEXCASE == 1
call solve_tridi_PRECISION(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#endif #endif
#if COMPLEXCASE == 1
q_real, ubound(q_real,dim=1), &
#endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
if (.not.(success)) return if (.not.(success)) return
ttt1 = MPI_Wtime() ttt1 = MPI_Wtime()
...@@ -305,78 +381,112 @@ ...@@ -305,78 +381,112 @@
time_evp_solve = ttt1-ttt0 time_evp_solve = ttt1-ttt0
ttts = ttt1 ttts = ttt1
#if REALCASE == 1
deallocate(e, stat=istat, errmsg=errorMessage) deallocate(e, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when deallocating e "//errorMessage
stop stop
endif endif
#elif COMPLEXCASE == 1
#if COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
deallocate(e, q_real, stat=istat, errmsg=errorMessage) deallocate(q_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when deallocating q_real"//errorMessage
stop stop
endif endif
#endif #endif
! Backtransform stage 1 ! Backtransform stage 1
ttt0 = MPI_Wtime() ttt0 = MPI_Wtime()
call trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nev, nblk, nbw, q, &
#if REALCASE == 1 #if REALCASE == 1
call trans_ev_tridi_to_band_real_PRECISION(na, nev, nblk, nbw, q, q_dev, ldq, & q_dev, &
matrixCols, hh_trans, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, do_useGPU, success, THIS_ELPA_KERNEL)
#elif COMPLEXCASE == 1
call trans_ev_tridi_to_band_complex_PRECISION(na, nev, nblk, nbw, q, ldq, &
matrixCols, hh_trans, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, do_useGPU, success,THIS_ELPA_KERNEL)
#endif #endif
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU, &
success, THIS_ELPA_KERNEL)
if (.not.(success)) return if (.not.(success)) return
ttt1 = MPI_Wtime() ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//trans_ev_tridi_to_band_NUMBER_PRECISION_STR//' :',ttt1-ttt0 write(error_unit,*) "Time " // "trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_&
&PRECISION " // " :",ttt1-ttt0
! We can now deallocate the stored householder vectors ! We can now deallocate the stored householder vectors
deallocate(hh_trans, stat=istat, errmsg=errorMessage) deallocate(hh_trans, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *, solve_evp_NUMBER_2stage_PRECISION_STR//": error when deallocating hh_trans "//errorMessage print *, "solve_evp_&
&MATH_DATATYPE&
&_2stage_&
&PRECISION " // ": error when deallocating hh_trans "//errorMessage
stop stop
endif endif
if(present(bandwidth)) then if(present(bandwidth)) then
time_evp_back = ttt1-ttts time_evp_back = ttt1-ttts
else else
! Backtransform stage 2 ! Backtransform stage 2
ttt0 = MPI_Wtime() ttt0 = MPI_Wtime()
call trans_ev_band_to_full_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nev, nblk, nbw, a, &
#if REALCASE == 1 #if REALCASE == 1
call trans_ev_band_to_full_real_PRECISION(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, & a_dev, &
q, q_dev, ldq, matrixCols, num_blocks, & #endif
mpi_comm_rows, mpi_comm_cols, do_useGPU, useQRActual) lda, tmat, &
#elif COMPLEXCASE == 1 #if REALCASE == 1
call trans_ev_band_to_full_complex_PRECISION(na, nev, nblk, nbw, a, lda, tmat, & tmat_dev, &
q, ldq, matrixCols, num_blocks, & #endif
mpi_comm_rows, mpi_comm_cols, do_useGPU) q, &
#if REALCASE == 1
q_dev, &
#endif #endif
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU &
#if REALCASE == 1
, useQRActual &
#endif
)
ttt1 = MPI_Wtime() ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//trans_ev_band_to_full_NUMBER_PRECISION_STR//' :',ttt1-ttt0 write(error_unit,*) "Time " // "trans_ev_band_to_full_&
&MATH_DATATYPE&
&_&
&PRECISION " // " :",ttt1-ttt0
time_evp_back = ttt1-ttts time_evp_back = ttt1-ttts
deallocate(tmat, stat=istat, errmsg=errorMessage) deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when deallocating tmat"//errorMessage print *,"solve_evp_&
&MATH_DATATYPE&
_2stage_&
&PRECISION " // ": error when deallocating tmat"//errorMessage
stop stop
endif endif
endif ! not present(bandwidth) endif ! not present(bandwidth)
call timer%stop(solve_evp_NUMBER_2stage_PRECISION_STR) call timer%stop("solve_evp_&
&MATH_DATATYPE&
&_2stage_" // &
&PRECISION_SUFFIX &
)
1 format(a,f10.3) 1 format(a,f10.3)
end function solve_evp_NUMBER_2stage_PRECISION end function solve_evp_&
&MATH_DATATYPE&
&_2stage_&
&PRECISION
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment