Commit 77cd3307 authored by Andreas Marek

Merge branch 'master' of gitlab.mpcdf.mpg.de:elpa/elpa

parents c2b2dfa9 768db157
......@@ -84,9 +84,10 @@ enum ELPA_COMPLEX_KERNELS {
X(ELPA_OK, 0) \
X(ELPA_ERROR, -1) \
X(ELPA_ERROR_ENTRY_NOT_FOUND, -2) \
X(ELPA_ERROR_INVALID_VALUE, -3) \
X(ELPA_ERROR_VALUE_ALREADY_SET, -4) \
X(ELPA_ERROR_NO_STRING_REPRESENTATION, -5)
X(ELPA_ERROR_ENTRY_INVALID_VALUE, -3) \
X(ELPA_ERROR_ENTRY_ALREADY_SET, -4) \
X(ELPA_ERROR_ENTRY_NO_STRING_REPRESENTATION, -5) \
X(ELPA_ERROR_ENTRY_READONLY, -6)
enum ELPA_ERRORS {
ELPA_FOR_ALL_ERRORS(ELPA_ENUM_ENTRY)
......
......@@ -58,8 +58,7 @@ function elpa_solve_evp_&
&MATH_DATATYPE&
&_1stage_&
&PRECISION&
&_impl (obj, a, ev, q, time_evp_fwd, &
time_evp_solve, time_evp_back) result(success)
&_impl (obj, a, ev, q) result(success)
use precision
use cuda_functions
use mod_check_for_gpu
......@@ -105,15 +104,11 @@ function elpa_solve_evp_&
logical :: useGPU
logical :: success
real(kind=c_double) :: time_evp_fwd, &
time_evp_solve, time_evp_back
logical :: summary_timings
logical :: do_useGPU
integer(kind=ik) :: numberOfGPUDevices
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, mpierr
real(kind=C_DATATYPE_KIND), allocatable :: e(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: wantDebug
integer(kind=c_int) :: istat
character(200) :: errorMessage
......@@ -142,11 +137,11 @@ function elpa_solve_evp_&
else
useGPU = .false.
endif
if (obj%get("summary_timings") .eq. 1) then
summary_timings = .true.
else
summary_timings = .false.
endif
! if (obj%get("summary_timings") .eq. 1) then
! summary_timings = .true.
! else
! summary_timings = .false.
! endif
call timer%start("mpi_communication")
......@@ -228,17 +223,18 @@ function elpa_solve_evp_&
&" // ": error when allocating e, tau "//errorMessage
stop 1
endif
ttt0 = MPI_Wtime()
! ttt0 = MPI_Wtime()
call tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. summary_timings) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
time_evp_fwd = ttt1-ttt0
!ttt1 = MPI_Wtime()
!if(my_prow==0 .and. my_pcol==0 .and. summary_timings) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
!time_evp_fwd = ttt1-ttt0
ttt0 = MPI_Wtime()
!ttt0 = MPI_Wtime()
call solve_tridi_&
&PRECISION&
& (na, nev, ev, e, &
......@@ -251,11 +247,6 @@ function elpa_solve_evp_&
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. summary_timings) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttt0 = MPI_Wtime()
#if COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
#endif
......@@ -264,9 +255,9 @@ function elpa_solve_evp_&
&_&
&PRECISION&
& (na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. summary_timings) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
time_evp_back = ttt1-ttt0
!ttt1 = MPI_Wtime()
!if(my_prow==0 .and. my_pcol==0 .and. summary_timings) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
!time_evp_back = ttt1-ttt0
#if COMPLEXCASE == 1
deallocate(q_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......
......@@ -163,13 +163,11 @@ function elpa_solve_evp_&
endif
endif
if (elpa_print_times) then
call e%set("summary_timings", 1,successInternal)
if (successInternal .ne. ELPA_OK) then
print *, "Cannot set summary_timings"
success = .false.
return
endif
call e%set("summary_timings", 1,successInternal)
if (successInternal .ne. ELPA_OK) then
print *, "Cannot set summary_timings"
success = .false.
return
endif
call e%solve(a(1:lda,1:matrixCols), ev, q(1:ldq,1:matrixCols), successInternal)
......@@ -180,11 +178,10 @@ function elpa_solve_evp_&
return
endif
if (elpa_print_times) then
time_evp_fwd = e%get_double("time_evp_fwd")
time_evp_solve = e%get_double("time_evp_solve")
time_evp_back = e%get_double("time_evp_back")
endif
time_evp_fwd = e%get_double("time_evp_fwd")
time_evp_solve = e%get_double("time_evp_solve")
time_evp_back = e%get_double("time_evp_back")
call elpa_deallocate(e)
call elpa_uninit()
......
......@@ -65,12 +65,7 @@
#else
last_stripe_width, &
#endif
#if REALCASE == 1
THIS_REAL_ELPA_KERNEL)
#endif
#if COMPLEXCASE == 1
THIS_COMPLEX_ELPA_KERNEL)
#endif
kernel)
use precision
use elpa_api
......@@ -154,12 +149,7 @@
#endif /* WITH_OPENMP */
#if REALCASE == 1
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
#endif
#if COMPLEXCASE ==1
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
#endif
integer(kind=ik), intent(in) :: kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
......@@ -184,13 +174,13 @@
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
#if REALCASE == 1
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GPU) then
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
#endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
......@@ -221,7 +211,7 @@
#else /* WITH_OPENMP */
#if REALCASE == 1
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GPU) then
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
......@@ -229,7 +219,7 @@
endif
#endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
......@@ -259,7 +249,7 @@
#if REALCASE == 1
! GPU kernel real
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GPU) then
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION&
&_&
......@@ -272,7 +262,7 @@
#endif /* REALCASE */
#if COMPLEXCASE == 1
! GPU kernel complex
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GPU) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION&
......@@ -303,15 +293,15 @@
#if REALCASE == 1
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GENERIC .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_BGP .or. &
THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_BGQ) then
if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC .or. &
kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
kernel .eq. ELPA_2STAGE_REAL_BGP .or. &
kernel .eq. ELPA_2STAGE_REAL_BGQ) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* REALCASE */
......@@ -323,7 +313,7 @@
! generic kernel real case
#if defined(WITH_REAL_GENERIC_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GENERIC) then
if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
#endif /* not WITH_FIXED_REAL_KERNEL */
do j = ncols, 2, -2
......@@ -379,9 +369,9 @@
! generic kernel complex case
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 1, -1
......@@ -420,7 +410,7 @@
enddo
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_BGP .or. THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_BGQ )
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */
......@@ -430,7 +420,7 @@
! generic simple real kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
#endif /* not WITH_FIXED_REAL_KERNEL */
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -484,7 +474,7 @@
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 1, -1
......@@ -522,7 +512,7 @@
#endif /* WITH_OPENMP */
enddo
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
#endif /* COMPLEXCASE */
......@@ -531,7 +521,7 @@
! sse assembly kernel real case
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
#endif /* not WITH_FIXED_REAL_KERNEL */
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -563,7 +553,7 @@
! sse assembly kernel complex case
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
do j = ncols, 1, -1
......@@ -584,7 +574,7 @@
#endif
enddo
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_SSE)
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */
......@@ -597,7 +587,7 @@
! sse block1 complex kernel
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
......@@ -620,7 +610,7 @@
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
......@@ -634,8 +624,8 @@
! avx block1 complex kernel
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if ((THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
(kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
......@@ -658,7 +648,7 @@
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! ((THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */
......@@ -672,7 +662,7 @@
! avx512 block1 complex kernel
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if ((THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
......@@ -695,7 +685,7 @@
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! ((THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL */
#endif /* COMPLEXCASE */
......@@ -705,7 +695,7 @@
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
......@@ -740,7 +730,7 @@
#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
......@@ -776,7 +766,7 @@
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */
......@@ -787,8 +777,8 @@
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if ((THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
(THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2)) then
if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
(kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2)) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
......@@ -823,8 +813,8 @@
! implementation of avx block 2 complex case
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. &
(THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) ) then
if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. &
(kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) ) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
......@@ -860,7 +850,7 @@
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) )
endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) )
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */
......@@ -872,7 +862,7 @@
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if ((THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2)) then
if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2)) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
......@@ -907,7 +897,7 @@
! implementation of avx512 block 2 complex case
#if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
if ( (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2)) then
if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
ttt = mpi_wtime()
......@@ -943,7 +933,7 @@
#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2))
endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
#endif /* COMPLEXCASE */
......@@ -953,7 +943,7 @@
#if defined(WITH_REAL_BGP_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_BGP) then
if (kernel .eq. ELPA_2STAGE_REAL_BGP) then
#endif /* not WITH_FIXED_REAL_KERNEL */
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -975,7 +965,7 @@
#if defined(WITH_REAL_BGQ_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_BGQ) then
if (kernel .eq. ELPA_2STAGE_REAL_BGQ) then
#endif /* not WITH_FIXED_REAL_KERNEL */
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
......@@ -1032,7 +1022,7 @@
#if defined(WITH_REAL_SSE_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_SSE_BLOCK4) then
if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
......@@ -1105,8 +1095,8 @@
! avx block4 real kernel
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if ((THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX_BLOCK4) .or. &
(THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX2_BLOCK4)) then
if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK4) .or. &
(kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK4)) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
......@@ -1180,7 +1170,7 @@
#if defined(WITH_REAL_AVX512_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4) then
if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL))
......@@ -1254,7 +1244,7 @@
!sse block6 real kernel
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_SSE_BLOCK6) then
if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
......@@ -1345,8 +1335,8 @@
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if ((THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX_BLOCK6) .or. &
(THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX2_BLOCK6)) then
if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK6) .or. &
(kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK6)) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
......@@ -1436,7 +1426,7 @@
! avx512 block6 kernel
#if defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if ((THIS_REAL_ELPA_KERNEL .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6)) then
if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6)) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
......
......@@ -109,7 +109,7 @@ module elpa2_impl
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!> \param kernel specify ELPA2 kernel to use
!>
!> \param useQR (optional) use QR decomposition
!> \param useGPU (optional) decide whether to use GPUs or not
......@@ -157,7 +157,7 @@ module elpa2_impl
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!> \param kernel specify ELPA2 kernel to use
!>
!> \param useQR (optional) use QR decomposition
!> \param useGPU (optional) decide whether GPUs should be used or not
......@@ -205,7 +205,7 @@ module elpa2_impl
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!> \param kernel specify ELPA2 kernel to use
!> \param useGPU (optional) decide whether GPUs should be used or not
!>
!> \result success logical, false if error occurred
......@@ -252,8 +252,8 @@ module elpa2_impl
!> \param mpi_comm_cols MPI communicator for columns
!> \param mpi_comm_all MPI communicator for the total processor set
!>
!> \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
!> \param useGPU (optional) decide whether GPUs should be used or not
!> \param kernel specify ELPA2 kernel to use
!> \param useGPU (optional) decide whether GPUs should be used or not
!>
!> \result success logical, false if error occurred
!-------------------------------------------------------------------------------
......
......@@ -54,8 +54,7 @@
&_&
&2stage_&
&PRECISION&
&_impl (obj, a, ev, q, &
time_evp_fwd, time_evp_solve, time_evp_back) result(success)
&_impl (obj, a, ev, q) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
......@@ -99,10 +98,7 @@
#if COMPLEXCASE == 1
real(kind=C_DATATYPE_KIND), allocatable :: q_real(:,:)
#endif
real(kind=c_double) :: time_evp_fwd, time_evp_solve, time_evp_back
logical :: summary_timings
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=c_int) :: i
logical :: success, successCUDA
......@@ -158,11 +154,11 @@
mpi_comm_cols = obj%get("mpi_comm_cols")
mpi_comm_all = obj%get("mpi_comm_parent")
if (obj%get("summary_timings") .eq. 1) then
summary_timings = .true.
else
summary_timings = .false.
endif
! if (obj%get("summary_timings") .eq. 1) then
! summary_timings = .true.
! else
! summary_timings = .false.
! endif
if (obj%get("gpu") .eq. 1) then
useGPU = .true.
else
......@@ -271,8 +267,6 @@
success = .false.
return
endif
ttts = MPI_Wtime()
else
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
......@@ -303,9 +297,6 @@
endif
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
call bandred_&
&MATH_DATATYPE&
&_&
......@@ -318,13 +309,6 @@
#endif
)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. summary_timings) &
write(error_unit,*) "Time " // "bandred_&
&MATH_DATATYPE&
&_&
&PRECISION " // " :",ttt1-ttt0
end if ! matrix not already banded on input
! Reduction band -> tridiagonal
......@@ -338,29 +322,18 @@
stop 1
endif
ttt0 = MPI_Wtime()
call tridiag_band_&
&MATH_DATATYPE&
&_&
&PRECISION&
(na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, do_useGPU)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. summary_timings) &
write(error_unit,*) "Time " // "tridiag_band_&
&MATH_DATATYPE&
&_&
&PRECISION " // " :",ttt1-ttt0
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
#if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
......@@ -377,8 +350,6 @@
#endif
! Solve tridiagonal system
ttt0 = MPI_Wtime()
call solve_tridi_&
&PRECISION &
(na, nev, ev, e, &
......@@ -392,12 +363,6 @@
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. summary_timings) &
write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
deallocate(e, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
......@@ -418,9 +383,6 @@
endif
#endif
! Backtransform stage 1
ttt0 = MPI_Wtime()
call trans_ev_tridi_to_band_&